summary refs log tree commit diff stats
path: root/src/core
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-12 03:50:40 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-12 03:50:40 +0000
commit fc53809803cd2bc2434e312b19a18fa36776da12 (patch)
tree b4b43bd6538f51965ce32856e9c053d0f90919c8 /src/core
parent Adding upstream version 255.5. (diff)
download systemd-fc53809803cd2bc2434e312b19a18fa36776da12.tar.xz
systemd-fc53809803cd2bc2434e312b19a18fa36776da12.zip
Adding upstream version 256. [upstream/256]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/automount.c75
-rw-r--r--src/core/bpf-devices.c82
-rw-r--r--src/core/bpf-firewall.c160
-rw-r--r--src/core/bpf-foreign.c21
-rw-r--r--src/core/bpf-lsm.h28
-rw-r--r--src/core/bpf-restrict-fs.c (renamed from src/core/bpf-lsm.c)92
-rw-r--r--src/core/bpf-restrict-fs.h23
-rw-r--r--src/core/bpf-restrict-ifaces.c (renamed from src/core/restrict-ifaces.c)69
-rw-r--r--src/core/bpf-restrict-ifaces.h (renamed from src/core/restrict-ifaces.h)8
-rw-r--r--src/core/bpf-socket-bind.c51
-rw-r--r--src/core/bpf-socket-bind.h2
-rw-r--r--src/core/bpf-util.c3
-rw-r--r--src/core/cgroup.c1646
-rw-r--r--src/core/cgroup.h132
-rw-r--r--src/core/core-varlink.c105
-rw-r--r--src/core/core-varlink.h4
-rw-r--r--src/core/crash-handler.c8
-rw-r--r--src/core/dbus-cgroup.c21
-rw-r--r--src/core/dbus-execute.c117
-rw-r--r--src/core/dbus-execute.h1
-rw-r--r--src/core/dbus-job.c25
-rw-r--r--src/core/dbus-manager.c407
-rw-r--r--src/core/dbus-mount.c31
-rw-r--r--src/core/dbus-scope.c24
-rw-r--r--src/core/dbus-service.c2
-rw-r--r--src/core/dbus-socket.c4
-rw-r--r--src/core/dbus-unit.c158
-rw-r--r--src/core/dbus-util.c7
-rw-r--r--src/core/dbus-util.h3
-rw-r--r--src/core/dbus.c92
-rw-r--r--src/core/device.c75
-rw-r--r--src/core/dynamic-user.c49
-rw-r--r--src/core/emergency-action.c32
-rw-r--r--src/core/emergency-action.h6
-rw-r--r--src/core/exec-credential.c256
-rw-r--r--src/core/exec-credential.h4
-rw-r--r--src/core/exec-invoke.c649
-rw-r--r--src/core/execute-serialize.c131
-rw-r--r--src/core/execute.c239
-rw-r--r--src/core/execute.h175
-rw-r--r--src/core/executor.c5
-rw-r--r--src/core/fuzz-execute-serialize.c2
-rw-r--r--src/core/generator-setup.c12
-rw-r--r--src/core/import-creds.c17
-rw-r--r--src/core/job.c42
-rw-r--r--src/core/job.h1
-rw-r--r--src/core/kmod-setup.c48
-rw-r--r--src/core/load-fragment-gperf.gperf.in15
-rw-r--r--src/core/load-fragment.c353
-rw-r--r--src/core/load-fragment.h4
-rw-r--r--src/core/main.c246
-rw-r--r--src/core/main.h14
-rw-r--r--src/core/manager-dump.c2
-rw-r--r--src/core/manager-serialize.c97
-rw-r--r--src/core/manager.c671
-rw-r--r--src/core/manager.h63
-rw-r--r--src/core/meson.build7
-rw-r--r--src/core/mount.c353
-rw-r--r--src/core/mount.h1
-rw-r--r--src/core/namespace.c333
-rw-r--r--src/core/path.c81
-rw-r--r--src/core/scope.c95
-rw-r--r--src/core/scope.h1
-rw-r--r--src/core/selinux-access.c5
-rw-r--r--src/core/service.c787
-rw-r--r--src/core/service.h4
-rw-r--r--src/core/show-status.c4
-rw-r--r--src/core/slice.c147
-rw-r--r--src/core/slice.h2
-rw-r--r--src/core/socket.c382
-rw-r--r--src/core/socket.h4
-rw-r--r--src/core/swap.c257
-rw-r--r--src/core/swap.h1
-rw-r--r--src/core/system.conf.in3
-rw-r--r--src/core/taint.c85
-rw-r--r--src/core/taint.h4
-rw-r--r--src/core/target.c57
-rw-r--r--src/core/timer.c89
-rw-r--r--src/core/transaction.c8
-rw-r--r--src/core/unit-printf.c59
-rw-r--r--src/core/unit-serialize.c279
-rw-r--r--src/core/unit.c1264
-rw-r--r--src/core/unit.h166
83 files changed, 6467 insertions, 4620 deletions
diff --git a/src/core/automount.c b/src/core/automount.c
index 14bf7e6..6cb9d52 100644
--- a/src/core/automount.c
+++ b/src/core/automount.c
@@ -38,10 +38,10 @@
#include "unit.h"
static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
- [AUTOMOUNT_DEAD] = UNIT_INACTIVE,
+ [AUTOMOUNT_DEAD] = UNIT_INACTIVE,
[AUTOMOUNT_WAITING] = UNIT_ACTIVE,
[AUTOMOUNT_RUNNING] = UNIT_ACTIVE,
- [AUTOMOUNT_FAILED] = UNIT_FAILED
+ [AUTOMOUNT_FAILED] = UNIT_FAILED,
};
static int open_dev_autofs(Manager *m);
@@ -51,10 +51,8 @@ static void automount_stop_expire(Automount *a);
static int automount_send_ready(Automount *a, Set *tokens, int status);
static void automount_init(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
- assert(a);
- assert(u);
assert(u->load_state == UNIT_STUB);
a->pipe_fd = -EBADF;
@@ -88,9 +86,7 @@ static void unmount_autofs(Automount *a) {
}
static void automount_done(Unit *u) {
- Automount *a = AUTOMOUNT(u);
-
- assert(a);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
unmount_autofs(a);
@@ -126,7 +122,7 @@ static int automount_add_mount_dependencies(Automount *a) {
if (r < 0)
return r;
- return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT);
+ return unit_add_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES);
}
static int automount_add_default_dependencies(Automount *a) {
@@ -227,10 +223,9 @@ static int automount_add_extras(Automount *a) {
}
static int automount_load(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(u);
assert(u->load_state == UNIT_STUB);
/* Load a .automount file */
@@ -250,6 +245,7 @@ static int automount_load(Unit *u) {
static void automount_set_state(Automount *a, AutomountState state) {
AutomountState old_state;
+
assert(a);
if (a->state != state)
@@ -271,10 +267,9 @@ static void automount_set_state(Automount *a, AutomountState state) {
}
static int automount_coldplug(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(a);
assert(a->state == AUTOMOUNT_DEAD);
if (a->deserialized_state == a->state)
@@ -310,9 +305,7 @@ static int automount_coldplug(Unit *u) {
}
static void automount_dump(Unit *u, FILE *f, const char *prefix) {
- Automount *a = AUTOMOUNT(u);
-
- assert(a);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
fprintf(f,
"%sAutomount State: %s\n"
@@ -478,30 +471,22 @@ static int automount_send_ready(Automount *a, Set *tokens, int status) {
r = 0;
/* Autofs thankfully does not hand out 0 as a token */
- while ((token = PTR_TO_UINT(set_steal_first(tokens)))) {
- int k;
-
+ while ((token = PTR_TO_UINT(set_steal_first(tokens))))
/* Autofs fun fact:
*
- * if you pass a positive status code here, kernels
- * prior to 4.12 will freeze! Yay! */
-
- k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd,
- ioctl_fd,
- token,
- status);
- if (k < 0)
- r = k;
- }
+ * if you pass a positive status code here, kernels prior to 4.12 will freeze! Yay! */
+ RET_GATHER(r, autofs_send_ready(UNIT(a)->manager->dev_autofs_fd,
+ ioctl_fd,
+ token,
+ status));
return r;
}
static void automount_trigger_notify(Unit *u, Unit *other) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(a);
assert(other);
/* Filter out invocations with bogus state */
@@ -697,11 +682,10 @@ static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
}
static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
+ Automount *a = ASSERT_PTR(AUTOMOUNT(userdata));
_cleanup_close_ int ioctl_fd = -EBADF;
- Automount *a = AUTOMOUNT(userdata);
int r;
- assert(a);
assert(source == a->expire_event_source);
ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
@@ -815,13 +799,12 @@ fail:
}
static int automount_start(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(a);
assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED));
- if (path_is_mount_point(a->where, NULL, 0) > 0)
+ if (path_is_mount_point(a->where) > 0)
return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where);
r = unit_test_trigger_loaded(u);
@@ -838,9 +821,8 @@ static int automount_start(Unit *u) {
}
static int automount_stop(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
- assert(a);
assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING));
automount_enter_dead(a, AUTOMOUNT_SUCCESS);
@@ -848,11 +830,10 @@ static int automount_stop(Unit *u) {
}
static int automount_serialize(Unit *u, FILE *f, FDSet *fds) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
void *p;
int r;
- assert(a);
assert(f);
assert(fds);
@@ -873,10 +854,9 @@ static int automount_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(a);
assert(fds);
if (streq(key, "state")) {
@@ -958,13 +938,12 @@ static bool automount_may_gc(Unit *u) {
}
static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) {
+ Automount *a = ASSERT_PTR(AUTOMOUNT(userdata));
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
union autofs_v5_packet_union packet;
- Automount *a = AUTOMOUNT(userdata);
Unit *trigger;
int r;
- assert(a);
assert(fd == a->pipe_fd);
if (events & (EPOLLHUP|EPOLLERR)) {
@@ -1048,9 +1027,7 @@ static void automount_shutdown(Manager *m) {
}
static void automount_reset_failed(Unit *u) {
- Automount *a = AUTOMOUNT(u);
-
- assert(a);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
if (a->state == AUTOMOUNT_FAILED)
automount_set_state(a, AUTOMOUNT_DEAD);
@@ -1068,11 +1045,9 @@ static bool automount_supported(void) {
}
static int automount_can_start(Unit *u) {
- Automount *a = AUTOMOUNT(u);
+ Automount *a = ASSERT_PTR(AUTOMOUNT(u));
int r;
- assert(a);
-
r = unit_test_start_limit(u);
if (r < 0) {
automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT);
diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c
index 06d2146..8484dbc 100644
--- a/src/core/bpf-devices.c
+++ b/src/core/bpf-devices.c
@@ -24,15 +24,15 @@ assert_cc((unsigned) BPF_DEVCG_ACC_WRITE == (unsigned) CGROUP_DEVICE_WRITE);
static int bpf_prog_allow_list_device(
BPFProgram *prog,
char type,
- int major,
- int minor,
+ unsigned major,
+ unsigned minor,
CGroupDevicePermissions p) {
int r;
assert(prog);
- log_trace("%s: %c %d:%d %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p));
+ log_trace("%s: %c %u:%u %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p));
if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
return -EINVAL;
@@ -56,22 +56,22 @@ static int bpf_prog_allow_list_device(
else
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
if (r < 0)
- log_error_errno(r, "Extending device control BPF program failed: %m");
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
- return r;
+ return 1; /* return 1 → we did something */
}
static int bpf_prog_allow_list_major(
BPFProgram *prog,
char type,
- int major,
+ unsigned major,
CGroupDevicePermissions p) {
int r;
assert(prog);
- log_trace("%s: %c %d:* %s", __func__, type, major, cgroup_device_permissions_to_string(p));
+ log_trace("%s: %c %u:* %s", __func__, type, major, cgroup_device_permissions_to_string(p));
if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
return -EINVAL;
@@ -94,9 +94,9 @@ static int bpf_prog_allow_list_major(
else
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
if (r < 0)
- log_error_errno(r, "Extending device control BPF program failed: %m");
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
- return r;
+ return 1; /* return 1 → we did something */
}
static int bpf_prog_allow_list_class(
@@ -130,9 +130,9 @@ static int bpf_prog_allow_list_class(
else
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
if (r < 0)
- log_error_errno(r, "Extending device control BPF program failed: %m");
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
- return r;
+ return 1; /* return 1 → we did something */
}
int bpf_devices_cgroup_init(
@@ -165,8 +165,10 @@ int bpf_devices_cgroup_init(
assert(ret);
- if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list)
+ if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list) {
+ *ret = NULL;
return 0;
+ }
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog);
if (r < 0)
@@ -179,8 +181,7 @@ int bpf_devices_cgroup_init(
}
*ret = TAKE_PTR(prog);
-
- return 0;
+ return 1;
}
int bpf_devices_apply_policy(
@@ -307,8 +308,8 @@ static int allow_list_device_pattern(
BPFProgram *prog,
const char *path,
char type,
- const unsigned *maj,
- const unsigned *min,
+ unsigned major,
+ unsigned minor,
CGroupDevicePermissions p) {
assert(IN_SET(type, 'b', 'c'));
@@ -317,10 +318,10 @@ static int allow_list_device_pattern(
if (!prog)
return 0;
- if (maj && min)
- return bpf_prog_allow_list_device(prog, type, *maj, *min, p);
- else if (maj)
- return bpf_prog_allow_list_major(prog, type, *maj, p);
+ if (major != UINT_MAX && minor != UINT_MAX)
+ return bpf_prog_allow_list_device(prog, type, major, minor, p);
+ else if (major != UINT_MAX)
+ return bpf_prog_allow_list_major(prog, type, major, p);
else
return bpf_prog_allow_list_class(prog, type, p);
@@ -328,10 +329,10 @@ static int allow_list_device_pattern(
char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4];
int r;
- if (maj && min)
- xsprintf(buf, "%c %u:%u %s", type, *maj, *min, cgroup_device_permissions_to_string(p));
- else if (maj)
- xsprintf(buf, "%c %u:* %s", type, *maj, cgroup_device_permissions_to_string(p));
+ if (major != UINT_MAX && minor != UINT_MAX)
+ xsprintf(buf, "%c %u:%u %s", type, major, minor, cgroup_device_permissions_to_string(p));
+ else if (major != UINT_MAX)
+ xsprintf(buf, "%c %u:* %s", type, major, cgroup_device_permissions_to_string(p));
else
xsprintf(buf, "%c *:* %s", type, cgroup_device_permissions_to_string(p));
@@ -371,8 +372,14 @@ int bpf_devices_allow_list_device(
return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
struct stat st;
- if (stat(node, &st) < 0)
+ if (stat(node, &st) < 0) {
+ if (errno == ENOENT) {
+ log_debug_errno(errno, "Device '%s' does not exist, skipping.", node);
+ return 0; /* returning 0 means → skipped */
+ }
+
return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
+ }
if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node);
@@ -381,8 +388,7 @@ int bpf_devices_allow_list_device(
rdev = (dev_t) st.st_rdev;
}
- unsigned maj = major(rdev), min = minor(rdev);
- return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, p);
+ return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', major(rdev), minor(rdev), p);
}
int bpf_devices_allow_list_major(
@@ -392,7 +398,7 @@ int bpf_devices_allow_list_major(
char type,
CGroupDevicePermissions permissions) {
- unsigned maj;
+ unsigned major;
int r;
assert(path);
@@ -401,12 +407,12 @@ int bpf_devices_allow_list_major(
if (streq(name, "*"))
/* If the name is a wildcard, then apply this list to all devices of this type */
- return allow_list_device_pattern(prog, path, type, NULL, NULL, permissions);
+ return allow_list_device_pattern(prog, path, type, /* major= */ UINT_MAX, /* minor= */ UINT_MAX, permissions);
- if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj))
+ if (safe_atou(name, &major) >= 0 && DEVICE_MAJOR_VALID(major))
/* The name is numeric and suitable as major. In that case, let's take its major, and create
* the entry directly. */
- return allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+ return allow_list_device_pattern(prog, path, type, major, /* minor= */ UINT_MAX, permissions);
_cleanup_fclose_ FILE *f = NULL;
bool good = false, any = false;
@@ -450,10 +456,10 @@ int bpf_devices_allow_list_major(
continue;
*w = 0;
- r = safe_atou(p, &maj);
+ r = safe_atou(p, &major);
if (r < 0)
continue;
- if (maj <= 0)
+ if (major <= 0)
continue;
w++;
@@ -462,15 +468,15 @@ int bpf_devices_allow_list_major(
if (fnmatch(name, w, 0) != 0)
continue;
- any = true;
- (void) allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+ if (allow_list_device_pattern(prog, path, type, major, /* minor= */ UINT_MAX, permissions) > 0)
+ any = true;
}
if (!any)
return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
"Device allow list pattern \"%s\" did not match anything.", name);
- return 0;
+ return any;
}
int bpf_devices_allow_list_static(
@@ -492,13 +498,13 @@ int bpf_devices_allow_list_static(
NULSTR_FOREACH_PAIR(node, acc, auto_devices) {
k = bpf_devices_allow_list_device(prog, path, node, cgroup_device_permissions_from_string(acc));
- if (r >= 0 && k < 0)
+ if ((r >= 0 && k < 0) || (r >= 0 && k > 0))
r = k;
}
/* PTS (/dev/pts) devices may not be duplicated, but accessed */
k = bpf_devices_allow_list_major(prog, path, "pts", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
- if (r >= 0 && k < 0)
+ if ((r >= 0 && k < 0) || (r >= 0 && k > 0))
r = k;
return r;
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
index 66773e1..185ed7d 100644
--- a/src/core/bpf-firewall.c
+++ b/src/core/bpf-firewall.c
@@ -1,12 +1,13 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/* Make sure the net/if.h header is included before any linux/ one */
+#include <net/if.h>
#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
-#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
@@ -196,19 +197,26 @@ static int bpf_firewall_compile_bpf(
_cleanup_(bpf_program_freep) BPFProgram *p = NULL;
int accounting_map_fd, r;
bool access_enabled;
+ CGroupRuntime *crt;
assert(u);
assert(ret);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt) {
+ *ret = NULL;
+ return 0;
+ }
+
accounting_map_fd = is_ingress ?
- u->ip_accounting_ingress_map_fd :
- u->ip_accounting_egress_map_fd;
+ crt->ip_accounting_ingress_map_fd :
+ crt->ip_accounting_egress_map_fd;
access_enabled =
- u->ipv4_allow_map_fd >= 0 ||
- u->ipv6_allow_map_fd >= 0 ||
- u->ipv4_deny_map_fd >= 0 ||
- u->ipv6_deny_map_fd >= 0 ||
+ crt->ipv4_allow_map_fd >= 0 ||
+ crt->ipv6_allow_map_fd >= 0 ||
+ crt->ipv4_deny_map_fd >= 0 ||
+ crt->ipv6_deny_map_fd >= 0 ||
ip_allow_any ||
ip_deny_any;
@@ -234,26 +242,26 @@ static int bpf_firewall_compile_bpf(
* - Otherwise, access will be granted
*/
- if (u->ipv4_deny_map_fd >= 0) {
- r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
+ if (crt->ipv4_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, crt->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
if (r < 0)
return r;
}
- if (u->ipv6_deny_map_fd >= 0) {
- r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
+ if (crt->ipv6_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, crt->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
if (r < 0)
return r;
}
- if (u->ipv4_allow_map_fd >= 0) {
- r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
+ if (crt->ipv4_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, crt->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
if (r < 0)
return r;
}
- if (u->ipv6_allow_map_fd >= 0) {
- r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
+ if (crt->ipv6_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, crt->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
if (r < 0)
return r;
}
@@ -495,37 +503,36 @@ static int bpf_firewall_prepare_access_maps(
return 0;
}
-static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
+static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, CGroupRuntime *crt) {
int r;
assert(u);
- assert(fd_ingress);
- assert(fd_egress);
+ assert(crt);
if (enabled) {
- if (*fd_ingress < 0) {
+ if (crt->ip_accounting_ingress_map_fd < 0) {
char *name = strjoina("I_", u->id);
r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
if (r < 0)
return r;
- *fd_ingress = r;
+ crt->ip_accounting_ingress_map_fd = r;
}
- if (*fd_egress < 0) {
+ if (crt->ip_accounting_egress_map_fd < 0) {
char *name = strjoina("E_", u->id);
r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
if (r < 0)
return r;
- *fd_egress = r;
+ crt->ip_accounting_egress_map_fd = r;
}
} else {
- *fd_ingress = safe_close(*fd_ingress);
- *fd_egress = safe_close(*fd_egress);
+ crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
+ crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);
- zero(u->ip_accounting_extra);
+ zero(crt->ip_accounting_extra);
}
return 0;
@@ -535,6 +542,7 @@ int bpf_firewall_compile(Unit *u) {
const char *ingress_name = NULL, *egress_name = NULL;
bool ip_allow_any = false, ip_deny_any = false;
CGroupContext *cc;
+ CGroupRuntime *crt;
int r, supported;
assert(u);
@@ -543,6 +551,10 @@ int bpf_firewall_compile(Unit *u) {
if (!cc)
return -EINVAL;
+ crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ return -ENOMEM;
+
supported = bpf_firewall_supported();
if (supported < 0)
return supported;
@@ -569,14 +581,14 @@ int bpf_firewall_compile(Unit *u) {
* but we reuse the accounting maps. That way the firewall in effect always maps to the actual
* configuration, but we don't flush out the accounting unnecessarily */
- u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
- u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+ crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
+ crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);
- u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
- u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+ crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
+ crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);
- u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
- u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+ crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
+ crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);
if (u->type != UNIT_SLICE) {
/* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
@@ -585,24 +597,24 @@ int bpf_firewall_compile(Unit *u) {
* means that all configured IP access rules *will* take effect on processes, even though we never
* compile them for inner nodes. */
- r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &crt->ipv4_allow_map_fd, &crt->ipv6_allow_map_fd, &ip_allow_any);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
- r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &crt->ipv4_deny_map_fd, &crt->ipv6_deny_map_fd, &ip_deny_any);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
}
- r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
+ r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, crt);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
- r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
+ r = bpf_firewall_compile_bpf(u, ingress_name, true, &crt->ip_bpf_ingress, ip_allow_any, ip_deny_any);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
- r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
+ r = bpf_firewall_compile_bpf(u, egress_name, false, &crt->ip_bpf_egress, ip_allow_any, ip_deny_any);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
@@ -634,6 +646,7 @@ static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set
int bpf_firewall_load_custom(Unit *u) {
CGroupContext *cc;
+ CGroupRuntime *crt;
int r, supported;
assert(u);
@@ -641,6 +654,9 @@ int bpf_firewall_load_custom(Unit *u) {
cc = unit_get_cgroup_context(u);
if (!cc)
return 0;
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
return 0;
@@ -653,10 +669,10 @@ int bpf_firewall_load_custom(Unit *u) {
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
- r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &crt->ip_bpf_custom_ingress);
if (r < 0)
return r;
- r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &crt->ip_bpf_custom_egress);
if (r < 0)
return r;
@@ -686,6 +702,7 @@ int bpf_firewall_install(Unit *u) {
_cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
_cleanup_free_ char *path = NULL;
CGroupContext *cc;
+ CGroupRuntime *crt;
int r, supported;
uint32_t flags;
@@ -694,9 +711,12 @@ int bpf_firewall_install(Unit *u) {
cc = unit_get_cgroup_context(u);
if (!cc)
return -EINVAL;
- if (!u->cgroup_path)
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return -EINVAL;
+ if (!crt->cgroup_path)
return -EINVAL;
- if (!u->cgroup_realized)
+ if (!crt->cgroup_realized)
return -EINVAL;
supported = bpf_firewall_supported();
@@ -709,11 +729,11 @@ int bpf_firewall_install(Unit *u) {
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
- (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
+ (!set_isempty(crt->ip_bpf_custom_ingress) || !set_isempty(crt->ip_bpf_custom_egress)))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &path);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
@@ -724,44 +744,44 @@ int bpf_firewall_install(Unit *u) {
* after attaching the new programs, so that there's no time window where neither program is
* attached. (There will be a time window where both are attached, but that's OK, since this is a
* security feature where we rather want to lock down too much than too little.) */
- ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
- ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
+ ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed);
+ ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed);
} else {
/* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
* detach them) right before attaching the new program, to minimize the time window when we
* don't account for IP traffic. */
- u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
- u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+ crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed);
+ crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed);
}
- if (u->ip_bpf_egress) {
- r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
+ if (crt->ip_bpf_egress) {
+ r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
if (r < 0)
return log_unit_error_errno(u, r,
"bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
/* Remember that this BPF program is installed now. */
- u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
+ crt->ip_bpf_egress_installed = TAKE_PTR(crt->ip_bpf_egress);
}
- if (u->ip_bpf_ingress) {
- r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
+ if (crt->ip_bpf_ingress) {
+ r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
if (r < 0)
return log_unit_error_errno(u, r,
"bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
- u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
+ crt->ip_bpf_ingress_installed = TAKE_PTR(crt->ip_bpf_ingress);
}
/* And now, definitely get rid of the old programs, and detach them */
ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
- r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &crt->ip_bpf_custom_egress, &crt->ip_bpf_custom_egress_installed);
if (r < 0)
return r;
- r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &crt->ip_bpf_custom_ingress, &crt->ip_bpf_custom_ingress_installed);
if (r < 0)
return r;
@@ -954,21 +974,25 @@ void emit_bpf_firewall_warning(Unit *u) {
void bpf_firewall_close(Unit *u) {
assert(u);
- u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
- u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
+ crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
+ crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);
- u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
- u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
- u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
- u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+ crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
+ crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
+ crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);
+ crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);
- u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
- u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
- u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
- u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+ crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
+ crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed);
+ crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);
+ crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed);
- u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
- u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
- u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
- u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
+ crt->ip_bpf_custom_ingress = set_free(crt->ip_bpf_custom_ingress);
+ crt->ip_bpf_custom_egress = set_free(crt->ip_bpf_custom_egress);
+ crt->ip_bpf_custom_ingress_installed = set_free(crt->ip_bpf_custom_ingress_installed);
+ crt->ip_bpf_custom_egress_installed = set_free(crt->ip_bpf_custom_egress_installed);
}
diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c
index cff2f61..851cc42 100644
--- a/src/core/bpf-foreign.c
+++ b/src/core/bpf-foreign.c
@@ -45,8 +45,8 @@ static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeign
}
static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) {
- siphash24_compress(&p->prog_id, sizeof(p->prog_id), h);
- siphash24_compress(&p->attach_type, sizeof(p->attach_type), h);
+ siphash24_compress_typesafe(p->prog_id, h);
+ siphash24_compress_typesafe(p->attach_type, h);
}
DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops,
@@ -81,6 +81,7 @@ static int bpf_foreign_prepare(
Unit *u,
enum bpf_attach_type attach_type,
const char *bpffs_path) {
+
_cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
_cleanup_free_ BPFForeignKey *key = NULL;
uint32_t prog_id;
@@ -101,6 +102,11 @@ static int bpf_foreign_prepare(
return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
"bpf-foreign: Path in BPF filesystem is expected.");
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Failed to get control group runtime object.");
+
r = bpf_program_new_from_bpffs_path(bpffs_path, &prog);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m");
@@ -114,7 +120,7 @@ static int bpf_foreign_prepare(
return log_unit_error_errno(u, r,
"bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path);
- r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog);
+ r = hashmap_ensure_put(&crt->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog);
if (r == -EEXIST) {
log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m");
return 0;
@@ -131,6 +137,7 @@ static int bpf_foreign_prepare(
int bpf_foreign_install(Unit *u) {
_cleanup_free_ char *cgroup_path = NULL;
CGroupContext *cc;
+ CGroupRuntime *crt;
int r, ret = 0;
assert(u);
@@ -139,7 +146,11 @@ int bpf_foreign_install(Unit *u) {
if (!cc)
return 0;
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m");
@@ -149,6 +160,6 @@ int bpf_foreign_install(Unit *u) {
ret = r;
}
- r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI);
+ r = attach_programs(u, cgroup_path, crt->bpf_foreign_by_key, BPF_F_ALLOW_MULTI);
return ret < 0 ? ret : r;
}
diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h
deleted file mode 100644
index a6eda19..0000000
--- a/src/core/bpf-lsm.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1-or-later */
-#pragma once
-
-#include "hashmap.h"
-
-typedef enum FilesystemParseFlags {
- FILESYSTEM_PARSE_INVERT = 1 << 0,
- FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
- FILESYSTEM_PARSE_LOG = 1 << 2,
-} FilesystemParseFlags;
-
-typedef struct Unit Unit;
-typedef struct Manager Manager;
-
-typedef struct restrict_fs_bpf restrict_fs_bpf;
-
-bool lsm_bpf_supported(bool initialize);
-int lsm_bpf_setup(Manager *m);
-int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list);
-int lsm_bpf_cleanup(const Unit *u);
-int lsm_bpf_map_restrict_fs_fd(Unit *u);
-void lsm_bpf_destroy(struct restrict_fs_bpf *prog);
-int lsm_bpf_parse_filesystem(const char *name,
- Set **filesystems,
- FilesystemParseFlags flags,
- const char *unit,
- const char *filename,
- unsigned line);
diff --git a/src/core/bpf-lsm.c b/src/core/bpf-restrict-fs.c
index 216fc34..d36bfb5 100644
--- a/src/core/bpf-lsm.c
+++ b/src/core/bpf-restrict-fs.c
@@ -10,7 +10,7 @@
#include <unistd.h>
#include "alloc-util.h"
-#include "bpf-lsm.h"
+#include "bpf-restrict-fs.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
@@ -51,7 +51,7 @@ static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
/* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory
* pointer. That is the case when the helper is called on architectures where BPF trampoline (hence
* BPF_LSM_MAC attach type) is not supported. */
- return sym_libbpf_get_error(link) == 0;
+ return bpf_get_error_translated(link) == 0;
}
static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
@@ -63,36 +63,36 @@ static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
obj = restrict_fs_bpf__open();
if (!obj)
- return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m");
+ return log_error_errno(errno, "bpf-restrict-fs: Failed to open BPF object: %m");
/* TODO Maybe choose a number based on runtime information? */
r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX);
assert(r <= 0);
if (r < 0)
- return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m",
+ return log_error_errno(r, "bpf-restrict-fs: Failed to resize BPF map '%s': %m",
sym_bpf_map__name(obj->maps.cgroup_hash));
/* Dummy map to satisfy the verifier */
inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL);
if (inner_map_fd < 0)
- return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m");
+ return log_error_errno(errno, "bpf-restrict-fs: Failed to create BPF map: %m");
r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd);
assert(r <= 0);
if (r < 0)
- return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m");
+ return log_error_errno(r, "bpf-restrict-fs: Failed to set inner map fd: %m");
r = restrict_fs_bpf__load(obj);
assert(r <= 0);
if (r < 0)
- return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m");
+ return log_error_errno(r, "bpf-restrict-fs: Failed to load BPF object: %m");
*ret_obj = TAKE_PTR(obj);
return 0;
}
-bool lsm_bpf_supported(bool initialize) {
+bool bpf_restrict_fs_supported(bool initialize) {
_cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
static int supported = -1;
int r;
@@ -107,12 +107,11 @@ bool lsm_bpf_supported(bool initialize) {
r = lsm_supported("bpf");
if (r < 0) {
- log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m");
+ log_warning_errno(r, "bpf-restrict-fs: Can't determine whether the BPF LSM module is used: %m");
return (supported = false);
}
if (r == 0) {
- log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
- "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported");
+ log_info("bpf-restrict-fs: BPF LSM hook not enabled in the kernel, BPF LSM not supported.");
return (supported = false);
}
@@ -121,15 +120,14 @@ bool lsm_bpf_supported(bool initialize) {
return (supported = false);
if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) {
- log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
- "bpf-lsm: Failed to link program; assuming BPF LSM is not available");
+ log_warning("bpf-restrict-fs: Failed to link program; assuming BPF LSM is not available.");
return (supported = false);
}
return (supported = true);
}
-int lsm_bpf_setup(Manager *m) {
+int bpf_restrict_fs_setup(Manager *m) {
_cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
_cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
int r;
@@ -141,12 +139,12 @@ int lsm_bpf_setup(Manager *m) {
return r;
link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems);
- r = sym_libbpf_get_error(link);
+ r = bpf_get_error_translated(link);
if (r != 0)
- return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m",
+ return log_error_errno(r, "bpf-restrict-fs: Failed to link '%s' LSM BPF program: %m",
sym_bpf_program__name(obj->progs.restrict_filesystems));
- log_info("bpf-lsm: LSM BPF program attached");
+ log_info("bpf-restrict-fs: LSM BPF program attached");
obj->links.restrict_filesystems = TAKE_PTR(link);
m->restrict_fs = TAKE_PTR(obj);
@@ -154,7 +152,7 @@ int lsm_bpf_setup(Manager *m) {
return 0;
}
-int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) {
+int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) {
uint32_t dummy_value = 1, zero = 0;
const char *fs;
const statfs_f_type_t *magic;
@@ -171,35 +169,35 @@ int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int
128U, /* Should be enough for all filesystem types */
NULL);
if (inner_map_fd < 0)
- return log_error_errno(errno, "bpf-lsm: Failed to create inner BPF map: %m");
+ return log_error_errno(errno, "bpf-restrict-fs: Failed to create inner BPF map: %m");
if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0)
- return log_error_errno(errno, "bpf-lsm: Error populating BPF map: %m");
+ return log_error_errno(errno, "bpf-restrict-fs: Error populating BPF map: %m");
uint32_t allow = allow_list;
/* Use key 0 to store whether this is an allow list or a deny list */
if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0)
- return log_error_errno(errno, "bpf-lsm: Error initializing map: %m");
+ return log_error_errno(errno, "bpf-restrict-fs: Error initializing map: %m");
SET_FOREACH(fs, filesystems) {
r = fs_type_from_string(fs, &magic);
if (r < 0) {
- log_warning("bpf-lsm: Invalid filesystem name '%s', ignoring.", fs);
+ log_warning("bpf-restrict-fs: Invalid filesystem name '%s', ignoring.", fs);
continue;
}
- log_debug("bpf-lsm: Restricting filesystem access to '%s'", fs);
+ log_debug("bpf-restrict-fs: Restricting filesystem access to '%s'", fs);
for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) {
if (magic[i] == 0)
break;
if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) {
- r = log_error_errno(errno, "bpf-lsm: Failed to update BPF map: %m");
+ r = log_error_errno(errno, "bpf-restrict-fs: Failed to update BPF map: %m");
if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0)
- log_debug_errno(errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m");
+ log_debug_errno(errno, "bpf-restrict-fs: Failed to delete cgroup entry from BPF map: %m");
return r;
}
@@ -209,31 +207,37 @@ int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int
return 0;
}
-int lsm_bpf_cleanup(const Unit *u) {
+int bpf_restrict_fs_cleanup(Unit *u) {
+ CGroupRuntime *crt;
+
assert(u);
assert(u->manager);
/* If we never successfully detected support, there is nothing to clean up. */
- if (!lsm_bpf_supported(/* initialize = */ false))
+ if (!bpf_restrict_fs_supported(/* initialize = */ false))
return 0;
if (!u->manager->restrict_fs)
return 0;
- if (u->cgroup_id == 0)
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ if (crt->cgroup_id == 0)
return 0;
int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
if (fd < 0)
- return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m");
+ return log_unit_error_errno(u, errno, "bpf-restrict-fs: Failed to get BPF map fd: %m");
- if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT)
- return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m");
+ if (sym_bpf_map_delete_elem(fd, &crt->cgroup_id) != 0 && errno != ENOENT)
+ return log_unit_debug_errno(u, errno, "bpf-restrict-fs: Failed to delete cgroup entry from LSM BPF map: %m");
return 0;
}
-int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+int bpf_restrict_fs_map_fd(Unit *unit) {
assert(unit);
assert(unit->manager);
@@ -243,36 +247,36 @@ int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash);
}
-void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog) {
restrict_fs_bpf__destroy(prog);
}
#else /* ! BPF_FRAMEWORK */
-bool lsm_bpf_supported(bool initialize) {
+bool bpf_restrict_fs_supported(bool initialize) {
return false;
}
-int lsm_bpf_setup(Manager *m) {
- return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m");
+int bpf_restrict_fs_setup(Manager *m) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-restrict-fs: BPF framework is not supported.");
}
-int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) {
- return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m");
+int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-restrict-fs: BPF framework is not supported.");
}
-int lsm_bpf_cleanup(const Unit *u) {
+int bpf_restrict_fs_cleanup(Unit *u) {
return 0;
}
-int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+int bpf_restrict_fs_map_fd(Unit *unit) {
return -ENOMEDIUM;
}
-void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog) {
return;
}
#endif
-int lsm_bpf_parse_filesystem(
+int bpf_restrict_fs_parse_filesystem(
const char *name,
Set **filesystems,
FilesystemParseFlags flags,
@@ -290,7 +294,7 @@ int lsm_bpf_parse_filesystem(
set = filesystem_set_find(name);
if (!set) {
log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
- "bpf-lsm: Unknown filesystem group, ignoring: %s", name);
+ "bpf-restrict-fs: Unknown filesystem group, ignoring: %s", name);
return 0;
}
@@ -299,7 +303,7 @@ int lsm_bpf_parse_filesystem(
* (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table
* are our own problem, not a problem in user configuration data and we shouldn't
* pretend otherwise by complaining about them. */
- r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
+ r = bpf_restrict_fs_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
if (r < 0)
return r;
}
diff --git a/src/core/bpf-restrict-fs.h b/src/core/bpf-restrict-fs.h
new file mode 100644
index 0000000..8da12de
--- /dev/null
+++ b/src/core/bpf-restrict-fs.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+
+typedef enum FilesystemParseFlags {
+ FILESYSTEM_PARSE_INVERT = 1 << 0,
+ FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
+ FILESYSTEM_PARSE_LOG = 1 << 2,
+} FilesystemParseFlags;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+
+typedef struct restrict_fs_bpf restrict_fs_bpf;
+
+bool bpf_restrict_fs_supported(bool initialize); /* always false when built without BPF_FRAMEWORK */
+int bpf_restrict_fs_setup(Manager *m); /* on success the loaded BPF object is stored in m->restrict_fs */
+int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list); /* key 0 of the per-cgroup inner map records allow_list */
+int bpf_restrict_fs_cleanup(Unit *u); /* removes the unit's cgroup ID from the cgroup_hash map; a missing entry is not an error */
+int bpf_restrict_fs_map_fd(Unit *u); /* -ENOMEDIUM when built without BPF_FRAMEWORK */
+void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog);
+int bpf_restrict_fs_parse_filesystem(const char *name, Set **filesystems, FilesystemParseFlags flags, const char *unit, const char *filename, unsigned line);
diff --git a/src/core/restrict-ifaces.c b/src/core/bpf-restrict-ifaces.c
index 4dd8656..64d8d1a 100644
--- a/src/core/restrict-ifaces.c
+++ b/src/core/bpf-restrict-ifaces.c
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "fd-util.h"
-#include "restrict-ifaces.h"
+#include "bpf-restrict-ifaces.h"
#include "netlink-util.h"
#if BPF_FRAMEWORK
@@ -72,7 +72,7 @@ static int prepare_restrict_ifaces_bpf(
return 0;
}
-int restrict_network_interfaces_supported(void) {
+int bpf_restrict_ifaces_supported(void) {
_cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
static int supported = -1;
int r;
@@ -97,19 +97,24 @@ int restrict_network_interfaces_supported(void) {
return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i));
}
-static int restrict_network_interfaces_install_impl(Unit *u) {
+static int restrict_ifaces_install_impl(Unit *u) {
_cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL;
_cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
_cleanup_free_ char *cgroup_path = NULL;
_cleanup_close_ int cgroup_fd = -EBADF;
CGroupContext *cc;
+ CGroupRuntime *crt;
int r;
cc = unit_get_cgroup_context(u);
if (!cc)
return 0;
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path);
if (r < 0)
return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m");
@@ -128,51 +133,69 @@ static int restrict_network_interfaces_install_impl(Unit *u) {
return -errno;
ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd);
- r = sym_libbpf_get_error(ingress_link);
+ r = bpf_get_error_translated(ingress_link);
if (r != 0)
return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m");
egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd);
- r = sym_libbpf_get_error(egress_link);
+ r = bpf_get_error_translated(egress_link);
if (r != 0)
return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m");
- u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link);
- u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link);
+ crt->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link);
+ crt->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link);
return 0;
}
-int restrict_network_interfaces_install(Unit *u) {
- int r = restrict_network_interfaces_install_impl(u);
- fdset_close(u->initial_restric_ifaces_link_fds);
+int bpf_restrict_ifaces_install(Unit *u) {
+ CGroupRuntime *crt;
+ int r;
+
+ assert(u);
+
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = restrict_ifaces_install_impl(u);
+ fdset_close(crt->initial_restrict_ifaces_link_fds);
return r;
}
-int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds) {
+ CGroupRuntime *crt;
int r;
assert(u);
- r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", crt->restrict_ifaces_ingress_bpf_link);
if (r < 0)
return r;
- return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link);
+ return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", crt->restrict_ifaces_egress_bpf_link);
}
-int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd) {
int r;
assert(u);
- if (!u->initial_restric_ifaces_link_fds) {
- u->initial_restric_ifaces_link_fds = fdset_new();
- if (!u->initial_restric_ifaces_link_fds)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return -EINVAL;
+
+ if (!crt->initial_restrict_ifaces_link_fds) {
+ crt->initial_restrict_ifaces_link_fds = fdset_new();
+ if (!crt->initial_restrict_ifaces_link_fds)
return log_oom();
}
- r = fdset_put(u->initial_restric_ifaces_link_fds, fd);
+ r = fdset_put(crt->initial_restrict_ifaces_link_fds, fd);
if (r < 0)
return log_unit_error_errno(u, r,
"restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd);
@@ -181,20 +204,20 @@ int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
}
#else /* ! BPF_FRAMEWORK */
-int restrict_network_interfaces_supported(void) {
+int bpf_restrict_ifaces_supported(void) {
return 0;
}
-int restrict_network_interfaces_install(Unit *u) {
+int bpf_restrict_ifaces_install(Unit *u) {
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
"restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m");
}
-int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds) {
return 0;
}
-int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd) {
return 0;
}
#endif
diff --git a/src/core/restrict-ifaces.h b/src/core/bpf-restrict-ifaces.h
index 6e7a824..28f7427 100644
--- a/src/core/restrict-ifaces.h
+++ b/src/core/bpf-restrict-ifaces.h
@@ -6,11 +6,11 @@
typedef struct Unit Unit;
-int restrict_network_interfaces_supported(void);
-int restrict_network_interfaces_install(Unit *u);
+int bpf_restrict_ifaces_supported(void);
+int bpf_restrict_ifaces_install(Unit *u);
-int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds);
+int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds);
/* Add BPF link fd created before daemon-reload or daemon-reexec.
- * FDs will be closed at the end of restrict_network_interfaces_install. */
+ * FDs will be closed at the end of bpf_restrict_ifaces_install. */
-int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd);
+int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd);
diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c
index 88ab487..2a1a027 100644
--- a/src/core/bpf-socket-bind.c
+++ b/src/core/bpf-socket-bind.c
@@ -148,13 +148,18 @@ int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
assert(u);
- if (!u->initial_socket_bind_link_fds) {
- u->initial_socket_bind_link_fds = fdset_new();
- if (!u->initial_socket_bind_link_fds)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Failed to get control group runtime object.");
+
+ if (!crt->initial_socket_bind_link_fds) {
+ crt->initial_socket_bind_link_fds = fdset_new();
+ if (!crt->initial_socket_bind_link_fds)
return log_oom();
}
- r = fdset_put(u->initial_socket_bind_link_fds, fd);
+ r = fdset_put(crt->initial_socket_bind_link_fds, fd);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd);
@@ -167,6 +172,7 @@ static int socket_bind_install_impl(Unit *u) {
_cleanup_free_ char *cgroup_path = NULL;
_cleanup_close_ int cgroup_fd = -EBADF;
CGroupContext *cc;
+ CGroupRuntime *crt;
int r;
assert(u);
@@ -175,7 +181,11 @@ static int socket_bind_install_impl(Unit *u) {
if (!cc)
return 0;
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path);
if (r < 0)
return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m");
@@ -191,46 +201,53 @@ static int socket_bind_install_impl(Unit *u) {
return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path);
ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd);
- r = sym_libbpf_get_error(ipv4);
+ r = bpf_get_error_translated(ipv4);
if (r != 0)
return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
sym_bpf_program__name(obj->progs.sd_bind4));
ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd);
- r = sym_libbpf_get_error(ipv6);
+ r = bpf_get_error_translated(ipv6);
if (r != 0)
return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
sym_bpf_program__name(obj->progs.sd_bind6));
- u->ipv4_socket_bind_link = TAKE_PTR(ipv4);
- u->ipv6_socket_bind_link = TAKE_PTR(ipv6);
+ crt->ipv4_socket_bind_link = TAKE_PTR(ipv4);
+ crt->ipv6_socket_bind_link = TAKE_PTR(ipv6);
return 0;
}
int bpf_socket_bind_install(Unit *u) {
+ CGroupRuntime *crt;
int r;
assert(u);
- r = socket_bind_install_impl(u);
- if (r == -ENOMEM)
- return r;
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
- fdset_close(u->initial_socket_bind_link_fds);
+ r = socket_bind_install_impl(u);
+ fdset_close(crt->initial_socket_bind_link_fds);
return r;
}
-int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds) {
+ CGroupRuntime *crt;
int r;
assert(u);
- r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link);
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+
+ r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", crt->ipv4_socket_bind_link);
if (r < 0)
return r;
- return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link);
+ return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", crt->ipv6_socket_bind_link);
}
#else /* ! BPF_FRAMEWORK */
@@ -247,7 +264,7 @@ int bpf_socket_bind_install(Unit *u) {
"bpf-socket-bind: Failed to install; BPF framework is not supported");
}
-int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds) {
return 0;
}
#endif
diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h
index 7d426df..28b25f6 100644
--- a/src/core/bpf-socket-bind.h
+++ b/src/core/bpf-socket-bind.h
@@ -12,4 +12,4 @@ int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd);
int bpf_socket_bind_install(Unit *u);
-int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds);
+int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds);
diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c
index 6fe229e..b337ba9 100644
--- a/src/core/bpf-util.c
+++ b/src/core/bpf-util.c
@@ -20,8 +20,7 @@ bool cgroup_bpf_supported(void) {
}
if (r == 0) {
- log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
- "Not running with unified cgroup hierarchy, disabling cgroup BPF features.");
+ log_info("Not running with unified cgroup hierarchy, disabling cgroup BPF features.");
return (supported = false);
}
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 61ac4df..34fd2a2 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -10,6 +10,7 @@
#include "bpf-devices.h"
#include "bpf-firewall.h"
#include "bpf-foreign.h"
+#include "bpf-restrict-ifaces.h"
#include "bpf-socket-bind.h"
#include "btrfs-util.h"
#include "bus-error.h"
@@ -32,7 +33,8 @@
#include "percent-util.h"
#include "process-util.h"
#include "procfs-util.h"
-#include "restrict-ifaces.h"
+#include "set.h"
+#include "serialize.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
@@ -115,10 +117,16 @@ bool unit_has_host_root_cgroup(Unit *u) {
static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
int r;
- r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
+
+ r = cg_set_attribute(controller, crt->cgroup_path, attribute, value);
if (r < 0)
log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
- strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);
+ strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value);
return r;
}
@@ -172,6 +180,8 @@ void cgroup_context_init(CGroupContext *c) {
.memory_limit = CGROUP_LIMIT_MAX,
+ .memory_zswap_writeback = true,
+
.io_weight = CGROUP_WEIGHT_INVALID,
.startup_io_weight = CGROUP_WEIGHT_INVALID,
@@ -189,6 +199,319 @@ void cgroup_context_init(CGroupContext *c) {
};
}
+int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w) {
+        assert(c);
+        assert(w);
+
+        /* Deep-copies 'w' (including its path string) and puts the copy at the front of
+         * c->io_device_weights. Returns 0 on success, -ENOMEM if either allocation fails. */
+
+        _cleanup_free_ char *path = strdup(w->path);
+        if (!path)
+                return -ENOMEM;
+
+        CGroupIODeviceWeight *copy = new(CGroupIODeviceWeight, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        *copy = (CGroupIODeviceWeight) {
+                .path = TAKE_PTR(path),
+                .weight = w->weight,
+        };
+
+        LIST_PREPEND(device_weights, c->io_device_weights, copy);
+        return 0;
+}
+
+int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l) {
+        assert(c);
+        assert(l);
+
+        /* Deep-copies 'l' (path string plus every per-type limit) and puts the copy at the
+         * front of c->io_device_limits. Returns 0 on success, -ENOMEM on allocation failure. */
+
+        _cleanup_free_ char *path = strdup(l->path);
+        if (!path)
+                return -ENOMEM;
+
+        CGroupIODeviceLimit *copy = new0(CGroupIODeviceLimit, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        copy->path = TAKE_PTR(path);
+
+        CGroupIOLimitType t = 0;
+        while (t < _CGROUP_IO_LIMIT_TYPE_MAX) {
+                copy->limits[t] = l->limits[t];
+                t++;
+        }
+
+        LIST_PREPEND(device_limits, c->io_device_limits, copy);
+        return 0;
+}
+
+int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l) {
+        assert(c);
+        assert(l);
+
+        /* Deep-copies 'l' (including its path string) and puts the copy at the front of
+         * c->io_device_latencies. Returns 0 on success, -ENOMEM if either allocation fails. */
+
+        _cleanup_free_ char *path = strdup(l->path);
+        if (!path)
+                return -ENOMEM;
+
+        CGroupIODeviceLatency *copy = new(CGroupIODeviceLatency, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        *copy = (CGroupIODeviceLatency) {
+                .path = TAKE_PTR(path),
+                .target_usec = l->target_usec,
+        };
+
+        LIST_PREPEND(device_latencies, c->io_device_latencies, copy);
+        return 0;
+}
+
+int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w) {
+        assert(c);
+        assert(w);
+
+        /* Deep-copies 'w' (including its path string) and puts the copy at the front of
+         * c->blockio_device_weights. Returns 0 on success, -ENOMEM if either allocation fails. */
+
+        _cleanup_free_ char *path = strdup(w->path);
+        if (!path)
+                return -ENOMEM;
+
+        CGroupBlockIODeviceWeight *copy = new(CGroupBlockIODeviceWeight, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        *copy = (CGroupBlockIODeviceWeight) {
+                .path = TAKE_PTR(path),
+                .weight = w->weight,
+        };
+
+        LIST_PREPEND(device_weights, c->blockio_device_weights, copy);
+        return 0;
+}
+
+int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b) {
+        _cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL;
+
+        /* Deep-copies 'b' and puts the copy at the front of c->blockio_device_bandwidths.
+         * Returns 0 on success, -ENOMEM if either allocation fails. */
+
+        assert(c);
+        assert(b);
+
+        n = new(CGroupBlockIODeviceBandwidth, 1);
+        if (!n)
+                return -ENOMEM;
+
+        /* The device path has to be duplicated too, like in the sibling *_dup() helpers:
+         * without it the copied entry would carry a NULL path and no longer identify the
+         * device the read/write limits apply to. */
+        *n = (CGroupBlockIODeviceBandwidth) {
+                .path = strdup(b->path),
+                .rbps = b->rbps,
+                .wbps = b->wbps,
+        };
+        if (!n->path)
+                return -ENOMEM;
+
+        LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, TAKE_PTR(n));
+        return 0;
+}
+
+int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a) {
+        assert(c);
+        assert(a);
+
+        /* Deep-copies 'a' (including its path string) and puts the copy at the front of
+         * c->device_allow. Returns 0 on success, -ENOMEM if either allocation fails. */
+
+        _cleanup_free_ char *path = strdup(a->path);
+        if (!path)
+                return -ENOMEM;
+
+        CGroupDeviceAllow *copy = new(CGroupDeviceAllow, 1);
+        if (!copy)
+                return -ENOMEM;
+
+        *copy = (CGroupDeviceAllow) {
+                .path = TAKE_PTR(path),
+                .permissions = a->permissions,
+        };
+
+        LIST_PREPEND(device_allow, c->device_allow, copy);
+        return 0;
+}
+
+static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, const CGroupSocketBindItem *i, CGroupSocketBindItem **h) {
+        _cleanup_free_ CGroupSocketBindItem *n = NULL;
+
+        /* Copies 'i' and puts the copy at the front of the list whose head is stored at *h.
+         * The head is taken by address on purpose: prepending replaces the head, and with a
+         * by-value head only the local copy would be updated, so the list kept in the context
+         * would never see the new item and the item would be leaked.
+         * Returns 0 on success, -ENOMEM if the allocation fails. */
+
+        assert(c);
+        assert(i);
+        assert(h);
+
+        n = new(CGroupSocketBindItem, 1);
+        if (!n)
+                return -ENOMEM;
+
+        *n = (CGroupSocketBindItem) {
+                .address_family = i->address_family,
+                .ip_protocol = i->ip_protocol,
+                .nr_ports = i->nr_ports,
+                .port_min = i->port_min,
+        };
+
+        LIST_PREPEND(socket_bind_items, *h, TAKE_PTR(n));
+        return 0;
+}
+
+/* Adds a copy of 'i' to the context's socket_bind_allow list. */
+int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
+        return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_allow);
+}
+
+/* Adds a copy of 'i' to the context's socket_bind_deny list. */
+int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i) {
+        return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_deny);
+}
+/* Copies the cgroup settings of 'src' into 'dst', field by field. Scalar settings are assigned directly,
+ * while lists, sets and string arrays are duplicated element by element through the respective helpers.
+ *
+ * Returns 0 on success, or a negative errno-style error as soon as one of the duplication steps fails
+ * (-ENOMEM if strv_copy() fails). No rollback is done on failure, i.e. 'dst' keeps whatever was copied up
+ * to that point. */
+int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src) {
+ struct in_addr_prefix *i;
+ char *iface;
+ int r;
+
+ assert(src);
+ assert(dst);
+
+ dst->cpu_accounting = src->cpu_accounting;
+ dst->io_accounting = src->io_accounting;
+ dst->blockio_accounting = src->blockio_accounting;
+ dst->memory_accounting = src->memory_accounting;
+ dst->tasks_accounting = src->tasks_accounting;
+ dst->ip_accounting = src->ip_accounting;
+
+ dst->memory_oom_group = src->memory_oom_group;
+
+ dst->cpu_weight = src->cpu_weight;
+ dst->startup_cpu_weight = src->startup_cpu_weight;
+ dst->cpu_quota_per_sec_usec = src->cpu_quota_per_sec_usec;
+ dst->cpu_quota_period_usec = src->cpu_quota_period_usec;
+ /* NOTE(review): the four cpuset fields below are copied by plain assignment. If their type owns
+  * heap memory, 'dst' and 'src' end up sharing it; confirm this is safe when both contexts are freed. */
+ dst->cpuset_cpus = src->cpuset_cpus;
+ dst->startup_cpuset_cpus = src->startup_cpuset_cpus;
+ dst->cpuset_mems = src->cpuset_mems;
+ dst->startup_cpuset_mems = src->startup_cpuset_mems;
+
+ dst->io_weight = src->io_weight;
+ dst->startup_io_weight = src->startup_io_weight;
+ /* Each source list is walked from its tail backwards. Presumably the *_dup() helpers prepend to the
+  * destination list, in which case this reproduces the original order; verify against the helpers. */
+ LIST_FOREACH_BACKWARDS(device_weights, w, LIST_FIND_TAIL(device_weights, src->io_device_weights)) {
+ r = cgroup_context_add_io_device_weight_dup(dst, w);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(device_limits, l, LIST_FIND_TAIL(device_limits, src->io_device_limits)) {
+ r = cgroup_context_add_io_device_limit_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(device_latencies, l, LIST_FIND_TAIL(device_latencies, src->io_device_latencies)) {
+ r = cgroup_context_add_io_device_latency_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ dst->default_memory_min = src->default_memory_min;
+ dst->default_memory_low = src->default_memory_low;
+ dst->default_startup_memory_low = src->default_startup_memory_low;
+ dst->memory_min = src->memory_min;
+ dst->memory_low = src->memory_low;
+ dst->startup_memory_low = src->startup_memory_low;
+ dst->memory_high = src->memory_high;
+ dst->startup_memory_high = src->startup_memory_high;
+ dst->memory_max = src->memory_max;
+ dst->startup_memory_max = src->startup_memory_max;
+ dst->memory_swap_max = src->memory_swap_max;
+ dst->startup_memory_swap_max = src->startup_memory_swap_max;
+ dst->memory_zswap_max = src->memory_zswap_max;
+ dst->startup_memory_zswap_max = src->startup_memory_zswap_max;
+
+ dst->default_memory_min_set = src->default_memory_min_set;
+ dst->default_memory_low_set = src->default_memory_low_set;
+ dst->default_startup_memory_low_set = src->default_startup_memory_low_set;
+ dst->memory_min_set = src->memory_min_set;
+ dst->memory_low_set = src->memory_low_set;
+ dst->startup_memory_low_set = src->startup_memory_low_set;
+ dst->startup_memory_high_set = src->startup_memory_high_set;
+ dst->startup_memory_max_set = src->startup_memory_max_set;
+ dst->startup_memory_swap_max_set = src->startup_memory_swap_max_set;
+ dst->startup_memory_zswap_max_set = src->startup_memory_zswap_max_set;
+ dst->memory_zswap_writeback = src->memory_zswap_writeback;
+ /* Duplicate the IP address allow/deny sets entry by entry. */
+ SET_FOREACH(i, src->ip_address_allow) {
+ r = in_addr_prefix_add(&dst->ip_address_allow, i);
+ if (r < 0)
+ return r;
+ }
+
+ SET_FOREACH(i, src->ip_address_deny) {
+ r = in_addr_prefix_add(&dst->ip_address_deny, i);
+ if (r < 0)
+ return r;
+ }
+
+ dst->ip_address_allow_reduced = src->ip_address_allow_reduced;
+ dst->ip_address_deny_reduced = src->ip_address_deny_reduced;
+ /* The filter string arrays are only copied if the source array is non-empty; otherwise the
+  * corresponding field of 'dst' is left untouched. */
+ if (!strv_isempty(src->ip_filters_ingress)) {
+ dst->ip_filters_ingress = strv_copy(src->ip_filters_ingress);
+ if (!dst->ip_filters_ingress)
+ return -ENOMEM;
+ }
+
+ if (!strv_isempty(src->ip_filters_egress)) {
+ dst->ip_filters_egress = strv_copy(src->ip_filters_egress);
+ if (!dst->ip_filters_egress)
+ return -ENOMEM;
+ }
+
+ LIST_FOREACH_BACKWARDS(programs, l, LIST_FIND_TAIL(programs, src->bpf_foreign_programs)) {
+ r = cgroup_context_add_bpf_foreign_program_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ SET_FOREACH(iface, src->restrict_network_interfaces) {
+ r = set_put_strdup(&dst->restrict_network_interfaces, iface);
+ if (r < 0)
+ return r;
+ }
+ dst->restrict_network_interfaces_is_allow_list = src->restrict_network_interfaces_is_allow_list;
+
+ dst->cpu_shares = src->cpu_shares;
+ dst->startup_cpu_shares = src->startup_cpu_shares;
+
+ dst->blockio_weight = src->blockio_weight;
+ dst->startup_blockio_weight = src->startup_blockio_weight;
+
+ LIST_FOREACH_BACKWARDS(device_weights, l, LIST_FIND_TAIL(device_weights, src->blockio_device_weights)) {
+ r = cgroup_context_add_block_io_device_weight_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(device_bandwidths, l, LIST_FIND_TAIL(device_bandwidths, src->blockio_device_bandwidths)) {
+ r = cgroup_context_add_block_io_device_bandwidth_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ dst->memory_limit = src->memory_limit;
+
+ dst->device_policy = src->device_policy;
+ LIST_FOREACH_BACKWARDS(device_allow, l, LIST_FIND_TAIL(device_allow, src->device_allow)) {
+ r = cgroup_context_add_device_allow_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_allow)) {
+ r = cgroup_context_add_socket_bind_item_allow_dup(dst, l);
+ if (r < 0)
+ return r;
+
+ }
+
+ LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_deny)) {
+ r = cgroup_context_add_socket_bind_item_deny_dup(dst, l);
+ if (r < 0)
+ return r;
+ }
+
+ dst->tasks_max = src->tasks_max;
+
+ return 0;
+}
+
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
assert(c);
assert(a);
@@ -306,10 +629,11 @@ void cgroup_context_done(CGroupContext *c) {
/* Reads the attribute 'file' of the "memory" controller from the unit's cgroup and stores it as uint64_t
 * in 'ret'. Returns -EOWNERDEAD if there is no cgroup to read from, otherwise whatever
 * cg_get_attribute_as_uint64() returns. */
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
assert(u);
- if (!u->cgroup_realized)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path) /* no cgroup runtime object, or no cgroup path recorded in it */
return -EOWNERDEAD;
- return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret);
+ return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret);
}
static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
@@ -425,11 +749,12 @@ static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_
#define FORMAT_CGROUP_DIFF_MAX 128
-static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
+static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) {
uint64_t kval, sval;
int r;
assert(u);
+ assert(property_name);
assert(buf);
assert(l > 0);
@@ -499,18 +824,9 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
_cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
CGroupContext *c;
struct in_addr_prefix *iaai;
-
- char cda[FORMAT_CGROUP_DIFF_MAX];
- char cdb[FORMAT_CGROUP_DIFF_MAX];
- char cdc[FORMAT_CGROUP_DIFF_MAX];
- char cdd[FORMAT_CGROUP_DIFF_MAX];
- char cde[FORMAT_CGROUP_DIFF_MAX];
- char cdf[FORMAT_CGROUP_DIFF_MAX];
- char cdg[FORMAT_CGROUP_DIFF_MAX];
- char cdh[FORMAT_CGROUP_DIFF_MAX];
- char cdi[FORMAT_CGROUP_DIFF_MAX];
- char cdj[FORMAT_CGROUP_DIFF_MAX];
- char cdk[FORMAT_CGROUP_DIFF_MAX];
+ char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX],
+ cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], cdh[FORMAT_CGROUP_DIFF_MAX],
+ cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX];
assert(u);
assert(f);
@@ -564,6 +880,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
"%sStartupMemorySwapMax: %" PRIu64 "%s\n"
"%sMemoryZSwapMax: %" PRIu64 "%s\n"
"%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
+ "%sMemoryZSwapWriteback: %s\n"
"%sMemoryLimit: %" PRIu64 "\n"
"%sTasksMax: %" PRIu64 "\n"
"%sDevicePolicy: %s\n"
@@ -597,17 +914,18 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, c->startup_blockio_weight,
prefix, c->default_memory_min,
prefix, c->default_memory_low,
- prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
- prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
- prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"),
- prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"),
- prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"),
- prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"),
- prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"),
- prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"),
- prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"),
- prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"),
- prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"),
+ prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)),
+ prefix, c->memory_low, format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)),
+ prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)),
+ prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)),
+ prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)),
+ prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)),
+ prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)),
+ prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)),
+ prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)),
+ prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)),
+ prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)),
+ prefix, yes_no(c->memory_zswap_writeback),
prefix, c->memory_limit,
prefix, cgroup_tasks_max_resolve(&c->tasks_max),
prefix, cgroup_device_policy_to_string(c->device_policy),
@@ -811,7 +1129,7 @@ int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_typ
assert(bpffs_path);
if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized.");
d = strdup(bpffs_path);
if (!d)
@@ -867,12 +1185,13 @@ static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data,
assert(u);
assert(name);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return;
- r = cg_set_xattr(u->cgroup_path, name, data, size, 0);
+ r = cg_set_xattr(crt->cgroup_path, name, data, size, 0);
if (r < 0)
- log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+ log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}
static void unit_remove_xattr_graceful(Unit *u, const char *name) {
@@ -881,12 +1200,13 @@ static void unit_remove_xattr_graceful(Unit *u, const char *name) {
assert(u);
assert(name);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return;
- r = cg_remove_xattr(u->cgroup_path, name);
+ r = cg_remove_xattr(crt->cgroup_path, name);
if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
- log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+ log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path));
}
static void cgroup_oomd_xattr_apply(Unit *u) {
@@ -1013,9 +1333,13 @@ static void cgroup_survive_xattr_apply(Unit *u) {
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
if (u->survive_final_kill_signal) {
r = cg_set_xattr(
- u->cgroup_path,
+ crt->cgroup_path,
"user.survive_final_kill_signal",
"1",
1,
@@ -1023,7 +1347,7 @@ static void cgroup_survive_xattr_apply(Unit *u) {
/* user xattr support was added in kernel v5.7 */
if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
r = cg_set_xattr(
- u->cgroup_path,
+ crt->cgroup_path,
"trusted.survive_final_kill_signal",
"1",
1,
@@ -1033,7 +1357,7 @@ static void cgroup_survive_xattr_apply(Unit *u) {
r,
"Failed to set 'survive_final_kill_signal' xattr on control "
"group %s, ignoring: %m",
- empty_to_root(u->cgroup_path));
+ empty_to_root(crt->cgroup_path));
} else {
unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
@@ -1170,6 +1494,12 @@ usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution,
static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
usec_t new_period;
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return USEC_INFINITY;
+
if (quota == USEC_INFINITY)
/* Always use default period for infinity quota. */
return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
@@ -1182,10 +1512,10 @@ static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t qu
new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
if (new_period != period) {
- log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
+ log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
"Clamping CPU interval for cpu.max: period is now %s",
FORMAT_TIMESPAN(new_period, 1));
- u->warned_clamping_cpu_quota_period = true;
+ crt->warned_clamping_cpu_quota_period = true;
}
return new_period;
@@ -1205,17 +1535,25 @@ static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
bool is_idle;
const char *idle_val;
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return;
+
is_idle = weight == CGROUP_WEIGHT_IDLE;
idle_val = one_zero(is_idle);
- r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val);
+ r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val);
if (r < 0 && (r != -ENOENT || is_idle))
log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
- "cpu.idle", empty_to_root(u->cgroup_path), idle_val);
+ "cpu.idle", empty_to_root(crt->cgroup_path), idle_val);
}
static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
+ assert(u);
+
period = cgroup_cpu_adjust_period_and_log(u, period, quota);
if (quota != USEC_INFINITY)
xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
@@ -1331,6 +1669,12 @@ static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t i
uint64_t bfq_weight;
int r;
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
+
/* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
* See also: https://github.com/systemd/systemd/pull/13335 and
* https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
@@ -1343,7 +1687,7 @@ static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t i
else
xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
- r = cg_set_attribute(controller, u->cgroup_path, p, buf);
+ r = cg_set_attribute(controller, crt->cgroup_path, p, buf);
/* FIXME: drop this when kernels prior
* 795fe54c2a82 ("bfq: Add per-device weight") v5.4
@@ -1367,13 +1711,19 @@ static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_
dev_t dev;
int r, r1, r2;
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return;
+
if (lookup_block_device(dev_path, &dev) < 0)
return;
r1 = set_bfq_weight(u, "io", dev, io_weight);
xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
- r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
+ r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf);
/* Look at the configured device, when both fail, prefer io.weight errno. */
r = r2 == -EOPNOTSUPP ? r1 : r2;
@@ -1381,7 +1731,7 @@ static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_
if (r < 0)
log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
- empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
+ empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
}
static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
@@ -1498,7 +1848,8 @@ void unit_modify_nft_set(Unit *u, bool add) {
if (cg_all_unified() <= 0)
return;
- if (u->cgroup_id == 0)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || crt->cgroup_id == 0)
return;
if (!u->manager->fw_ctx) {
@@ -1515,15 +1866,15 @@ void unit_modify_nft_set(Unit *u, bool add) {
if (nft_set->source != NFT_SET_SOURCE_CGROUP)
continue;
- uint64_t element = u->cgroup_id;
+ uint64_t element = crt->cgroup_id;
r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
if (r < 0)
log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
- add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+ add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
else
log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
- add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+ add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id);
}
}
@@ -1536,18 +1887,20 @@ static void cgroup_apply_socket_bind(Unit *u) {
/* Invokes the BPF network interface restriction installer for 'u'. The installer's return value is
 * explicitly discarded, i.e. failures are not propagated to the caller. */
static void cgroup_apply_restrict_network_interfaces(Unit *u) {
assert(u);
- (void) restrict_network_interfaces_install(u);
+ (void) bpf_restrict_ifaces_install(u);
}
static int cgroup_apply_devices(Unit *u) {
_cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
- const char *path;
CGroupContext *c;
CGroupDevicePolicy policy;
int r;
assert_se(c = unit_get_cgroup_context(u));
- assert_se(path = u->cgroup_path);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
policy = c->device_policy;
@@ -1561,9 +1914,9 @@ static int cgroup_apply_devices(Unit *u) {
* EINVAL here. */
if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
- r = cg_set_attribute("devices", path, "devices.deny", "a");
+ r = cg_set_attribute("devices", crt->cgroup_path, "devices.deny", "a");
else
- r = cg_set_attribute("devices", path, "devices.allow", "a");
+ r = cg_set_attribute("devices", crt->cgroup_path, "devices.allow", "a");
if (r < 0)
log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to reset devices.allow/devices.deny: %m");
@@ -1571,10 +1924,14 @@ static int cgroup_apply_devices(Unit *u) {
bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
(policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
- if (allow_list_static)
- (void) bpf_devices_allow_list_static(prog, path);
- bool any = allow_list_static;
+ bool any = false;
+ if (allow_list_static) {
+ r = bpf_devices_allow_list_static(prog, crt->cgroup_path);
+ if (r > 0)
+ any = true;
+ }
+
LIST_FOREACH(device_allow, a, c->device_allow) {
const char *val;
@@ -1582,22 +1939,22 @@ static int cgroup_apply_devices(Unit *u) {
continue;
if (path_startswith(a->path, "/dev/"))
- r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions);
+ r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions);
else if ((val = startswith(a->path, "block-")))
- r = bpf_devices_allow_list_major(prog, path, val, 'b', a->permissions);
+ r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions);
else if ((val = startswith(a->path, "char-")))
- r = bpf_devices_allow_list_major(prog, path, val, 'c', a->permissions);
+ r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions);
else {
log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
continue;
}
- if (r >= 0)
+ if (r > 0)
any = true;
}
if (prog && !any) {
- log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
+ log_unit_warning(u, "No devices matched by device filter.");
/* The kernel verifier would reject a program we would build with the normal intro and outro
but no allow-listing rules (outro would contain an unreachable instruction for successful
@@ -1605,7 +1962,7 @@ static int cgroup_apply_devices(Unit *u) {
policy = CGROUP_DEVICE_POLICY_STRICT;
}
- r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed);
+ r = bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed);
if (r < 0) {
static bool warned = false;
@@ -1652,9 +2009,9 @@ static void cgroup_context_apply(
CGroupMask apply_mask,
ManagerState state) {
+ bool is_host_root, is_local_root;
const char *path;
CGroupContext *c;
- bool is_host_root, is_local_root;
int r;
assert(u);
@@ -1669,7 +2026,12 @@ static void cgroup_context_apply(
is_host_root = unit_has_host_root_cgroup(u);
assert_se(c = unit_get_cgroup_context(u));
- assert_se(path = u->cgroup_path);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return;
+
+ path = crt->cgroup_path;
if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
path = "/";
@@ -1879,6 +2241,7 @@ static void cgroup_context_apply(
cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max);
(void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+ (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback));
} else {
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
@@ -2137,20 +2500,24 @@ CGroupMask unit_get_members_mask(Unit *u) {
/* Returns the mask of controllers all of the unit's children require, merged */
- if (u->cgroup_members_mask_valid)
- return u->cgroup_members_mask; /* Use cached value if possible */
-
- u->cgroup_members_mask = 0;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->cgroup_members_mask_valid)
+ return crt->cgroup_members_mask; /* Use cached value if possible */
+ CGroupMask m = 0;
if (u->type == UNIT_SLICE) {
Unit *member;
UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
- u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
+ m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
+ }
+
+ if (crt) {
+ crt->cgroup_members_mask = m;
+ crt->cgroup_members_mask_valid = true;
}
- u->cgroup_members_mask_valid = true;
- return u->cgroup_members_mask;
+ return m;
}
CGroupMask unit_get_siblings_mask(Unit *u) {
@@ -2236,8 +2603,12 @@ void unit_invalidate_cgroup_members_masks(Unit *u) {
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
/* Recurse invalidate the member masks cache all the way up the tree */
- u->cgroup_members_mask_valid = false;
+ crt->cgroup_members_mask_valid = false;
slice = UNIT_GET_SLICE(u);
if (slice)
@@ -2249,11 +2620,12 @@ const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
/* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
while (u) {
-
- if (u->cgroup_path &&
- u->cgroup_realized &&
- FLAGS_SET(u->cgroup_realized_mask, mask))
- return u->cgroup_path;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt &&
+ crt->cgroup_path &&
+ crt->cgroup_realized &&
+ FLAGS_SET(crt->cgroup_realized_mask, mask))
+ return crt->cgroup_path;
u = UNIT_GET_SLICE(u);
}
@@ -2303,27 +2675,34 @@ int unit_default_cgroup_path(const Unit *u, char **ret) {
/* Sets the cgroup path of unit 'u' to a copy of 'path' and registers that copy in the manager's
 * cgroup_unit hashmap.
 *
 * Returns 0 if the unit's cgroup runtime object already carries this path, 1 if the path was newly set,
 * and a negative errno-style error on failure (-ENOMEM, or the error returned by hashmap_put()). The
 * previous cgroup is released before the new path is duplicated and registered, hence it is not restored
 * if one of the later steps fails. */
int unit_set_cgroup_path(Unit *u, const char *path) {
_cleanup_free_ char *p = NULL;
+ CGroupRuntime *crt;
int r;
assert(u);
- if (streq_ptr(u->cgroup_path, path))
+ crt = unit_get_cgroup_runtime(u);
+ /* Nothing to do if a runtime object exists and already holds this path. */
+ if (crt && streq_ptr(crt->cgroup_path, path))
return 0;
+ unit_release_cgroup(u);
+ /* Fetch the runtime object again after the release above; the new path is stored in it below. */
+ crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ return -ENOMEM;
+ /* With a NULL 'path' nothing is registered and cgroup_path ends up NULL. */
if (path) {
p = strdup(path);
if (!p)
return -ENOMEM;
- }
- if (p) {
r = hashmap_put(u->manager->cgroup_unit, p, u);
if (r < 0)
return r;
}
- unit_release_cgroup(u);
- u->cgroup_path = TAKE_PTR(p);
+ assert(!crt->cgroup_path);
+ crt->cgroup_path = TAKE_PTR(p);
return 1;
}
@@ -2337,10 +2716,11 @@ int unit_watch_cgroup(Unit *u) {
/* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
* cgroupv2 is available. */
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
- if (u->cgroup_control_inotify_wd >= 0)
+ if (crt->cgroup_control_inotify_wd >= 0)
return 0;
/* Only applies to the unified hierarchy */
@@ -2358,30 +2738,29 @@ int unit_watch_cgroup(Unit *u) {
if (r < 0)
return log_oom();
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.events", &events);
if (r < 0)
return log_oom();
- u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
- if (u->cgroup_control_inotify_wd < 0) {
+ crt->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (crt->cgroup_control_inotify_wd < 0) {
if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
* is not an error */
return 0;
- return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
}
- r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
+ r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd), u);
if (r < 0)
- return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+ return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
return 0;
}
int unit_watch_cgroup_memory(Unit *u) {
_cleanup_free_ char *events = NULL;
- CGroupContext *c;
int r;
assert(u);
@@ -2389,10 +2768,11 @@ int unit_watch_cgroup_memory(Unit *u) {
/* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
* cgroupv2 is available. */
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
- c = unit_get_cgroup_context(u);
+ CGroupContext *c = unit_get_cgroup_context(u);
if (!c)
return 0;
@@ -2407,7 +2787,7 @@ int unit_watch_cgroup_memory(Unit *u) {
if (u->type == UNIT_SLICE)
return 0;
- if (u->cgroup_memory_inotify_wd >= 0)
+ if (crt->cgroup_memory_inotify_wd >= 0)
return 0;
/* Only applies to the unified hierarchy */
@@ -2421,23 +2801,23 @@ int unit_watch_cgroup_memory(Unit *u) {
if (r < 0)
return log_oom();
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "memory.events", &events);
if (r < 0)
return log_oom();
- u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
- if (u->cgroup_memory_inotify_wd < 0) {
+ crt->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (crt->cgroup_memory_inotify_wd < 0) {
if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
* is not an error */
return 0;
- return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path));
}
- r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
+ r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd), u);
if (r < 0)
- return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+ return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path));
return 0;
}
@@ -2448,12 +2828,15 @@ int unit_pick_cgroup_path(Unit *u) {
assert(u);
- if (u->cgroup_path)
- return 0;
-
if (!UNIT_HAS_CGROUP_CONTEXT(u))
return -EINVAL;
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ return -ENOMEM;
+ if (crt->cgroup_path)
+ return 0;
+
r = unit_default_cgroup_path(u, &path);
if (r < 0)
return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
@@ -2483,30 +2866,35 @@ static int unit_update_cgroup(
if (!UNIT_HAS_CGROUP_CONTEXT(u))
return 0;
+ if (u->freezer_state != FREEZER_RUNNING)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit.");
+
/* Figure out our cgroup path */
r = unit_pick_cgroup_path(u);
if (r < 0)
return r;
+ CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
+
/* First, create our own group */
- r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
+ r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, crt->cgroup_path);
if (r < 0)
- return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
+ return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path));
created = r;
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
uint64_t cgroup_id = 0;
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path);
if (r == 0) {
r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
if (r < 0)
log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
} else
- log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
- u->cgroup_id = cgroup_id;
+ crt->cgroup_id = cgroup_id;
}
/* Start watching it */
@@ -2515,23 +2903,23 @@ static int unit_update_cgroup(
/* For v2 we preserve enabled controllers in delegated units, adjust others,
* for v1 we figure out which controller hierarchies need migration. */
- if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
+ if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) {
CGroupMask result_mask = 0;
/* Enable all controllers we need */
- r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
+ r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, crt->cgroup_path, &result_mask);
if (r < 0)
- log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
/* Remember what's actually enabled now */
- u->cgroup_enabled_mask = result_mask;
+ crt->cgroup_enabled_mask = result_mask;
- migrate_mask = u->cgroup_realized_mask ^ target_mask;
+ migrate_mask = crt->cgroup_realized_mask ^ target_mask;
}
/* Keep track that this is now realized */
- u->cgroup_realized = true;
- u->cgroup_realized_mask = target_mask;
+ crt->cgroup_realized = true;
+ crt->cgroup_realized_mask = target_mask;
/* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
*
@@ -2541,14 +2929,14 @@ static int unit_update_cgroup(
* delegated units.
*/
if (cg_all_unified() == 0) {
- r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
+ r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, crt->cgroup_path, migrate_callback, u);
if (r < 0)
- log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(crt->cgroup_path));
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
- r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
+ r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, crt->cgroup_path, !is_root_slice);
if (r < 0)
- log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(crt->cgroup_path));
}
/* Set attributes */
@@ -2578,11 +2966,12 @@ static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suf
if (!u->manager->system_bus)
return -EIO;
- if (!u->cgroup_path)
- return -EINVAL;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
/* Determine this unit's cgroup path relative to our cgroup root */
- pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
+ pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root);
if (!pp)
return -EINVAL;
@@ -2626,10 +3015,12 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
if (r < 0)
return r;
+ CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
+
if (isempty(suffix_path))
- p = u->cgroup_path;
+ p = crt->cgroup_path;
else {
- joined = path_join(u->cgroup_path, suffix_path);
+ joined = path_join(crt->cgroup_path, suffix_path);
if (!joined)
return -ENOMEM;
@@ -2701,7 +3092,7 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
continue;
/* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
- if (delegated_mask & u->cgroup_realized_mask & bit) {
+ if (delegated_mask & crt->cgroup_realized_mask & bit) {
r = cg_attach(cgroup_controller_to_string(c), p, pid->pid);
if (r >= 0)
continue; /* Success! */
@@ -2734,6 +3125,10 @@ static bool unit_has_mask_realized(
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return false;
+
/* Returns true if this unit is fully realized. We check four things:
*
* 1. Whether the cgroup was created at all
@@ -2749,10 +3144,10 @@ static bool unit_has_mask_realized(
* enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
* simply don't matter. */
- return u->cgroup_realized &&
- ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
- ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
- u->cgroup_invalidated_mask == 0;
+ return crt->cgroup_realized &&
+ ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
+ ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
+ crt->cgroup_invalidated_mask == 0;
}
static bool unit_has_mask_disables_realized(
@@ -2762,14 +3157,18 @@ static bool unit_has_mask_disables_realized(
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return true;
+
/* Returns true if all controllers which should be disabled are indeed disabled.
*
* Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
* already removed. */
- return !u->cgroup_realized ||
- (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
- FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
+ return !crt->cgroup_realized ||
+ (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+ FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
}
static bool unit_has_mask_enables_realized(
@@ -2779,14 +3178,18 @@ static bool unit_has_mask_enables_realized(
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return false;
+
/* Returns true if all controllers which should be enabled are indeed enabled.
*
* Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
* we want to add is already added. */
- return u->cgroup_realized &&
- ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
- ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
+ return crt->cgroup_realized &&
+ ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) &&
+ ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2);
}
void unit_add_to_cgroup_realize_queue(Unit *u) {
@@ -2835,8 +3238,10 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
return 0;
- new_target_mask = u->cgroup_realized_mask | target_mask;
- new_enable_mask = u->cgroup_enabled_mask | enable_mask;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+
+ new_target_mask = (crt ? crt->cgroup_realized_mask : 0) | target_mask;
+ new_enable_mask = (crt ? crt->cgroup_enabled_mask : 0) | enable_mask;
return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
}
@@ -2855,9 +3260,13 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
int r;
+ CGroupRuntime *rt = unit_get_cgroup_runtime(m);
+ if (!rt)
+ continue;
+
/* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
* holding any controllers open anyway. */
- if (!m->cgroup_realized)
+ if (!rt->cgroup_realized)
continue;
/* We must disable those below us first in order to release the controller. */
@@ -2871,8 +3280,8 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
continue;
- new_target_mask = m->cgroup_realized_mask & target_mask;
- new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+ new_target_mask = rt->cgroup_realized_mask & target_mask;
+ new_enable_mask = rt->cgroup_enabled_mask & enable_mask;
r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
if (r < 0)
@@ -2959,8 +3368,10 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
if (r < 0)
return r;
+ CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u));
+
/* Now, reset the invalidation mask */
- u->cgroup_invalidated_mask = 0;
+ crt->cgroup_invalidated_mask = 0;
return 0;
}
@@ -3011,11 +3422,13 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) {
* masks. */
do {
- Unit *m;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
/* Children of u likely changed when we're called */
- u->cgroup_members_mask_valid = false;
+ if (crt)
+ crt->cgroup_members_mask_valid = false;
+ Unit *m;
UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
/* No point in doing cgroup application for units without active processes. */
@@ -3024,7 +3437,8 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) {
/* We only enqueue siblings if they were realized once at least, in the main
* hierarchy. */
- if (!m->cgroup_realized)
+ crt = unit_get_cgroup_runtime(m);
+ if (!crt || !crt->cgroup_realized)
continue;
/* If the unit doesn't need any new controllers and has current ones
@@ -3075,26 +3489,50 @@ void unit_release_cgroup(Unit *u) {
/* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
* when we close down everything for reexecution, where we really want to leave the cgroup in place. */
- if (u->cgroup_path) {
- (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
- u->cgroup_path = mfree(u->cgroup_path);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
+ if (crt->cgroup_path) {
+ (void) hashmap_remove(u->manager->cgroup_unit, crt->cgroup_path);
+ crt->cgroup_path = mfree(crt->cgroup_path);
}
- if (u->cgroup_control_inotify_wd >= 0) {
- if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
- log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
+ if (crt->cgroup_control_inotify_wd >= 0) {
+ if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_control_inotify_wd) < 0)
+ log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt->cgroup_control_inotify_wd, u->id);
- (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
- u->cgroup_control_inotify_wd = -1;
+ (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd));
+ crt->cgroup_control_inotify_wd = -1;
}
- if (u->cgroup_memory_inotify_wd >= 0) {
- if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
- log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
+ if (crt->cgroup_memory_inotify_wd >= 0) {
+ if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_memory_inotify_wd) < 0)
+ log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt->cgroup_memory_inotify_wd, u->id);
- (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
- u->cgroup_memory_inotify_wd = -1;
+ (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd));
+ crt->cgroup_memory_inotify_wd = -1;
}
+
+ *(CGroupRuntime**) ((uint8_t*) u + UNIT_VTABLE(u)->cgroup_runtime_offset) = cgroup_runtime_free(crt);
+}
+
+int unit_cgroup_is_empty(Unit *u) {
+ int r;
+
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return -ENXIO;
+ if (!crt->cgroup_path)
+ return -EOWNERDEAD;
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty, ignoring: %m", empty_to_root(crt->cgroup_path));
+
+ return r;
}
bool unit_maybe_release_cgroup(Unit *u) {
@@ -3102,17 +3540,16 @@ bool unit_maybe_release_cgroup(Unit *u) {
assert(u);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return true;
- /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
- * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
- * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
- * up later. */
- r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
- if (r < 0)
- log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
- else if (r == 1) {
+ /* Don't release the cgroup if there are still processes under it. If we get notified later when all
+ * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as
+ * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up
+ * and cleaned up later. */
+ r = unit_cgroup_is_empty(u);
+ if (r == 1) {
unit_release_cgroup(u);
return true;
}
@@ -3127,28 +3564,32 @@ void unit_prune_cgroup(Unit *u) {
assert(u);
/* Removes the cgroup, if empty and possible, and stops watching it. */
-
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return;
- (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
+ /* Cache the last CPU and memory usage values before we destroy the cgroup */
+ (void) unit_get_cpu_usage(u, /* ret = */ NULL);
+
+ for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++)
+ (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL);
#if BPF_FRAMEWORK
- (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
+ (void) bpf_restrict_fs_cleanup(u); /* Remove cgroup from the global LSM BPF map */
#endif
unit_modify_nft_set(u, /* add = */ false);
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
- r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
+ r = cg_trim_everywhere(u->manager->cgroup_supported, crt->cgroup_path, !is_root_slice);
if (r < 0)
/* One reason we could have failed here is, that the cgroup still contains a process.
* However, if the cgroup becomes removable at a later time, it might be removed when
* the containing slice is stopped. So even if we failed now, this unit shouldn't assume
* that the cgroup is still realized the next time it is started. Do not return early
* on error, continue cleanup. */
- log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path));
if (is_root_slice)
return;
@@ -3156,11 +3597,15 @@ void unit_prune_cgroup(Unit *u) {
if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
return;
- u->cgroup_realized = false;
- u->cgroup_realized_mask = 0;
- u->cgroup_enabled_mask = 0;
+ crt = unit_get_cgroup_runtime(u); /* The above might have destroyed the runtime object, let's see if it's still there */
+ if (!crt)
+ return;
+
+ crt->cgroup_realized = false;
+ crt->cgroup_realized_mask = 0;
+ crt->cgroup_enabled_mask = 0;
- u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed);
+ crt->bpf_device_control_installed = bpf_program_free(crt->bpf_device_control_installed);
}
int unit_search_main_pid(Unit *u, PidRef *ret) {
@@ -3171,17 +3616,20 @@ int unit_search_main_pid(Unit *u, PidRef *ret) {
assert(u);
assert(ret);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENXIO;
- r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
+ r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, &f);
if (r < 0)
return r;
for (;;) {
_cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
- r = cg_read_pidref(f, &npidref);
+ /* cg_read_pidref() will return an error on unmapped PIDs.
+ * We can't reasonably deal with units that contain those. */
+ r = cg_read_pidref(f, &npidref, CGROUP_DONT_SKIP_UNMAPPED);
if (r < 0)
return r;
if (r == 0)
@@ -3223,7 +3671,7 @@ static int unit_watch_pids_in_path(Unit *u, const char *path) {
for (;;) {
_cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
- r = cg_read_pidref(f, &pid);
+ r = cg_read_pidref(f, &pid, /* flags = */ 0);
if (r == 0)
break;
if (r < 0) {
@@ -3270,7 +3718,8 @@ int unit_synthesize_cgroup_empty_event(Unit *u) {
* support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
* get as notification source as soon as we stopped having any useful PIDs to watch for. */
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENOENT;
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
@@ -3296,7 +3745,8 @@ int unit_watch_all_pids(Unit *u) {
* get reliable cgroup empty notifications: we try to use
* SIGCHLD as replacement. */
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENOENT;
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
@@ -3305,7 +3755,7 @@ int unit_watch_all_pids(Unit *u) {
if (r > 0) /* On unified we can use proper notifications */
return 0;
- return unit_watch_pids_in_path(u, u->cgroup_path);
+ return unit_watch_pids_in_path(u, crt->cgroup_path);
}
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
@@ -3370,15 +3820,8 @@ void unit_add_to_cgroup_empty_queue(Unit *u) {
return;
/* Let's verify that the cgroup is really empty */
- if (!u->cgroup_path)
- return;
-
- r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
- if (r < 0) {
- log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
- return;
- }
- if (r == 0)
+ r = unit_cgroup_is_empty(u);
+ if (r <= 0)
return;
LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
@@ -3406,7 +3849,10 @@ int unit_check_oomd_kill(Unit *u) {
uint64_t n = 0;
int r;
- if (!u->cgroup_path)
+ assert(u);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
r = cg_all_unified();
@@ -3415,7 +3861,7 @@ int unit_check_oomd_kill(Unit *u) {
else if (r == 0)
return 0;
- r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value);
+ r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_ooms", &value);
if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
return r;
@@ -3425,15 +3871,15 @@ int unit_check_oomd_kill(Unit *u) {
return r;
}
- increased = n > u->managed_oom_kill_last;
- u->managed_oom_kill_last = n;
+ increased = n > crt->managed_oom_kill_last;
+ crt->managed_oom_kill_last = n;
if (!increased)
return 0;
n = 0;
value = mfree(value);
- r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value);
+ r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_kill", &value);
if (r >= 0 && !isempty(value))
(void) safe_atou64(value, &n);
@@ -3460,10 +3906,16 @@ int unit_check_oom(Unit *u) {
uint64_t c;
int r;
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
- r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+ r = cg_get_keyed_attribute(
+ "memory",
+ crt->cgroup_path,
+ "memory.events",
+ STRV_MAKE("oom_kill"),
+ &oom_kill);
if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
c = 0;
else if (r < 0)
@@ -3474,8 +3926,8 @@ int unit_check_oom(Unit *u) {
return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
}
- increased = c > u->oom_kill_last;
- u->oom_kill_last = c;
+ increased = c > crt->oom_kill_last;
+ crt->oom_kill_last = c;
if (!increased)
return 0;
@@ -3525,7 +3977,9 @@ static void unit_add_to_cgroup_oom_queue(Unit *u) {
if (u->in_cgroup_oom_queue)
return;
- if (!u->cgroup_path)
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return;
LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
@@ -3541,7 +3995,7 @@ static void unit_add_to_cgroup_oom_queue(Unit *u) {
return;
}
- r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+ r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM);
if (r < 0) {
log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
return;
@@ -3562,11 +4016,16 @@ static int unit_check_cgroup_events(Unit *u) {
assert(u);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
- r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
- STRV_MAKE("populated", "frozen"), values);
+ r = cg_get_keyed_attribute_graceful(
+ SYSTEMD_CGROUP_CONTROLLER,
+ crt->cgroup_path,
+ "cgroup.events",
+ STRV_MAKE("populated", "frozen"),
+ values);
if (r < 0)
return r;
@@ -3580,8 +4039,10 @@ static int unit_check_cgroup_events(Unit *u) {
unit_add_to_cgroup_empty_queue(u);
}
- /* Disregard freezer state changes due to operations not initiated by us */
- if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+ /* Disregard freezer state changes due to operations not initiated by us.
+ * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and
+ * https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */
+ if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)) {
if (streq(values[1], "0"))
unit_thawed(u);
else
@@ -3670,7 +4131,7 @@ static int cg_bpf_mask_supported(CGroupMask *ret) {
mask |= CGROUP_MASK_BPF_SOCKET_BIND;
/* BPF-based cgroup_skb/{egress|ingress} hooks */
- r = restrict_network_interfaces_supported();
+ r = bpf_restrict_ifaces_supported();
if (r < 0)
return r;
if (r > 0)
@@ -3747,7 +4208,7 @@ int manager_setup_cgroup(Manager *m) {
/* Schedule cgroup empty checks early, but after having processed service notification messages or
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
* notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
- r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
@@ -3776,7 +4237,7 @@ int manager_setup_cgroup(Manager *m) {
/* Process cgroup empty notifications early. Note that when this event is dispatched it'll
* just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
* handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
- r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
@@ -3885,7 +4346,7 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
}
}
-Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) {
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) {
_cleanup_free_ char *cgroup = NULL;
assert(m);
@@ -3896,7 +4357,7 @@ Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) {
return manager_get_unit_by_cgroup(m, cgroup);
}
-Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) {
+Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid) {
Unit *u, **array;
assert(m);
@@ -3915,7 +4376,7 @@ Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) {
return NULL;
}
-Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
+Unit *manager_get_unit_by_pidref(Manager *m, const PidRef *pid) {
Unit *u;
assert(m);
@@ -3994,7 +4455,8 @@ int unit_get_memory_available(Unit *u, uint64_t *ret) {
if (!unit_context)
return -ENODATA;
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
continue;
(void) unit_get_memory_current(u, &current);
@@ -4026,21 +4488,22 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) {
if (!UNIT_CGROUP_BOOL(u, memory_accounting))
return -ENODATA;
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENODATA;
/* The root cgroup doesn't expose this information, let's get it from /proc instead */
if (unit_has_host_root_cgroup(u))
return procfs_memory_get_used(ret);
- if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
+ if ((crt->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
return -ENODATA;
r = cg_all_unified();
if (r < 0)
return r;
- return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
+ return cg_get_attribute_as_uint64("memory", crt->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
}
int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
@@ -4063,7 +4526,10 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin
if (!UNIT_CGROUP_BOOL(u, memory_accounting))
return -ENODATA;
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return -ENODATA;
+ if (!crt->cgroup_path)
/* If the cgroup is already gone, we try to find the last cached value. */
goto finish;
@@ -4071,7 +4537,7 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin
if (unit_has_host_root_cgroup(u))
return -ENODATA;
- if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY))
+ if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_MEMORY))
return -ENODATA;
r = cg_all_unified();
@@ -4080,14 +4546,14 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin
if (r == 0)
return -ENODATA;
- r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes);
+ r = cg_get_attribute_as_uint64("memory", crt->cgroup_path, attributes_table[metric], &bytes);
if (r < 0 && r != -ENODATA)
return r;
updated = r >= 0;
finish:
if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
- uint64_t *last = &u->memory_accounting_last[metric];
+ uint64_t *last = &crt->memory_accounting_last[metric];
if (updated)
*last = bytes;
@@ -4112,17 +4578,18 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret) {
if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
return -ENODATA;
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENODATA;
/* The root cgroup doesn't expose this information, let's get it from /proc instead */
if (unit_has_host_root_cgroup(u))
return procfs_tasks_get_current(ret);
- if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
+ if ((crt->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
return -ENODATA;
- return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
+ return cg_get_attribute_as_uint64("pids", crt->cgroup_path, "pids.current", ret);
}
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
@@ -4132,7 +4599,8 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
assert(u);
assert(ret);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENODATA;
/* The root cgroup doesn't expose this information, let's get it from /proc instead */
@@ -4140,7 +4608,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
return procfs_cpu_get_usage(ret);
/* Requisite controllers for CPU accounting are not enabled */
- if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
+ if ((get_cpu_accounting_mask() & ~crt->cgroup_realized_mask) != 0)
return -ENODATA;
r = cg_all_unified();
@@ -4150,7 +4618,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
_cleanup_free_ char *val = NULL;
uint64_t us;
- r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
+ r = cg_get_keyed_attribute("cpu", crt->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
if (IN_SET(r, -ENOENT, -ENXIO))
return -ENODATA;
if (r < 0)
@@ -4162,7 +4630,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
ns = us * NSEC_PER_USEC;
} else
- return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
+ return cg_get_attribute_as_uint64("cpuacct", crt->cgroup_path, "cpuacct.usage", ret);
*ret = ns;
return 0;
@@ -4178,27 +4646,31 @@ int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
* started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
* call this function with a NULL return value. */
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -ENODATA;
+
if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
return -ENODATA;
r = unit_get_cpu_usage_raw(u, &ns);
- if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
+ if (r == -ENODATA && crt->cpu_usage_last != NSEC_INFINITY) {
/* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
* cached value. */
if (ret)
- *ret = u->cpu_usage_last;
+ *ret = crt->cpu_usage_last;
return 0;
}
if (r < 0)
return r;
- if (ns > u->cpu_usage_base)
- ns -= u->cpu_usage_base;
+ if (ns > crt->cpu_usage_base)
+ ns -= crt->cpu_usage_base;
else
ns = 0;
- u->cpu_usage_last = ns;
+ crt->cpu_usage_last = ns;
if (ret)
*ret = ns;
@@ -4221,9 +4693,13 @@ int unit_get_ip_accounting(
if (!UNIT_CGROUP_BOOL(u, ip_accounting))
return -ENODATA;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -ENODATA;
+
fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
- u->ip_accounting_ingress_map_fd :
- u->ip_accounting_egress_map_fd;
+ crt->ip_accounting_ingress_map_fd :
+ crt->ip_accounting_egress_map_fd;
if (fd < 0)
return -ENODATA;
@@ -4238,11 +4714,62 @@ int unit_get_ip_accounting(
* all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
* ip_accounting_extra[] field, and add them in here transparently. */
- *ret = value + u->ip_accounting_extra[metric];
+ *ret = value + crt->ip_accounting_extra[metric];
return r;
}
+static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) {
+ CGroupContext *cc;
+
+ assert(u);
+ assert(UNIT_HAS_CGROUP_CONTEXT(u));
+
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ switch (type) {
+ case CGROUP_LIMIT_MEMORY_MAX:
+ case CGROUP_LIMIT_MEMORY_HIGH:
+ return physical_memory();
+ case CGROUP_LIMIT_TASKS_MAX:
+ return system_tasks_max();
+ default:
+ assert_not_reached();
+ }
+
+ cc = ASSERT_PTR(unit_get_cgroup_context(u));
+ switch (type) {
+ /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured
+ * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented. */
+ case CGROUP_LIMIT_MEMORY_MAX:
+ return cc->memory_max;
+ case CGROUP_LIMIT_MEMORY_HIGH:
+ return cc->memory_high;
+ case CGROUP_LIMIT_TASKS_MAX:
+ return cgroup_tasks_max_resolve(&cc->tasks_max);
+ default:
+ assert_not_reached();
+ }
+}
+
+int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) {
+ uint64_t infimum;
+
+ assert(u);
+ assert(ret);
+ assert(type >= 0);
+ assert(type < _CGROUP_LIMIT_TYPE_MAX);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ infimum = unit_get_effective_limit_one(u, type);
+ for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice))
+ infimum = MIN(infimum, unit_get_effective_limit_one(slice, type));
+
+ *ret = infimum;
+ return 0;
+}
+
static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
[CGROUP_IO_READ_BYTES] = "rbytes=",
@@ -4257,7 +4784,8 @@ static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_AC
assert(u);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENODATA;
if (unit_has_host_root_cgroup(u))
@@ -4266,13 +4794,13 @@ static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_AC
r = cg_all_unified();
if (r < 0)
return r;
- if (r == 0) /* TODO: support cgroupv1 */
+ if (r == 0)
return -ENODATA;
- if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
+ if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_IO))
return -ENODATA;
- r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
+ r = cg_get_path("io", crt->cgroup_path, "io.stat", &path);
if (r < 0)
return r;
@@ -4340,26 +4868,30 @@ int unit_get_io_accounting(
if (!UNIT_CGROUP_BOOL(u, io_accounting))
return -ENODATA;
- if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -ENODATA;
+
+ if (allow_cache && crt->io_accounting_last[metric] != UINT64_MAX)
goto done;
r = unit_get_io_accounting_raw(u, raw);
- if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
+ if (r == -ENODATA && crt->io_accounting_last[metric] != UINT64_MAX)
goto done;
if (r < 0)
return r;
for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
/* Saturated subtraction */
- if (raw[i] > u->io_accounting_base[i])
- u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
+ if (raw[i] > crt->io_accounting_base[i])
+ crt->io_accounting_last[i] = raw[i] - crt->io_accounting_base[i];
else
- u->io_accounting_last[i] = 0;
+ crt->io_accounting_last[i] = 0;
}
done:
if (ret)
- *ret = u->io_accounting_last[metric];
+ *ret = crt->io_accounting_last[metric];
return 0;
}
@@ -4369,11 +4901,15 @@ int unit_reset_cpu_accounting(Unit *u) {
assert(u);
- u->cpu_usage_last = NSEC_INFINITY;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return 0;
- r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
+ crt->cpu_usage_last = NSEC_INFINITY;
+
+ r = unit_get_cpu_usage_raw(u, &crt->cpu_usage_base);
if (r < 0) {
- u->cpu_usage_base = 0;
+ crt->cpu_usage_base = 0;
return r;
}
@@ -4383,7 +4919,11 @@ int unit_reset_cpu_accounting(Unit *u) {
void unit_reset_memory_accounting_last(Unit *u) {
assert(u);
- FOREACH_ARRAY(i, u->memory_accounting_last, ELEMENTSOF(u->memory_accounting_last))
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return;
+
+ FOREACH_ELEMENT(i, crt->memory_accounting_last)
*i = UINT64_MAX;
}
@@ -4392,13 +4932,17 @@ int unit_reset_ip_accounting(Unit *u) {
assert(u);
- if (u->ip_accounting_ingress_map_fd >= 0)
- RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd));
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return 0;
+
+ if (crt->ip_accounting_ingress_map_fd >= 0)
+ RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_ingress_map_fd));
- if (u->ip_accounting_egress_map_fd >= 0)
- RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd));
+ if (crt->ip_accounting_egress_map_fd >= 0)
+ RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_egress_map_fd));
- zero(u->ip_accounting_extra);
+ zero(crt->ip_accounting_extra);
return r;
}
@@ -4406,7 +4950,11 @@ int unit_reset_ip_accounting(Unit *u) {
void unit_reset_io_accounting_last(Unit *u) {
assert(u);
- FOREACH_ARRAY(i, u->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return;
+
+ FOREACH_ARRAY(i, crt->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX)
*i = UINT64_MAX;
}
@@ -4415,11 +4963,15 @@ int unit_reset_io_accounting(Unit *u) {
assert(u);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return 0;
+
unit_reset_io_accounting_last(u);
- r = unit_get_io_accounting_raw(u, u->io_accounting_base);
+ r = unit_get_io_accounting_raw(u, crt->io_accounting_base);
if (r < 0) {
- zero(u->io_accounting_base);
+ zero(crt->io_accounting_base);
return r;
}
@@ -4445,6 +4997,10 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
if (!UNIT_HAS_CGROUP_CONTEXT(u))
return;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
if (m == 0)
return;
@@ -4455,10 +5011,10 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
- if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
+ if (FLAGS_SET(crt->cgroup_invalidated_mask, m)) /* NOP? */
return;
- u->cgroup_invalidated_mask |= m;
+ crt->cgroup_invalidated_mask |= m;
unit_add_to_cgroup_realize_queue(u);
}
@@ -4468,10 +5024,14 @@ void unit_invalidate_cgroup_bpf(Unit *u) {
if (!UNIT_HAS_CGROUP_CONTEXT(u))
return;
- if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return;
+
+ if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
return;
- u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+ crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
unit_add_to_cgroup_realize_queue(u);
/* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
@@ -4523,66 +5083,102 @@ void manager_invalidate_startup_units(Manager *m) {
unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
}
+static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) {
+ _cleanup_free_ char *val = NULL;
+ FreezerState s;
+ int r;
+ /* Reads the "frozen" key of the unit's "cgroup.events" attribute and reports it in *ret as a
+  * FreezerState. Returns 0 on success, -EOWNERDEAD if the unit has no cgroup runtime or no cgroup
+  * path, -ENODATA if cg_get_keyed_attribute() fails with -ENOENT or -ENXIO, and any other negative
+  * error from that call unchanged. */
+ assert(u);
+ assert(ret);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
+ return -EOWNERDEAD;
+
+ r = cg_get_keyed_attribute(
+ SYSTEMD_CGROUP_CONTROLLER,
+ crt->cgroup_path,
+ "cgroup.events",
+ STRV_MAKE("frozen"),
+ &val);
+ if (IN_SET(r, -ENOENT, -ENXIO))
+ return -ENODATA;
+ if (r < 0)
+ return r;
+ /* "0" maps to FREEZER_RUNNING and "1" to FREEZER_FROZEN. Any other value is logged at debug level and
+  * reported as _FREEZER_STATE_INVALID; the call itself still succeeds in that case. */
+ if (streq(val, "0"))
+ s = FREEZER_RUNNING;
+ else if (streq(val, "1"))
+ s = FREEZER_FROZEN;
+ else {
+ log_unit_debug(u, "Unexpected cgroup frozen state: %s", val);
+ s = _FREEZER_STATE_INVALID;
+ }
+
+ *ret = s;
+ return 0;
+}
+
int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
_cleanup_free_ char *path = NULL;
- FreezerState target, kernel = _FREEZER_STATE_INVALID;
- int r, ret;
+ FreezerState target, current, next;
+ int r;
assert(u);
- assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
+ FREEZER_THAW, FREEZER_PARENT_THAW));
if (!cg_freezer_supported())
return 0;
- /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
- if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
- return action == FREEZER_FREEZE ? -EPERM : 0;
-
- if (!u->cgroup_realized)
- return -EBUSY;
+ unit_next_freezer_state(u, action, &next, &target);
- if (action == FREEZER_THAW) {
- Unit *slice = UNIT_GET_SLICE(u);
-
- if (slice) {
- r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
- if (r < 0)
- return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
- }
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_realized) {
+ /* No realized cgroup = nothing to freeze */
+ u->freezer_state = freezer_state_finish(next);
+ return 0;
}
- target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
-
- r = unit_freezer_state_kernel(u, &kernel);
+ r = unit_cgroup_freezer_kernel_state(u, &current);
if (r < 0)
- log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+ return r;
- if (target == kernel) {
- u->freezer_state = target;
- if (action == FREEZER_FREEZE)
- return 0;
- ret = 0;
- } else
- ret = 1;
+ if (current == target)
+ next = freezer_state_finish(next);
+ else if (IN_SET(next, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT, FREEZER_RUNNING)) {
+ /* We're transitioning into a finished state, which implies that the cgroup's
+ * current state already matches the target and thus we'd return 0. But, reality
+ * shows otherwise. This indicates that our freezer_state tracking has diverged
+ * from the real state of the cgroup, which can happen if someone meddles with the
+ * cgroup from underneath us. This really shouldn't happen during normal operation,
+ * though. So, let's warn about it and fix up the state to be valid */
+
+ log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.",
+ freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)");
+
+ if (next == FREEZER_FROZEN)
+ next = FREEZER_FREEZING;
+ else if (next == FREEZER_FROZEN_BY_PARENT)
+ next = FREEZER_FREEZING_BY_PARENT;
+ else if (next == FREEZER_RUNNING)
+ next = FREEZER_THAWING;
+ }
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path);
if (r < 0)
return r;
- log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
-
- if (target != kernel) {
- if (action == FREEZER_FREEZE)
- u->freezer_state = FREEZER_FREEZING;
- else
- u->freezer_state = FREEZER_THAWING;
- }
+ log_unit_debug(u, "Unit freezer state was %s, now %s.",
+ freezer_state_to_string(u->freezer_state),
+ freezer_state_to_string(next));
- r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
+ r = write_string_file(path, one_zero(target == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
- return ret;
+ u->freezer_state = next;
+ return target != current;
}
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
@@ -4592,10 +5188,11 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
assert(u);
assert(cpus);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENODATA;
- if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
+ if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
return -ENODATA;
r = cg_all_unified();
@@ -4604,7 +5201,7 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
if (r == 0)
return -ENODATA;
- r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
+ r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v);
if (r == -ENOENT)
return -ENODATA;
if (r < 0)
@@ -4613,6 +5210,422 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
}
+CGroupRuntime *cgroup_runtime_new(void) {
+ _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL;
+ /* Allocates a CGroupRuntime object and puts every field into its initial "nothing recorded yet"
+  * state. Returns the new object, or NULL if the allocation fails. */
+ crt = new(CGroupRuntime, 1);
+ if (!crt)
+ return NULL;
+ /* Assigning a compound literal zero-initializes every member that is not listed explicitly, so only
+  * the members whose initial value is not zero/NULL/false are spelled out. */
+ *crt = (CGroupRuntime) {
+ .cpu_usage_last = NSEC_INFINITY,
+
+ .cgroup_control_inotify_wd = -1,
+ .cgroup_memory_inotify_wd = -1,
+
+ .ip_accounting_ingress_map_fd = -EBADF,
+ .ip_accounting_egress_map_fd = -EBADF,
+
+ .ipv4_allow_map_fd = -EBADF,
+ .ipv6_allow_map_fd = -EBADF,
+ .ipv4_deny_map_fd = -EBADF,
+ .ipv6_deny_map_fd = -EBADF,
+ /* A fresh object starts with all controllers marked as invalidated. */
+ .cgroup_invalidated_mask = _CGROUP_MASK_ALL,
+ };
+ /* Array members cannot be covered by the initializer above, so they are filled element by element.
+  * For io_accounting_last[] UINT64_MAX is the "no cached value" marker that unit_get_io_accounting()
+  * checks for. NOTE(review): io_accounting_base[] and ip_accounting_extra[] also start at UINT64_MAX
+  * rather than 0 here; confirm that this is intended. */
+ FOREACH_ELEMENT(i, crt->memory_accounting_last)
+ *i = UINT64_MAX;
+ FOREACH_ELEMENT(i, crt->io_accounting_base)
+ *i = UINT64_MAX;
+ FOREACH_ELEMENT(i, crt->io_accounting_last)
+ *i = UINT64_MAX;
+ FOREACH_ELEMENT(i, crt->ip_accounting_extra)
+ *i = UINT64_MAX;
+
+ return TAKE_PTR(crt);
+}
+
+CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt) {
+ /* Releases every resource owned by crt (fd sets, BPF links and programs, BPF map fds, program sets,
+  * the cgroup path) and then the object itself. A NULL argument is accepted and ignored. Returns
+  * NULL on the early-exit path and otherwise whatever mfree() returns, so that callers can assign
+  * the result back to the pointer they just freed. */
+ if (!crt)
+ return NULL;
+
+ fdset_free(crt->initial_socket_bind_link_fds);
+#if BPF_FRAMEWORK
+ bpf_link_free(crt->ipv4_socket_bind_link);
+ bpf_link_free(crt->ipv6_socket_bind_link);
+#endif
+ hashmap_free(crt->bpf_foreign_by_key);
+
+ bpf_program_free(crt->bpf_device_control_installed);
+
+#if BPF_FRAMEWORK
+ bpf_link_free(crt->restrict_ifaces_ingress_bpf_link);
+ bpf_link_free(crt->restrict_ifaces_egress_bpf_link);
+#endif
+ fdset_free(crt->initial_restrict_ifaces_link_fds);
+
+ /* Close all six BPF map fds the object owns. The two IP accounting maps are initialized to -EBADF
+  * by cgroup_runtime_new() just like the allow/deny maps and must be closed here as well, otherwise
+  * they are leaked once accounting has been set up for the unit. */
+ safe_close(crt->ip_accounting_ingress_map_fd);
+ safe_close(crt->ip_accounting_egress_map_fd);
+
+ safe_close(crt->ipv4_allow_map_fd);
+ safe_close(crt->ipv6_allow_map_fd);
+ safe_close(crt->ipv4_deny_map_fd);
+ safe_close(crt->ipv6_deny_map_fd);
+
+ bpf_program_free(crt->ip_bpf_ingress);
+ bpf_program_free(crt->ip_bpf_ingress_installed);
+ bpf_program_free(crt->ip_bpf_egress);
+ bpf_program_free(crt->ip_bpf_egress_installed);
+
+ set_free(crt->ip_bpf_custom_ingress);
+ set_free(crt->ip_bpf_custom_ingress_installed);
+ set_free(crt->ip_bpf_custom_egress);
+ set_free(crt->ip_bpf_custom_egress_installed);
+
+ free(crt->cgroup_path);
+
+ return mfree(crt);
+}
+
+static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes",
+ [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
+ [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes",
+ [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets",
+};
+/* Serialization keys of the IP accounting counters. The macro below provides the
+ * ip_accounting_metric_field_to_string()/_from_string() helpers that cgroup_runtime_serialize() and
+ * cgroup_runtime_deserialize_one() use. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
+
+static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
+};
+/* Serialization keys of CGroupRuntime.io_accounting_base[], one per IO metric. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
+
+static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
+};
+/* Serialization keys of CGroupRuntime.io_accounting_last[], one per IO metric. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
+
+static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
+ [CGROUP_MEMORY_PEAK] = "memory-accounting-peak",
+ [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
+};
+/* Serialization keys of the memory metrics; the table has the same size as
+ * CGroupRuntime.memory_accounting_last[], i.e. it covers the metrics up to
+ * _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST only. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
+
+static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
+ _cleanup_free_ char *s = NULL;
+ int r;
+ /* Writes 'mask' to f under 'key', in the textual form produced by cg_mask_to_string(). An empty mask
+  * is not written at all. Returns 0 for an empty mask, a negative error (already logged) if the
+  * mask cannot be formatted, and otherwise the result of serialize_item(). */
+ assert(f);
+ assert(key);
+
+ if (mask == 0)
+ return 0;
+
+ r = cg_mask_to_string(mask, &s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to format cgroup mask: %m");
+
+ return serialize_item(f, key, s);
+}
+
+int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+ /* Writes the CGroupRuntime state of unit u to f as a series of key/value items; fds is handed to the
+  * BPF serialization helpers. Does nothing if the unit has no CGroupRuntime object. Failures of the
+  * individual writes are ignored on purpose (all calls are cast to void), hence this always
+  * returns 0. */
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return 0;
+ /* CPU usage: the base is always written, the last value only if it is not NSEC_INFINITY. */
+ (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base);
+ if (crt->cpu_usage_last != NSEC_INFINITY)
+ (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last);
+
+ if (crt->managed_oom_kill_last > 0)
+ (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last);
+
+ if (crt->oom_kill_last > 0)
+ (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last);
+ /* Memory and IP metrics are obtained through the regular accessor functions instead of being read
+  * from crt directly; a metric whose accessor fails is simply left out. */
+ for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
+ uint64_t v;
+
+ r = unit_get_memory_accounting(u, metric, &v);
+ if (r >= 0)
+ (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
+ }
+
+ for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+ uint64_t v;
+
+ r = unit_get_ip_accounting(u, m, &v);
+ if (r >= 0)
+ (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
+ }
+ /* IO accounting: the base is always written, the last value only if one is cached, i.e. it is not
+  * UINT64_MAX. */
+ for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
+ (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]);
+
+ if (crt->io_accounting_last[im] != UINT64_MAX)
+ (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]);
+ }
+ /* Identity of the cgroup; path and ID are only written when set. */
+ if (crt->cgroup_path)
+ (void) serialize_item(f, "cgroup", crt->cgroup_path);
+ if (crt->cgroup_id != 0)
+ (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id);
+
+ (void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized);
+ (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask);
+ /* BPF state: socket-bind links, the installed programs and program sets, and the restrict-ifaces
+  * links are each written by their own helper, which also receives fds. */
+ (void) bpf_socket_bind_serialize(u, f, fds);
+
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed);
+
+ (void) bpf_restrict_ifaces_serialize(u, f, fds);
+
+ return 0;
+}
+/* MATCH_DESERIALIZE(u, key, l, v, parse_func, target): statement expression that evaluates to true if
+ * the serialized key l equals 'key'. On a match it obtains the unit's CGroupRuntime through
+ * unit_setup_cgroup_runtime(), calls parse_func(v) and, if the result is not negative, assigns it to
+ * crt->target. A NULL runtime object and a parse failure are only logged at debug level; the key still
+ * counts as matched in both cases. */
+#define MATCH_DESERIALIZE(u, key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
+ if (!crt) \
+ log_oom_debug(); \
+ else { \
+ int _deserialize_r = parse_func(v); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring.", l, v); \
+ else \
+ crt->target = _deserialize_r; \
+ } \
+ } \
+ _deserialize_matched; \
+ })
+/* Same as MATCH_DESERIALIZE(), but for parsers that take an output pointer: parse_func(v, &crt->target)
+ * stores the result itself, and only its negative return values are treated as errors. */
+#define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
+ if (!crt) \
+ log_oom_debug(); \
+ else { \
+ int _deserialize_r = parse_func(v, &crt->target); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring", l, v); \
+ } \
+ } \
+ _deserialize_matched; \
+ })
+/* NOTE(review): the body of MATCH_DESERIALIZE_METRIC() is textually identical to MATCH_DESERIALIZE(), and
+ * cgroup_runtime_deserialize_one() does not use it. This looks like a leftover; confirm before
+ * removing it. */
+#define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \
+ if (!crt) \
+ log_oom_debug(); \
+ else { \
+ int _deserialize_r = parse_func(v); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring.", l, v); \
+ else \
+ crt->target = _deserialize_r; \
+ } \
+ } \
+ _deserialize_matched; \
+ })
+
+int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) {
+ int r;
+ /* Handles a single serialized key/value item of the kind written by cgroup_runtime_serialize().
+  * Returns 1 if the key is one of ours, even if its value could not be parsed or applied (such
+  * problems are only logged at debug level), and 0 if the key is unknown here or the unit has no
+  * cgroup context. */
+ assert(u);
+ assert(value);
+ /* NOTE(review): key is dereferenced by the string comparisons below but, unlike value, is not
+  * asserted to be non-NULL; presumably the caller guarantees it, confirm. */
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+ /* "cpuacct-usage-base" is accepted as a second spelling for the same field; presumably the key used
+  * by older serializations, confirm. */
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) ||
+ MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last))
+ return 1;
+
+ if (streq(key, "cgroup")) {
+ r = unit_set_cgroup_path(u, value);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value);
+ /* The watches are set up even if setting the path failed; their results are ignored. */
+ (void) unit_watch_cgroup(u);
+ (void) unit_watch_cgroup_memory(u);
+ return 1;
+ }
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id))
+ return 1;
+
+ if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask))
+ return 1;
+
+ if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))
+ return 1;
+ /* File descriptors are looked up through deserialize_fd(). If that fails the item is dropped
+  * without a log message here, but it still counts as handled. */
+ if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
+ int fd;
+
+ fd = deserialize_fd(fds, value);
+ if (fd >= 0)
+ (void) bpf_socket_bind_add_initial_link_fd(u, fd);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(key,
+ "ip-bpf-ingress-installed", "ip-bpf-egress-installed",
+ "bpf-device-control-installed",
+ "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {
+
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ log_oom_debug();
+ else {
+ if (streq(key, "ip-bpf-ingress-installed"))
+ (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);
+
+ if (streq(key, "ip-bpf-egress-installed"))
+ (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);
+
+ if (streq(key, "bpf-device-control-installed"))
+ (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);
+
+ if (streq(key, "ip-bpf-custom-ingress-installed"))
+ (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);
+
+ if (streq(key, "ip-bpf-custom-egress-installed"))
+ (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed);
+ }
+
+ return 1;
+ }
+
+ if (streq(key, "restrict-ifaces-bpf-fd")) {
+ int fd;
+
+ fd = deserialize_fd(fds, value);
+ if (fd >= 0)
+ (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);
+ return 1;
+ }
+ /* The remaining keys are per-metric accounting values. Each key is looked up in one table after the
+  * other; a negative lookup result means it does not belong to that table. In these branches the
+  * runtime object is only set up once the value has been parsed successfully. */
+ CGroupMemoryAccountingMetric mm = memory_accounting_metric_field_last_from_string(key);
+ if (mm >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(value, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value);
+ else {
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ log_oom_debug();
+ else
+ crt->memory_accounting_last[mm] = c;
+ }
+
+ return 1;
+ }
+
+ CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key);
+ if (ipm >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(value, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value);
+ else {
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ log_oom_debug();
+ else
+ crt->ip_accounting_extra[ipm] = c;
+ }
+
+ return 1;
+ }
+
+ CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key);
+ if (iom >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(value, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value);
+ else {
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ log_oom_debug();
+ else
+ crt->io_accounting_base[iom] = c;
+ }
+
+ return 1;
+ }
+
+ iom = io_accounting_metric_field_last_from_string(key);
+ if (iom >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(value, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value);
+ else {
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ log_oom_debug();
+ else
+ crt->io_accounting_last[iom] = c;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
[CGROUP_DEVICE_POLICY_AUTO] = "auto",
[CGROUP_DEVICE_POLICY_CLOSED] = "closed",
@@ -4621,17 +5634,10 @@ static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] =
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
-static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
- [FREEZER_FREEZE] = "freeze",
- [FREEZER_THAW] = "thaw",
-};
-
-DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
-
static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
- [CGROUP_PRESSURE_WATCH_OFF] = "off",
+ [CGROUP_PRESSURE_WATCH_OFF] = "off",
[CGROUP_PRESSURE_WATCH_AUTO] = "auto",
- [CGROUP_PRESSURE_WATCH_ON] = "on",
+ [CGROUP_PRESSURE_WATCH_ON] = "on",
[CGROUP_PRESSURE_WATCH_SKIP] = "skip",
};
@@ -4663,3 +5669,11 @@ static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_AC
};
DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
+
+static const char *const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = {
+ [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax",
+ [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh",
+ [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax",
+};
+/* String names of the CGroupLimitType values, one per type. NOTE(review): presumably these are the
+ * names of the corresponding unit properties; confirm against the D-Bus code. */
+DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType);
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index f1b674b..72fe275 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -3,7 +3,10 @@
#include <stdbool.h>
-#include "bpf-lsm.h"
+#include "sd-event.h"
+
+#include "bpf-program.h"
+#include "bpf-restrict-fs.h"
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "firewall-util.h"
@@ -35,6 +38,7 @@ typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;
+typedef struct CGroupRuntime CGroupRuntime;
typedef enum CGroupDevicePolicy {
/* When devices listed, will allow those, plus built-in ones, if none are listed will allow
@@ -53,7 +57,9 @@ typedef enum CGroupDevicePolicy {
typedef enum FreezerAction {
FREEZER_FREEZE,
+ FREEZER_PARENT_FREEZE,
FREEZER_THAW,
+ FREEZER_PARENT_THAW,
_FREEZER_ACTION_MAX,
_FREEZER_ACTION_INVALID = -EINVAL,
@@ -129,6 +135,9 @@ typedef enum CGroupPressureWatch {
_CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
+/* The user-supplied cgroup-related configuration options. This remains mostly immutable while the service
+ * manager is running (except for an occasional SetProperty() configuration change), outside of reload
+ * cycles. When adding members make sure to update cgroup_context_copy() accordingly. */
struct CGroupContext {
bool cpu_accounting;
bool io_accounting;
@@ -188,6 +197,8 @@ struct CGroupContext {
bool startup_memory_swap_max_set:1;
bool startup_memory_zswap_max_set:1;
+ bool memory_zswap_writeback;
+
Set *ip_address_allow;
Set *ip_address_deny;
/* These two flags indicate that redundant entries have been removed from
@@ -276,6 +287,95 @@ typedef enum CGroupMemoryAccountingMetric {
_CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupMemoryAccountingMetric;
+/* The limit kinds for which unit_get_effective_limit() computes an effective value, i.e. the infimum
+ * (minimum) of the unit's own setting and the settings of all slices above it. */
+typedef enum CGroupLimitType {
+ CGROUP_LIMIT_MEMORY_MAX, /* taken from CGroupContext.memory_max */
+ CGROUP_LIMIT_MEMORY_HIGH, /* taken from CGroupContext.memory_high */
+ CGROUP_LIMIT_TASKS_MAX, /* CGroupContext.tasks_max, resolved with cgroup_tasks_max_resolve() */
+ _CGROUP_LIMIT_TYPE_MAX, /* number of valid types, also the size of the name table */
+ _CGROUP_LIMIT_INVALID = -EINVAL,
+} CGroupLimitType;
+
+/* The dynamic, regularly updated information about a unit that has a realized cgroup. This is only
+ * allocated when a unit is first realized. Objects are created by cgroup_runtime_new() and released by
+ * cgroup_runtime_free(). */
+typedef struct CGroupRuntime {
+ /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
+ nsec_t cpu_usage_base;
+ nsec_t cpu_usage_last; /* the most recently read value */
+
+ /* Most recently read value of memory accounting metrics */
+ uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1];
+
+ /* The current counter of OOM kills initiated by systemd-oomd */
+ uint64_t managed_oom_kill_last;
+
+ /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+ uint64_t oom_kill_last;
+
+ /* Where the io.stat data was at the time the unit was started */
+ uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+ uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */
+
+ /* Counterparts in the cgroup filesystem */
+ char *cgroup_path;
+ uint64_t cgroup_id;
+ CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
+ CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
+ CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */
+ CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
+
+ /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
+ int cgroup_control_inotify_wd; /* initialized to -1 by cgroup_runtime_new() */
+ int cgroup_memory_inotify_wd; /* initialized to -1 by cgroup_runtime_new() */
+
+ /* Device Controller BPF program */
+ BPFProgram *bpf_device_control_installed;
+
+ /* IP BPF Firewalling/accounting */
+ int ip_accounting_ingress_map_fd; /* all map fds below are initialized to -EBADF by cgroup_runtime_new() */
+ int ip_accounting_egress_map_fd;
+ uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
+
+ int ipv4_allow_map_fd;
+ int ipv6_allow_map_fd;
+ int ipv4_deny_map_fd;
+ int ipv6_deny_map_fd;
+ BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+ BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
+
+ Set *ip_bpf_custom_ingress;
+ Set *ip_bpf_custom_ingress_installed;
+ Set *ip_bpf_custom_egress;
+ Set *ip_bpf_custom_egress_installed;
+
+ /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd,
+ * attached to unit cgroup by provided program fd and attach type. */
+ Hashmap *bpf_foreign_by_key;
+
+ FDSet *initial_socket_bind_link_fds;
+#if BPF_FRAMEWORK
+ /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and
+ * responsible for allowing or denying a unit to bind(2) to a socket
+ * address. */
+ struct bpf_link *ipv4_socket_bind_link;
+ struct bpf_link *ipv6_socket_bind_link;
+#endif
+
+ FDSet *initial_restrict_ifaces_link_fds;
+#if BPF_FRAMEWORK
+ struct bpf_link *restrict_ifaces_ingress_bpf_link;
+ struct bpf_link *restrict_ifaces_egress_bpf_link;
+#endif
+
+ bool cgroup_realized:1;
+ bool cgroup_members_mask_valid:1;
+
+ /* Reset cgroup accounting next time we fork something off */
+ bool reset_accounting:1;
+
+ /* Whether we warned about clamping the CPU quota period */
+ bool warned_clamping_cpu_quota_period:1;
+} CGroupRuntime;
+
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;
@@ -285,6 +385,7 @@ uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
void cgroup_context_init(CGroupContext *c);
+int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
@@ -309,6 +410,17 @@ static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
+static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext *c, const CGroupBPFForeignProgram *p) {
+ /* Convenience wrapper: adds an entry with p's attach type and bpffs path to c and returns the result
+  * of cgroup_context_add_bpf_foreign_program(). */
+ return cgroup_context_add_bpf_foreign_program(c, p->attach_type, p->bpffs_path);
+}
+int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l);
+int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w);
+int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l);
+int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w);
+int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b);
+int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a);
+int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i);
+int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i);
void unit_modify_nft_set(Unit *u, bool add);
@@ -336,6 +448,7 @@ int unit_watch_cgroup(Unit *u);
int unit_watch_cgroup_memory(Unit *u);
void unit_add_to_cgroup_realize_queue(Unit *u);
+int unit_cgroup_is_empty(Unit *u);
void unit_release_cgroup(Unit *u);
/* Releases the cgroup only if it is recursively empty.
* Returns true if the cgroup was released, false otherwise. */
@@ -353,9 +466,9 @@ void manager_shutdown_cgroup(Manager *m, bool delete);
unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
-Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
-Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
-Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid);
+Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid);
+Unit* manager_get_unit_by_pidref(Manager *m, const PidRef *pid);
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
uint64_t unit_get_ancestor_memory_min(Unit *u);
@@ -374,6 +487,7 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret);
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
+int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret);
int unit_reset_cpu_accounting(Unit *u);
void unit_reset_memory_accounting_last(Unit *u);
@@ -413,6 +527,13 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
const char* freezer_action_to_string(FreezerAction a) _const_;
FreezerAction freezer_action_from_string(const char *s) _pure_;
+CGroupRuntime *cgroup_runtime_new(void);
+CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt);
+DEFINE_TRIVIAL_CLEANUP_FUNC(CGroupRuntime*, cgroup_runtime_free);
+
+int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds);
+int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds);
+
const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
@@ -425,5 +546,8 @@ CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s)
const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;
+const char* cgroup_effective_limit_type_to_string(CGroupLimitType m) _const_;
+CGroupLimitType cgroup_effective_limit_type_from_string(const char *s) _pure_;
+
const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;
diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c
index cd91381..3e6168d 100644
--- a/src/core/core-varlink.c
+++ b/src/core/core-varlink.c
@@ -69,6 +69,10 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J
if (!c)
return -EINVAL;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt)
+ return -EINVAL;
+
if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
/* systemd-oomd should always treat inactive units as though they didn't enable any action since they
* should not have a valid cgroup */
@@ -83,19 +87,24 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J
return json_build(ret_v, JSON_BUILD_OBJECT(
JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)),
- JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)),
+ JSON_BUILD_PAIR("path", JSON_BUILD_STRING(crt->cgroup_path)),
JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)),
JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit))));
}
int manager_varlink_send_managed_oom_update(Unit *u) {
_cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL;
+ CGroupRuntime *crt;
CGroupContext *c;
int r;
assert(u);
- if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path)
+ if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager)
+ return 0;
+
+ crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return 0;
if (MANAGER_IS_SYSTEM(u->manager)) {
@@ -119,10 +128,10 @@ int manager_varlink_send_managed_oom_update(Unit *u) {
if (r < 0)
return r;
- for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) {
+ FOREACH_ELEMENT(i, managed_oom_mode_properties) {
_cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
- r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e);
+ r = build_managed_oom_json_array_element(u, *i, &e);
if (r < 0)
return r;
@@ -173,16 +182,16 @@ static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) {
if (!c)
continue;
- for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) {
+ FOREACH_ELEMENT(i, managed_oom_mode_properties) {
_cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
/* For the initial varlink call we only care about units that enabled (i.e. mode is not
* set to "auto") oomd properties. */
- if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
- !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
+ if (!(streq(*i, "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
+ !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
continue;
- r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e);
+ r = build_managed_oom_json_array_element(u, *i, &e);
if (r < 0)
return r;
@@ -359,7 +368,7 @@ static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret
JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)),
JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
- }
+}
static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) {
assert(p);
@@ -491,6 +500,43 @@ static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
m->managed_oom_varlink = varlink_unref(link);
}
+static int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to allocate varlink server object: %m");
+
+ varlink_server_set_userdata(s, m);
+
+ r = varlink_server_add_interface_many(
+ s,
+ &vl_interface_io_systemd_UserDatabase,
+ &vl_interface_io_systemd_ManagedOOM);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add interfaces to varlink server: %m");
+
+ r = varlink_server_bind_method_many(
+ s,
+ "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
+ "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+ "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
+ "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink methods: %m");
+
+ r = varlink_server_bind_disconnect(s, vl_disconnect);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink disconnect handler: %m");
+
+ *ret = TAKE_PTR(s);
+ return 0;
+}
+
static int manager_varlink_init_system(Manager *m) {
_cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
int r;
@@ -527,7 +573,7 @@ static int manager_varlink_init_system(Manager *m) {
}
}
- r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+ r = varlink_server_attach_event(s, m->event, EVENT_PRIORITY_IPC);
if (r < 0)
return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
@@ -585,7 +631,7 @@ static int manager_varlink_init_user(Manager *m) {
if (r < 0)
return r;
- r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
+ r = varlink_attach_event(link, m->event, EVENT_PRIORITY_IPC);
if (r < 0)
return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
@@ -597,43 +643,6 @@ static int manager_varlink_init_user(Manager *m) {
return 1;
}
-int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) {
- _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
- int r;
-
- assert(m);
- assert(ret);
-
- r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
- if (r < 0)
- return log_debug_errno(r, "Failed to allocate varlink server object: %m");
-
- varlink_server_set_userdata(s, m);
-
- r = varlink_server_add_interface_many(
- s,
- &vl_interface_io_systemd_UserDatabase,
- &vl_interface_io_systemd_ManagedOOM);
- if (r < 0)
- return log_error_errno(r, "Failed to add interfaces to varlink server: %m");
-
- r = varlink_server_bind_method_many(
- s,
- "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
- "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
- "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
- "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups);
- if (r < 0)
- return log_debug_errno(r, "Failed to register varlink methods: %m");
-
- r = varlink_server_bind_disconnect(s, vl_disconnect);
- if (r < 0)
- return log_debug_errno(r, "Failed to register varlink disconnect handler: %m");
-
- *ret = TAKE_PTR(s);
- return 0;
-}
-
int manager_varlink_init(Manager *m) {
return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m);
}
diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h
index 7f810d1..20507a4 100644
--- a/src/core/core-varlink.h
+++ b/src/core/core-varlink.h
@@ -6,10 +6,6 @@
int manager_varlink_init(Manager *m);
void manager_varlink_done(Manager *m);
-/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events.
- * Used for manager serialize/deserialize. */
-int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s);
-
/* The manager is expected to send an update to systemd-oomd if one of the following occurs:
* - The value of ManagedOOM*= properties change
* - A unit with ManagedOOM*= properties changes unit active state */
diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c
index f5c31b6..4a3fc01 100644
--- a/src/core/crash-handler.c
+++ b/src/core/crash-handler.c
@@ -27,7 +27,13 @@ _noreturn_ void freeze_or_exit_or_reboot(void) {
_exit(EXIT_EXCEPTION);
}
- if (arg_crash_reboot) {
+ if (arg_crash_action == CRASH_POWEROFF) {
+ log_notice("Shutting down...");
+ (void) reboot(RB_POWER_OFF);
+ log_struct_errno(LOG_EMERG, errno,
+ LOG_MESSAGE("Failed to power off: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_FAILED_STR);
+ } else if (arg_crash_action == CRASH_REBOOT) {
log_notice("Rebooting in 10s...");
(void) sleep(10);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index 8a9570f..49e84b4 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -487,6 +487,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("StartupMemorySwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_swap_max), 0),
SD_BUS_PROPERTY("MemoryZSwapMax", "t", NULL, offsetof(CGroupContext, memory_zswap_max), 0),
SD_BUS_PROPERTY("StartupMemoryZSwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_zswap_max), 0),
+ SD_BUS_PROPERTY("MemoryZSwapWriteback", "b", bus_property_get_bool, offsetof(CGroupContext, memory_zswap_writeback), 0),
SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
@@ -1279,6 +1280,9 @@ int bus_cgroup_set_property(
if (streq(name, "MemoryLimitScale"))
return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error);
+ if (streq(name, "MemoryZSwapWriteback"))
+ return bus_cgroup_set_boolean(u, name, &c->memory_zswap_writeback, CGROUP_MASK_MEMORY, message, flags, error);
+
if (streq(name, "TasksAccounting"))
return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error);
@@ -1300,17 +1304,18 @@ int bus_cgroup_set_property(
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->cpu_quota_per_sec_usec = u64;
- u->warned_clamping_cpu_quota_period = false;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt)
+ crt->warned_clamping_cpu_quota_period = false;
unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
if (c->cpu_quota_per_sec_usec == USEC_INFINITY)
unit_write_setting(u, flags, "CPUQuota", "CPUQuota=");
else
- /* config_parse_cpu_quota() requires an integer, so truncating division is used on
- * purpose here. */
unit_write_settingf(u, flags, "CPUQuota",
- "CPUQuota=%0.f%%",
- (double) (c->cpu_quota_per_sec_usec / 10000));
+ "CPUQuota=" USEC_FMT ".%02" PRI_USEC "%%",
+ c->cpu_quota_per_sec_usec / 10000,
+ (c->cpu_quota_per_sec_usec % 10000) / 100);
}
return 1;
@@ -1324,7 +1329,9 @@ int bus_cgroup_set_property(
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->cpu_quota_period_usec = u64;
- u->warned_clamping_cpu_quota_period = false;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt)
+ crt->warned_clamping_cpu_quota_period = false;
unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
if (c->cpu_quota_period_usec == USEC_INFINITY)
unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec=");
@@ -2188,7 +2195,7 @@ int bus_cgroup_set_property(
c->restrict_network_interfaces_is_allow_list = is_allow_list;
STRV_FOREACH(s, l) {
- if (!ifname_valid(*s)) {
+ if (!ifname_valid_full(*s, IFNAME_VALID_ALTERNATIVE)) {
log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s);
continue;
}
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 2d05ba7..21c260b 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -67,6 +67,7 @@ static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_policy, "i", ExecContext,
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_priority, "i", ExecContext, exec_context_get_cpu_sched_priority);
static BUS_DEFINE_PROPERTY_GET(property_get_coredump_filter, "t", ExecContext, exec_context_get_coredump_filter);
static BUS_DEFINE_PROPERTY_GET(property_get_timer_slack_nsec, "t", ExecContext, exec_context_get_timer_slack_nsec);
+static BUS_DEFINE_PROPERTY_GET(property_get_set_login_environment, "b", ExecContext, exec_context_get_set_login_environment);
static int property_get_environment_files(
sd_bus *bus,
@@ -1038,7 +1039,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST),
- SD_BUS_PROPERTY("SetLoginEnvironment", "b", bus_property_get_tristate, offsetof(ExecContext, set_login_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetLoginEnvironment", "b", property_get_set_login_environment, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1305,18 +1306,24 @@ int bus_set_transient_exec_command(
sd_bus_message *message,
UnitWriteFlags flags,
sd_bus_error *error) {
- bool is_ex_prop = endswith(name, "Ex");
- unsigned n = 0;
+
+ const char *ex_prop = endswith(ASSERT_PTR(name), "Ex");
+ size_t n = 0;
int r;
+ assert(u);
+ assert(exec_command);
+ assert(message);
+ assert(error);
+
/* Drop Ex from the written setting. E.g. ExecStart=, not ExecStartEx=. */
- const char *written_name = is_ex_prop ? strndupa(name, strlen(name) - 2) : name;
+ const char *written_name = ex_prop ? strndupa_safe(name, ex_prop - name) : name;
- r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+ r = sd_bus_message_enter_container(message, 'a', ex_prop ? "(sasas)" : "(sasb)");
if (r < 0)
return r;
- while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) {
+ while ((r = sd_bus_message_enter_container(message, 'r', ex_prop ? "sasas" : "sasb")) > 0) {
_cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL;
const char *path;
int b;
@@ -1338,7 +1345,7 @@ int bus_set_transient_exec_command(
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"\"%s\" argv cannot be empty", name);
- r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b);
+ r = ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b);
if (r < 0)
return r;
@@ -1347,29 +1354,28 @@ int bus_set_transient_exec_command(
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
- ExecCommand *c;
+ _cleanup_(exec_command_freep) ExecCommand *c = NULL;
- c = new0(ExecCommand, 1);
+ c = new(ExecCommand, 1);
if (!c)
return -ENOMEM;
- c->path = strdup(path);
- if (!c->path) {
- free(c);
- return -ENOMEM;
- }
+ *c = (ExecCommand) {
+ .argv = TAKE_PTR(argv),
+ };
- c->argv = TAKE_PTR(argv);
+ r = path_simplify_alloc(path, &c->path);
+ if (r < 0)
+ return r;
- if (is_ex_prop) {
+ if (ex_prop) {
r = exec_command_flags_from_strv(ex_opts, &c->flags);
if (r < 0)
return r;
- } else
- c->flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0;
+ } else if (b)
+ c->flags |= EXEC_COMMAND_IGNORE_FAILURE;
- path_simplify(c->path);
- exec_command_append_list(exec_command, c);
+ exec_command_append_list(exec_command, TAKE_PTR(c));
}
n++;
@@ -1738,6 +1744,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "PrivateMounts"))
return bus_set_transient_tristate(u, name, &c->private_mounts, message, flags, error);
+ if (streq(name, "MountAPIVFS"))
+ return bus_set_transient_tristate(u, name, &c->mount_apivfs, message, flags, error);
+
if (streq(name, "PrivateNetwork"))
return bus_set_transient_bool(u, name, &c->private_network, message, flags, error);
@@ -1897,7 +1906,7 @@ int bus_exec_context_set_transient_property(
c->restrict_filesystems_allow_list = allow_list;
STRV_FOREACH(s, l) {
- r = lsm_bpf_parse_filesystem(
+ r = bpf_restrict_fs_parse_filesystem(
*s,
&c->restrict_filesystems,
FILESYSTEM_PARSE_LOG|
@@ -1948,7 +1957,7 @@ int bus_exec_context_set_transient_property(
r = strv_extend_strv(&c->supplementary_groups, l, true);
if (r < 0)
- return -ENOMEM;
+ return r;
joined = strv_join(c->supplementary_groups, " ");
if (!joined)
@@ -2705,51 +2714,51 @@ int bus_exec_context_set_transient_property(
return 1;
- } else if (streq(name, "MountAPIVFS")) {
- bool b;
-
- r = bus_set_transient_bool(u, name, &b, message, flags, error);
- if (r < 0)
- return r;
-
- if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
- c->mount_apivfs = b;
- c->mount_apivfs_set = true;
- }
-
- return 1;
-
} else if (streq(name, "WorkingDirectory")) {
+ _cleanup_free_ char *simplified = NULL;
+ bool missing_ok = false, is_home = false;
const char *s;
- bool missing_ok;
r = sd_bus_message_read(message, "s", &s);
if (r < 0)
return r;
- if (s[0] == '-') {
- missing_ok = true;
- s++;
- } else
- missing_ok = false;
+ if (!isempty(s)) {
+ if (s[0] == '-') {
+ missing_ok = true;
+ s++;
+ }
- if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s))
- return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'");
+ if (streq(s, "~"))
+ is_home = true;
+ else {
+ if (!path_is_absolute(s))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS,
+ "WorkingDirectory= expects an absolute path or '~'");
- if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
- if (streq(s, "~")) {
- c->working_directory = mfree(c->working_directory);
- c->working_directory_home = true;
- } else {
- r = free_and_strdup(&c->working_directory, empty_to_null(s));
+ r = path_simplify_alloc(s, &simplified);
if (r < 0)
return r;
- c->working_directory_home = false;
+ if (!path_is_normalized(simplified))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS,
+ "WorkingDirectory= expects a normalized path or '~'");
+
+ if (path_below_api_vfs(simplified))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS,
+ "WorkingDirectory= may not be below /proc/, /sys/ or /dev/");
}
+ }
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ free_and_replace(c->working_directory, simplified);
+ c->working_directory_home = is_home;
c->working_directory_missing_ok = missing_ok;
- unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s);
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "WorkingDirectory=%s%s",
+ c->working_directory_missing_ok ? "-" : "",
+ c->working_directory_home ? "~" : strempty(c->working_directory));
}
return 1;
@@ -3173,7 +3182,7 @@ int bus_exec_context_set_transient_property(
r = strv_extend_strv(dirs, l, true);
if (r < 0)
- return -ENOMEM;
+ return r;
unit_write_settingf(u, flags, name, "%s=%s", name, joined);
}
@@ -3200,7 +3209,7 @@ int bus_exec_context_set_transient_property(
_cleanup_free_ char *joined = NULL;
r = strv_extend_strv(&c->exec_search_path, l, true);
if (r < 0)
- return -ENOMEM;
+ return r;
joined = strv_join(c->exec_search_path, ":");
if (!joined)
return log_oom();
diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h
index 5926bdb..4b7cb86 100644
--- a/src/core/dbus-execute.h
+++ b/src/core/dbus-execute.h
@@ -9,6 +9,7 @@
#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoffTimestamp", (offset) + offsetof(ExecStatus, handoff_timestamp), flags), \
SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c
index c88d8c2..693efbb 100644
--- a/src/core/dbus-job.c
+++ b/src/core/dbus-job.c
@@ -54,7 +54,7 @@ int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error
if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) {
/* And for everybody else consult polkit */
- r = bus_verify_manage_units_async(j->unit->manager, message, error);
+ r = bus_verify_manage_units_async(j->manager, message, error);
if (r < 0)
return r;
if (r == 0)
@@ -87,22 +87,23 @@ int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_
if (r < 0)
return r;
- for (int i = 0; i < n; i ++) {
+ FOREACH_ARRAY(i, list, n) {
_cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+ Job *job = *i;
- job_path = job_dbus_path(list[i]);
+ job_path = job_dbus_path(job);
if (!job_path)
return -ENOMEM;
- unit_path = unit_dbus_path(list[i]->unit);
+ unit_path = unit_dbus_path(job->unit);
if (!unit_path)
return -ENOMEM;
r = sd_bus_message_append(reply, "(usssoo)",
- list[i]->id,
- list[i]->unit->id,
- job_type_to_string(list[i]->type),
- job_state_to_string(list[i]->state),
+ job->id,
+ job->unit->id,
+ job_type_to_string(job->type),
+ job_state_to_string(job->state),
job_path,
unit_path);
if (r < 0)
@@ -262,7 +263,7 @@ void bus_job_send_pending_change_signal(Job *j, bool including_new) {
if (!j->sent_dbus_new_signal && !including_new)
return;
- if (MANAGER_IS_RELOADING(j->unit->manager))
+ if (MANAGER_IS_RELOADING(j->manager))
return;
bus_job_send_change_signal(j);
@@ -331,12 +332,12 @@ static int bus_job_allocate_bus_track(Job *j) {
if (j->bus_track)
return 0;
- return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j);
+ return sd_bus_track_new(j->manager->api_bus, &j->bus_track, bus_job_track_handler, j);
}
int bus_job_coldplug_bus_track(Job *j) {
- int r;
_cleanup_strv_free_ char **deserialized_clients = NULL;
+ int r;
assert(j);
@@ -361,7 +362,7 @@ int bus_job_track_sender(Job *j, sd_bus_message *m) {
assert(j);
assert(m);
- if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) {
+ if (sd_bus_message_get_bus(m) != j->manager->api_bus) {
j->ref_by_private_bus = true;
return 0;
}
diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c
index 745f5cc..2515f54 100644
--- a/src/core/dbus-manager.c
+++ b/src/core/dbus-manager.c
@@ -11,6 +11,7 @@
#include "bus-common-errors.h"
#include "bus-get-properties.h"
#include "bus-log-control-api.h"
+#include "bus-util.h"
#include "chase.h"
#include "confidential-virt.h"
#include "data-fd-util.h"
@@ -39,6 +40,7 @@
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
+#include "taint.h"
#include "user-util.h"
#include "version.h"
#include "virt.h"
@@ -125,13 +127,10 @@ static int property_get_tainted(
void *userdata,
sd_bus_error *error) {
- _cleanup_free_ char *s = NULL;
- Manager *m = ASSERT_PTR(userdata);
-
assert(bus);
assert(reply);
- s = manager_taint_string(m);
+ _cleanup_free_ char *s = taint_string();
if (!s)
return log_oom();
@@ -464,18 +463,13 @@ static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char
* its sleeve: if the name is specified empty we use the client's unit. */
if (isempty(name)) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
- pid_t pid;
-
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
- if (r < 0)
- return r;
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- r = sd_bus_creds_get_pid(creds, &pid);
+ r = bus_query_sender_pidref(message, &pidref);
if (r < 0)
return r;
- u = manager_get_unit_by_pid(m, pid);
+ u = manager_get_unit_by_pidref(m, &pidref);
if (!u)
return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
} else {
@@ -542,7 +536,7 @@ static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error
static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
Manager *m = ASSERT_PTR(userdata);
- pid_t pid;
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
Unit *u;
int r;
@@ -552,27 +546,20 @@ static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bu
/* Anyone can call this method */
- r = sd_bus_message_read(message, "u", &pid);
+ r = sd_bus_message_read(message, "u", &pidref.pid);
if (r < 0)
return r;
- if (pid < 0)
- return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid);
-
- if (pid == 0) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
-
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
- if (r < 0)
- return r;
-
- r = sd_bus_creds_get_pid(creds, &pid);
+ if (pidref.pid < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pidref.pid);
+ if (pidref.pid == 0) {
+ r = bus_query_sender_pidref(message, &pidref);
if (r < 0)
return r;
}
- u = manager_get_unit_by_pid(m, pid);
+ u = manager_get_unit_by_pidref(m, &pidref);
if (!u)
- return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid);
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pidref.pid);
return reply_unit_path(u, message, error);
}
@@ -581,41 +568,27 @@ static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userd
_cleanup_free_ char *path = NULL;
Manager *m = ASSERT_PTR(userdata);
sd_id128_t id;
- const void *a;
Unit *u;
- size_t sz;
int r;
assert(message);
/* Anyone can call this method */
- r = sd_bus_message_read_array(message, 'y', &a, &sz);
- if (r < 0)
- return r;
- if (sz == 0)
- id = SD_ID128_NULL;
- else if (sz == 16)
- memcpy(&id, a, sz);
- else
+ if (bus_message_read_id128(message, &id) < 0)
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID");
if (sd_id128_is_null(id)) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
- pid_t pid;
-
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
- if (r < 0)
- return r;
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- r = sd_bus_creds_get_pid(creds, &pid);
+ r = bus_query_sender_pidref(message, &pidref);
if (r < 0)
return r;
- u = manager_get_unit_by_pid(m, pid);
+ u = manager_get_unit_by_pidref(m, &pidref);
if (!u)
return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
- "Client " PID_FMT " not member of any unit.", pid);
+ "Client " PID_FMT " not member of any unit.", pidref.pid);
} else {
u = hashmap_get(m->units_by_invocation_id, &id);
if (!u)
@@ -797,6 +770,7 @@ static int method_generic_unit_operation(
assert(message);
assert(m);
+ assert(handler);
/* Read the first argument from the command and pass the operation to the specified per-unit
* method. */
@@ -860,11 +834,13 @@ static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_err
}
static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
- return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0);
+ /* Only active units can be frozen, which must be properly loaded already */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, GENERIC_UNIT_VALIDATE_LOADED);
}
static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
- return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0);
+ /* Same as freeze above */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, GENERIC_UNIT_VALIDATE_LOADED);
}
static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
@@ -972,9 +948,10 @@ static int method_list_units_by_names(sd_bus_message *message, void *userdata, s
}
static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
- /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the
- * unit being loaded (because even improperly loaded units might still have processes around */
- return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0);
+ /* Don't load a unit actively (since it won't have any processes if it's not loaded), but don't
+ * insist on the unit being loaded either (because even improperly loaded units might still have
+ * processes around). */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, /* flags = */ 0);
}
static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
@@ -1430,11 +1407,11 @@ static int dump_impl(
* operations, and can cause PID1 to stall. So it seems similar enough in terms of security
* considerations and impact, and thus use the same access check for dumps which, given the
* large amount of data to fetch, can stall PID1 for quite some time. */
- r = mac_selinux_access_check(message, "reload", error);
+ r = mac_selinux_access_check(message, "reload", /* error = */ NULL);
if (r < 0)
goto ratelimited;
- r = bus_verify_bypass_dump_ratelimit_async(m, message, error);
+ r = bus_verify_bypass_dump_ratelimit_async(m, message, /* error = */ NULL);
if (r < 0)
goto ratelimited;
if (r == 0)
@@ -1469,7 +1446,7 @@ static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *er
static int reply_dump_by_fd(sd_bus_message *message, char *dump) {
_cleanup_close_ int fd = -EBADF;
- fd = acquire_data_fd(dump, strlen(dump), 0);
+ fd = acquire_data_fd(dump);
if (fd < 0)
return fd;
@@ -1621,10 +1598,10 @@ static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
/* Write a log message noting the unit or process who requested the Reload() */
- log_caller(message, m, "Reloading");
+ log_caller(message, m, "Reload");
/* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */
- if (!ratelimit_below(&m->reload_ratelimit)) {
+ if (!ratelimit_below(&m->reload_reexec_ratelimit)) {
log_warning("Reloading request rejected due to rate limit.");
return sd_bus_error_setf(error,
SD_BUS_ERROR_LIMITS_EXCEEDED,
@@ -1667,7 +1644,15 @@ static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_erro
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
/* Write a log message noting the unit or process who requested the Reexecute() */
- log_caller(message, m, "Reexecuting");
+ log_caller(message, m, "Reexecution");
+
+ /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */
+ if (!ratelimit_below(&m->reload_reexec_ratelimit)) {
+ log_warning("Reexecution request rejected due to rate limit.");
+ return sd_bus_error_setf(error,
+ SD_BUS_ERROR_LIMITS_EXCEEDED,
+ "Reexecute() request rejected due to rate limit.");
+ }
/* We don't send a reply back here, the client should
* just wait for us disconnecting. */
@@ -2329,85 +2314,53 @@ static int send_unit_files_changed(sd_bus *bus, void *userdata) {
return sd_bus_send(bus, message, NULL);
}
-/* Create an error reply, using the error information from changes[]
- * if possible, and fall back to generating an error from error code c.
- * The error message only describes the first error.
- */
+static void manager_unit_files_changed(Manager *m, const InstallChange *changes, size_t n_changes) {
+ int r;
+ /* Marks the manager's unit file state as outdated and runs send_unit_files_changed() through
+ * bus_foreach_bus(), but only if install_changes_have_modification() reports a modification in
+ * changes[]. An empty change list (changes == NULL with n_changes == 0) is accepted. */
+ assert(m);
+ assert(changes || n_changes == 0);
+ /* Nothing to record or announce when the change list carries no modification. */
+ if (!install_changes_have_modification(changes, n_changes))
+ return;
+
+ /* See comments for this variable in manager.h */
+ m->unit_file_state_outdated = true;
+ /* Best effort: a failure to send the signal is only logged at debug level, never returned. */
+ r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send UnitFilesChanged signal, ignoring: %m");
+}
+
static int install_error(
sd_bus_error *error,
int c,
InstallChange *changes,
size_t n_changes) {
- CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+ int r;
- for (size_t i = 0; i < n_changes; i++)
+ /* Create an error reply, using the error information from changes[] if possible, and fall back to
+ * generating an error from error code c. The error message only describes the first error. */
- /* When making changes here, make sure to also change install_changes_dump() in install.c. */
+ assert(changes || n_changes == 0);
- switch (changes[i].type) {
- case 0 ... _INSTALL_CHANGE_TYPE_MAX: /* not errors */
- break;
+ CLEANUP_ARRAY(changes, n_changes, install_changes_free);
- case -EEXIST:
- if (changes[i].source)
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
- "File %s already exists and is a symlink to %s.",
- changes[i].path, changes[i].source);
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
- "File %s already exists.",
- changes[i].path);
-
- case -ERFKILL:
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED,
- "Unit file %s is masked.", changes[i].path);
-
- case -EADDRNOTAVAIL:
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED,
- "Unit %s is transient or generated.", changes[i].path);
-
- case -ETXTBSY:
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH,
- "File %s is under the systemd unit hierarchy already.", changes[i].path);
-
- case -EBADSLT:
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "Invalid specifier in %s.", changes[i].path);
-
- case -EIDRM:
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "Destination unit %s is a non-template unit.", changes[i].path);
-
- case -EUCLEAN:
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "\"%s\" is not a valid unit name.",
- changes[i].path);
-
- case -ELOOP:
- return sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED,
- "Refusing to operate on alias name or linked unit file: %s",
- changes[i].path);
-
- case -EXDEV:
- if (changes[i].source)
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "Cannot alias %s as %s.",
- changes[i].source, changes[i].path);
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "Invalid unit reference %s.", changes[i].path);
-
- case -ENOENT:
- return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
- "Unit file %s does not exist.", changes[i].path);
+ FOREACH_ARRAY(i, changes, n_changes) {
+ _cleanup_free_ char *err_message = NULL;
+ const char *bus_error;
- case -EUNATCH:
- return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
- "Cannot resolve specifiers in %s.", changes[i].path);
+ if (i->type >= 0)
+ continue;
- default:
- assert(changes[i].type < 0); /* other errors */
- return sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path);
- }
+ r = install_change_dump_error(i, &err_message, &bus_error);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "File %s: %m", i->path);
+
+ return sd_bus_error_set(error, bus_error, err_message);
+ }
return c < 0 ? c : -EINVAL;
}
@@ -2426,12 +2379,6 @@ static int reply_install_changes_and_free(
CLEANUP_ARRAY(changes, n_changes, install_changes_free);
- if (install_changes_have_modification(changes, n_changes)) {
- r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL);
- if (r < 0)
- log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m");
- }
-
r = sd_bus_message_new_method_return(message, &reply);
if (r < 0)
return r;
@@ -2446,18 +2393,17 @@ static int reply_install_changes_and_free(
if (r < 0)
return r;
- for (size_t i = 0; i < n_changes; i++) {
-
- if (changes[i].type < 0) {
+ FOREACH_ARRAY(i, changes, n_changes) {
+ if (i->type < 0) {
bad = true;
continue;
}
r = sd_bus_message_append(
reply, "(sss)",
- install_change_type_to_string(changes[i].type),
- changes[i].path,
- changes[i].source);
+ install_change_type_to_string(i->type),
+ i->path,
+ i->source);
if (r < 0)
return r;
@@ -2521,7 +2467,7 @@ static int method_enable_unit_files_generic(
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2594,7 +2540,7 @@ static int method_preset_unit_files_with_mode(sd_bus_message *message, void *use
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = unit_file_preset(m->runtime_scope, flags, NULL, l, preset_mode, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2648,7 +2594,7 @@ static int method_disable_unit_files_generic(
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2691,7 +2637,7 @@ static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = unit_file_revert(m->runtime_scope, NULL, l, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2722,6 +2668,7 @@ static int method_set_default_target(sd_bus_message *message, void *userdata, sd
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = unit_file_set_default(m->runtime_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes);
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2764,7 +2711,7 @@ static int method_preset_all_unit_files(sd_bus_message *message, void *userdata,
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
r = unit_file_preset_all(m->runtime_scope, flags, NULL, preset_mode, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2804,7 +2751,7 @@ static int method_add_dependency_unit_files(sd_bus_message *message, void *userd
return -EINVAL;
r = unit_file_add_dependency(m->runtime_scope, flags, NULL, l, target, dep, &changes, &n_changes);
- m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ manager_unit_files_changed(m, changes, n_changes);
if (r < 0)
return install_error(error, r, changes, n_changes);
@@ -2933,6 +2880,175 @@ static int method_dump_unit_descriptor_store(sd_bus_message *message, void *user
return method_generic_unit_operation(message, userdata, error, bus_service_method_dump_file_descriptor_store, 0);
}
+static int aux_scope_from_message(Manager *m, sd_bus_message *message, Unit **ret_scope, sd_bus_error *error) {
+ _cleanup_(pidref_done) PidRef sender_pidref = PIDREF_NULL;
+ _cleanup_free_ PidRef *pidrefs = NULL;
+ const char *name;
+ Unit *from, *scope;
+ PidRef *main_pid;
+ CGroupContext *cc;
+ size_t n_pids = 0;
+ uint64_t flags;
+ int r;
+ /* Builds a new scope unit for the calling client from a bus message. The message is read in this
+ * order: the scope name ("s"), an array of pidfds ("ah"), a flags word ("t"); what remains is handed
+ * to bus_unit_set_properties(). On success the scope is stored in *ret_scope and 1 is returned; on
+ * failure a negative errno-style value is returned, with 'error' filled in on the paths that set it.
+ *
+ * NOTE(review): pidrefs is released with plain free() only, whereas the single PidRef objects in
+ * this function are cleaned up with pidref_done(). If the array entries own pidfds they would
+ * presumably stay open once the array is freed — confirm against pidref_set_pidfd()/TAKE_PIDREF(). */
+ assert(ret_scope);
+ /* Find the unit the calling process belongs to; the new scope is derived from that unit. */
+ r = bus_query_sender_pidref(message, &sender_pidref);
+ if (r < 0)
+ return r;
+
+ from = manager_get_unit_by_pidref(m, &sender_pidref);
+ if (!from)
+ return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
+
+ if (!IN_SET(from->type, UNIT_SERVICE, UNIT_SCOPE))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Starting auxiliary scope is supported only for service and scope units, refusing.");
+
+ if (!unit_name_is_valid(from->id, UNIT_NAME_PLAIN))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Auxiliary scope can be started only for non-template service units and scope units, refusing.");
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ if (!unit_name_is_valid(name, UNIT_NAME_PLAIN))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid name \"%s\" for auxiliary scope.", name);
+
+ if (unit_name_to_type(name) != UNIT_SCOPE)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Name \"%s\" of auxiliary scope doesn't have .scope suffix.", name);
+
+ main_pid = unit_main_pid(from);
+ /* Walk the pidfd array. Entries failing one of the checks below are logged and skipped. */
+ r = sd_bus_message_enter_container(message, 'a', "h");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(pidref_done) PidRef p = PIDREF_NULL;
+ Unit *unit;
+ int fd;
+
+ r = sd_bus_message_read(message, "h", &fd);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = pidref_set_pidfd(&p, fd);
+ if (r < 0) {
+ log_unit_warning_errno(from, r, "Failed to create process reference from PIDFD, ignoring: %m");
+ continue;
+ }
+
+ unit = manager_get_unit_by_pidref(m, &p);
+ if (!unit) {
+ log_unit_warning(from, "Failed to get unit from PIDFD, ignoring.");
+ continue;
+ }
+
+ if (!streq(unit->id, from->id)) {
+ log_unit_warning(from, "PID " PID_FMT " is not running in the same service as the calling process, ignoring.", p.pid);
+ continue;
+ }
+
+ if (pidref_equal(main_pid, &p)) {
+ log_unit_warning(from, "Main PID cannot be migrated into auxiliary scope, ignoring.");
+ continue;
+ }
+ /* Accepted: append the reference to pidrefs[]. */
+ if (!GREEDY_REALLOC(pidrefs, n_pids+1))
+ return -ENOMEM;
+
+ pidrefs[n_pids++] = TAKE_PIDREF(p);
+ }
+ /* Rejected entries were only skipped above, so the list may have ended up empty. */
+ if (n_pids == 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "No processes can be migrated to auxiliary scope.");
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "t", &flags);
+ if (r < 0)
+ return r;
+
+ if (flags != 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero.");
+ /* Load the named unit and insist that it is still pristine before configuring it. */
+ r = manager_load_unit(m, name, NULL, error, &scope);
+ if (r < 0)
+ return r;
+
+ if (!unit_is_pristine(scope))
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "Unit %s was already loaded or has a fragment file.", name);
+ /* Place the scope in the caller's slice and start from a copy of the caller's cgroup context. */
+ r = unit_set_slice(scope, UNIT_GET_SLICE(from));
+ if (r < 0)
+ return r;
+
+ cc = unit_get_cgroup_context(scope);
+
+ r = cgroup_context_copy(cc, unit_get_cgroup_context(from));
+ if (r < 0)
+ return r;
+
+ r = unit_make_transient(scope);
+ if (r < 0)
+ return r;
+
+ r = bus_unit_set_properties(scope, message, UNIT_RUNTIME, true, error);
+ if (r < 0)
+ return r;
+ /* Have the scope watch every accepted process; -EEXIST from unit_watch_pidref() is tolerated. */
+ FOREACH_ARRAY(p, pidrefs, n_pids) {
+ r = unit_pid_attachable(scope, p, error);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pidref(scope, p, /* exclusive= */ false);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ /* Now load the missing bits of the unit we just created */
+ unit_add_to_load_queue(scope);
+ manager_dispatch_load_queue(m);
+
+ *ret_scope = TAKE_PTR(scope);
+
+ return 1;
+}
+
+static int method_start_aux_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u = NULL; /* avoid false maybe-uninitialized warning */
+ int r;
+ /* Bus method handler registered as "StartAuxiliaryScope" in bus_manager_vtable: performs the access
+ * checks, creates the scope via aux_scope_from_message() and queues a JOB_START job in JOB_REPLACE
+ * mode for it. Negative return values from any step are passed through unchanged. */
+ assert(message);
+ /* Access checks first: the SELinux "start" check, then the manage-units verification. */
+ r = mac_selinux_access_check(message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+ /* Authorized: build the scope unit described by the message. */
+ r = aux_scope_from_message(m, message, &u, error);
+ if (r < 0)
+ return r;
+
+ return bus_unit_queue_job(message, u, JOB_START, JOB_REPLACE, 0, error);
+}
+
const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_VTABLE_START(0),
@@ -2948,6 +3064,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ShutdownStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SHUTDOWN_START]), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -3045,6 +3162,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SoftRebootsCount", "u", bus_property_get_unsigned, offsetof(Manager, soft_reboots_count), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_METHOD_WITH_ARGS("GetUnit",
SD_BUS_ARGS("s", name),
@@ -3491,6 +3609,11 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_RESULT("a(suuutuusu)", entries),
method_dump_unit_descriptor_store,
SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartAuxiliaryScope",
+ SD_BUS_ARGS("s", name, "ah", pidfds, "t", flags, "a(sv)", properties),
+ SD_BUS_RESULT("o", job),
+ method_start_aux_scope,
+ SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_SIGNAL_WITH_ARGS("UnitNew",
SD_BUS_ARGS("s", id, "o", unit),
diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c
index 7dbbdd0..f6a9ea9 100644
--- a/src/core/dbus-mount.c
+++ b/src/core/dbus-mount.c
@@ -6,6 +6,7 @@
#include "dbus-kill.h"
#include "dbus-mount.h"
#include "dbus-util.h"
+#include "fstab-util.h"
#include "mount.h"
#include "string-util.h"
#include "unit.h"
@@ -62,7 +63,7 @@ const sd_bus_vtable bus_mount_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
- SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
@@ -88,6 +89,7 @@ static int bus_mount_set_transient_property(
sd_bus_error *error) {
Unit *u = UNIT(m);
+ int r;
assert(m);
assert(name);
@@ -98,8 +100,31 @@ static int bus_mount_set_transient_property(
if (streq(name, "Where"))
return bus_set_transient_path(u, name, &m->where, message, flags, error);
- if (streq(name, "What"))
- return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error);
+ if (streq(name, "What")) {
+ _cleanup_free_ char *path = NULL;
+ const char *v;
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ if (!isempty(v)) {
+ path = fstab_node_to_udev_node(v);
+ if (!path)
+ return -ENOMEM;
+
+ /* path_is_valid is not used - see the comment for config_parse_mount_node */
+ if (strlen(path) >= PATH_MAX)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Resolved What=%s too long", path);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ free_and_replace(m->parameters_fragment.what, path);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "What=%s", strempty(m->parameters_fragment.what));
+ }
+
+ return 1;
+ }
if (streq(name, "Options"))
return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error);
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
index 78196a1..165aa65 100644
--- a/src/core/dbus-scope.c
+++ b/src/core/dbus-scope.c
@@ -3,6 +3,7 @@
#include "alloc-util.h"
#include "bus-common-errors.h"
#include "bus-get-properties.h"
+#include "bus-util.h"
#include "dbus-cgroup.h"
#include "dbus-kill.h"
#include "dbus-manager.h"
@@ -84,7 +85,7 @@ static int bus_scope_set_transient_property(
return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
if (streq(name, "PIDs")) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ _cleanup_(pidref_done) PidRef sender_pidref = PIDREF_NULL;
unsigned n = 0;
r = sd_bus_message_enter_container(message, 'a', "u");
@@ -94,7 +95,7 @@ static int bus_scope_set_transient_property(
for (;;) {
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
uint32_t upid;
- pid_t pid;
+ PidRef *p;
r = sd_bus_message_read(message, "u", &upid);
if (r < 0)
@@ -103,28 +104,27 @@ static int bus_scope_set_transient_property(
break;
if (upid == 0) {
- if (!creds) {
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (!pidref_is_set(&sender_pidref)) {
+ r = bus_query_sender_pidref(message, &sender_pidref);
if (r < 0)
return r;
}
- r = sd_bus_creds_get_pid(creds, &pid);
+ p = &sender_pidref;
+ } else {
+ r = pidref_set_pid(&pidref, upid);
if (r < 0)
return r;
- } else
- pid = (uid_t) upid;
- r = pidref_set_pid(&pidref, pid);
- if (r < 0)
- return r;
+ p = &pidref;
+ }
- r = unit_pid_attachable(u, &pidref, error);
+ r = unit_pid_attachable(u, p, error);
if (r < 0)
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
- r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+ r = unit_watch_pidref(u, p, /* exclusive= */ false);
if (r < 0 && r != -EEXIST)
return r;
}
diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c
index cc478f4..ff970df 100644
--- a/src/core/dbus-service.c
+++ b/src/core/dbus-service.c
@@ -166,9 +166,7 @@ static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_
r = bus_verify_manage_units_async_full(
u,
is_image ? "mount-image" : "bind-mount",
- CAP_SYS_ADMIN,
N_("Authentication is required to mount on '$(unit)'."),
- true,
message,
error);
if (r < 0)
diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c
index e77e9e5..03c5b4a 100644
--- a/src/core/dbus-socket.c
+++ b/src/core/dbus-socket.c
@@ -86,6 +86,7 @@ const sd_bus_vtable bus_socket_vtable[] = {
SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassFileDescriptorsToExec", "b", bus_property_get_bool, offsetof(Socket, pass_fds_to_exec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -190,6 +191,9 @@ static int bus_socket_set_transient_property(
if (streq(name, "PassCredentials"))
return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error);
+ if (streq(name, "PassFileDescriptorsToExec"))
+ return bus_set_transient_bool(u, name, &s->pass_fds_to_exec, message, flags, error);
+
if (streq(name, "PassSecurity"))
return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error);
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
index 1a037b7..953cd51 100644
--- a/src/core/dbus-unit.c
+++ b/src/core/dbus-unit.c
@@ -7,6 +7,7 @@
#include "bus-common-errors.h"
#include "bus-get-properties.h"
#include "bus-polkit.h"
+#include "bus-util.h"
#include "cgroup-util.h"
#include "condition.h"
#include "dbus-job.h"
@@ -177,7 +178,7 @@ static int property_get_dependencies(
return sd_bus_message_close_container(reply);
}
-static int property_get_requires_mounts_for(
+static int property_get_mounts_for(
sd_bus *bus,
const char *path,
const char *interface,
@@ -408,9 +409,7 @@ int bus_unit_method_start_generic(
r = bus_verify_manage_units_async_full(
u,
verb,
- CAP_SYS_ADMIN,
polkit_message_for_job[job_type],
- true,
message,
error);
if (r < 0)
@@ -491,9 +490,7 @@ int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_
r = bus_verify_manage_units_async_full(
u,
jtype,
- CAP_SYS_ADMIN,
polkit_message_for_job[type],
- true,
message,
error);
if (r < 0)
@@ -549,9 +546,7 @@ int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *
r = bus_verify_manage_units_async_full(
u,
"kill",
- CAP_KILL,
N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."),
- true,
message,
error);
if (r < 0)
@@ -579,9 +574,7 @@ int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus
r = bus_verify_manage_units_async_full(
u,
"reset-failed",
- CAP_SYS_ADMIN,
N_("Authentication is required to reset the \"failed\" state of '$(unit)'."),
- true,
message,
error);
if (r < 0)
@@ -611,9 +604,7 @@ int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_b
r = bus_verify_manage_units_async_full(
u,
"set-property",
- CAP_SYS_ADMIN,
N_("Authentication is required to set properties on '$(unit)'."),
- true,
message,
error);
if (r < 0)
@@ -641,9 +632,7 @@ int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *e
r = bus_verify_manage_units_async_full(
u,
"ref",
- CAP_SYS_ADMIN,
- NULL,
- false,
+ /* polkit_message= */ NULL,
message,
error);
if (r < 0)
@@ -712,9 +701,7 @@ int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error
r = bus_verify_manage_units_async_full(
u,
"clean",
- CAP_DAC_OVERRIDE,
N_("Authentication is required to delete files and directories associated with '$(unit)'."),
- true,
message,
error);
if (r < 0)
@@ -736,22 +723,13 @@ int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error
}
static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) {
- const char* perm;
- int (*method)(Unit*);
Unit *u = ASSERT_PTR(userdata);
- bool reply_no_delay = false;
int r;
assert(message);
assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
- if (action == FREEZER_FREEZE) {
- perm = "stop";
- method = unit_freeze;
- } else {
- perm = "start";
- method = unit_thaw;
- }
+ const char *perm = action == FREEZER_FREEZE ? "stop" : "start";
r = mac_selinux_unit_access_check(u, message, perm, error);
if (r < 0)
@@ -760,9 +738,7 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda
r = bus_verify_manage_units_async_full(
u,
perm,
- CAP_SYS_ADMIN,
N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."),
- true,
message,
error);
if (r < 0)
@@ -770,19 +746,21 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda
if (r == 0)
return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
- r = method(u);
+ r = unit_freezer_action(u, action);
if (r == -EOPNOTSUPP)
- return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id);
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit does not support freeze/thaw");
if (r == -EBUSY)
- return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job.");
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job");
if (r == -EHOSTDOWN)
- return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive.");
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is not active");
if (r == -EALREADY)
- return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id);
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Previously requested freezer operation for unit is still in progress");
+ if (r == -ECHILD)
+ return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Unit is frozen by a parent slice");
if (r < 0)
return r;
- if (r == 0)
- reply_no_delay = true;
+
+ bool reply_now = r == 0;
if (u->pending_freezer_invocation) {
bus_unit_send_pending_freezer_message(u, true);
@@ -791,7 +769,7 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda
u->pending_freezer_invocation = sd_bus_message_ref(message);
- if (reply_no_delay) {
+ if (reply_now) {
r = bus_unit_send_pending_freezer_message(u, false);
if (r < 0)
return r;
@@ -879,7 +857,8 @@ const sd_bus_vtable bus_unit_vtable[] = {
SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
- SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_mounts_for, offsetof(Unit, mounts_for[UNIT_MOUNT_REQUIRES]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WantsMountsFor", "as", property_get_mounts_for, offsetof(Unit, mounts_for[UNIT_MOUNT_WANTS]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1235,12 +1214,32 @@ static int property_get_cgroup(
* indicates the root cgroup, which we report as "/". c) all
* other cases we report as-is. */
- if (u->cgroup_path)
- t = empty_to_root(u->cgroup_path);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+
+ if (crt && crt->cgroup_path)
+ t = empty_to_root(crt->cgroup_path);
return sd_bus_message_append(reply, "s", t);
}
+static int property_get_cgroup_id(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ /* Property getter used for "ControlGroupId" in bus_unit_cgroup_vtable: appends the cgroup ID of the
+ * unit passed as userdata to the reply as "t". */
+ Unit *u = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+ /* unit_get_cgroup_runtime() may return NULL; in that case 0 is reported. */
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ return sd_bus_message_append(reply, "t", crt ? crt->cgroup_id : UINT64_C(0));
+}
+
static int append_process(sd_bus_message *reply, const char *p, PidRef *pid, Set *pids) {
_cleanup_free_ char *buf = NULL, *cmdline = NULL;
int r;
@@ -1299,7 +1298,7 @@ static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) {
* threaded domain cgroup contains the PIDs of all processes in the subtree and is not
* readable in the subtree proper. */
- r = cg_read_pidref(f, &pidref);
+ r = cg_read_pidref(f, &pidref, /* flags = */ 0);
if (IN_SET(r, 0, -EOPNOTSUPP))
break;
if (r < 0)
@@ -1369,8 +1368,10 @@ int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bu
if (r < 0)
return r;
- if (u->cgroup_path) {
- r = append_cgroup(reply, u->cgroup_path, pids);
+ CGroupRuntime *crt;
+ crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->cgroup_path) {
+ r = append_cgroup(reply, crt->cgroup_path, pids);
if (r < 0)
return r;
}
@@ -1441,6 +1442,28 @@ static int property_get_io_counter(
return sd_bus_message_append(reply, "t", value);
}
+static int property_get_effective_limit(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ /* Property getter that derives the limit type from the property name via
+ * cgroup_effective_limit_type_from_string() and appends the value obtained from
+ * unit_get_effective_limit() to the reply as "t". */
+ uint64_t value = CGROUP_LIMIT_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ ssize_t type;
+
+ assert(bus);
+ assert(reply);
+ assert(property);
+ /* The property name must map to a known limit type (asserted). The lookup result itself is
+ * deliberately ignored; presumably value stays CGROUP_LIMIT_MAX on failure — TODO confirm against
+ * unit_get_effective_limit(). */
+ assert_se((type = cgroup_effective_limit_type_from_string(property)) >= 0);
+ (void) unit_get_effective_limit(u, type, &value);
+ return sd_bus_message_append(reply, "t", value);
+}
+
int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
_cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
_cleanup_set_free_ Set *pids = NULL;
@@ -1478,7 +1501,7 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing.");
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds);
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID|SD_BUS_CREDS_PIDFD, &creds);
if (r < 0)
return r;
@@ -1489,7 +1512,6 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
_cleanup_(pidref_freep) PidRef *pidref = NULL;
uid_t process_uid, sender_uid;
uint32_t upid;
- pid_t pid;
r = sd_bus_message_read(message, "u", &upid);
if (r < 0)
@@ -1498,13 +1520,14 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
break;
if (upid == 0) {
- r = sd_bus_creds_get_pid(creds, &pid);
+ _cleanup_(pidref_done) PidRef p = PIDREF_NULL;
+ r = bus_creds_get_pidref(creds, &p);
if (r < 0)
return r;
- } else
- pid = (uid_t) upid;
- r = pidref_new_from_pid(pid, &pidref);
+ r = pidref_dup(&p, &pidref);
+ } else
+ r = pidref_new_from_pid(upid, &pidref);
if (r < 0)
return r;
@@ -1530,9 +1553,9 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");
if (process_uid != sender_uid)
- return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid);
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pidref->pid);
if (process_uid != u->ref_uid)
- return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid);
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pidref->pid);
}
r = set_ensure_consume(&pids, &pidref_hash_ops_free, TAKE_PTR(pidref));
@@ -1555,17 +1578,20 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0),
- SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0),
+ SD_BUS_PROPERTY("ControlGroupId", "t", property_get_cgroup_id, 0, 0),
SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0),
SD_BUS_PROPERTY("MemoryPeak", "t", property_get_memory_accounting, 0, 0),
SD_BUS_PROPERTY("MemorySwapCurrent", "t", property_get_memory_accounting, 0, 0),
SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0),
SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0),
SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0),
+ SD_BUS_PROPERTY("EffectiveMemoryMax", "t", property_get_effective_limit, 0, 0),
+ SD_BUS_PROPERTY("EffectiveMemoryHigh", "t", property_get_effective_limit, 0, 0),
SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0),
SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0),
SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0),
SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0),
+ SD_BUS_PROPERTY("EffectiveTasksMax", "t", property_get_effective_limit, 0, 0),
SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0),
SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0),
SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0),
@@ -1576,16 +1602,16 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0),
SD_BUS_METHOD_WITH_ARGS("GetProcesses",
- SD_BUS_NO_ARGS,
- SD_BUS_ARGS("a(sus)", processes),
- bus_unit_method_get_processes,
- SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_NO_ARGS,
+ SD_BUS_ARGS("a(sus)", processes),
+ bus_unit_method_get_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("AttachProcesses",
- SD_BUS_ARGS("s", subcgroup, "au", pids),
- SD_BUS_NO_RESULT,
- bus_unit_method_attach_processes,
- SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_ARGS("s", subcgroup, "au", pids),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_attach_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_VTABLE_END
};
@@ -2210,7 +2236,7 @@ static int bus_unit_set_transient_property(
return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error);
if (streq(name, "JobTimeoutRebootArgument"))
- return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error);
+ return bus_set_transient_reboot_parameter(u, name, &u->job_timeout_reboot_arg, message, flags, error);
if (streq(name, "StartLimitIntervalUSec"))
return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error);
@@ -2234,7 +2260,7 @@ static int bus_unit_set_transient_property(
return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error);
if (streq(name, "RebootArgument"))
- return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error);
+ return bus_set_transient_reboot_parameter(u, name, &u->reboot_arg, message, flags, error);
if (streq(name, "CollectMode"))
return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error);
@@ -2261,7 +2287,9 @@ static int bus_unit_set_transient_property(
u->documentation = strv_free(u->documentation);
unit_write_settingf(u, flags, name, "%s=", name);
} else {
- strv_extend_strv(&u->documentation, l, false);
+ r = strv_extend_strv(&u->documentation, l, /* filter_duplicates= */ false);
+ if (r < 0)
+ return r;
STRV_FOREACH(p, l)
unit_write_settingf(u, flags, name, "%s=%s", name, *p);
@@ -2308,7 +2336,7 @@ static int bus_unit_set_transient_property(
return 1;
- } else if (streq(name, "RequiresMountsFor")) {
+ } else if (STR_IN_SET(name, "RequiresMountsFor", "WantsMountsFor")) {
_cleanup_strv_free_ char **l = NULL;
r = sd_bus_message_read_strv(message, &l);
@@ -2328,9 +2356,9 @@ static int bus_unit_set_transient_property(
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
- r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, *p, UNIT_DEPENDENCY_FILE, unit_mount_dependency_type_from_string(name));
if (r < 0)
- return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p);
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add requested mount \"%s\": %m", *p);
unit_write_settingf(u, flags, name, "%s=%s", name, *p);
}
diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c
index d680a64..b871d89 100644
--- a/src/core/dbus-util.c
+++ b/src/core/dbus-util.c
@@ -6,6 +6,7 @@
#include "escape.h"
#include "parse-util.h"
#include "path-util.h"
+#include "reboot-util.h"
#include "unit-printf.h"
#include "user-util.h"
#include "unit.h"
@@ -39,6 +40,7 @@ static bool valid_user_group_name_or_id_relaxed(const char *u) {
BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed);
BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute);
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(reboot_parameter, reboot_parameter_is_valid);
int bus_set_transient_string(
Unit *u,
@@ -151,9 +153,7 @@ int bus_set_transient_usec_internal(
int bus_verify_manage_units_async_full(
Unit *u,
const char *verb,
- int capability,
const char *polkit_message,
- bool interactive,
sd_bus_message *call,
sd_bus_error *error) {
@@ -171,11 +171,8 @@ int bus_verify_manage_units_async_full(
return bus_verify_polkit_async(
call,
- capability,
"org.freedesktop.systemd1.manage-units",
details,
- interactive,
- UID_INVALID,
&u->manager->polkit_registry,
error);
}
diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h
index 9464b25..0fc3a94 100644
--- a/src/core/dbus-util.h
+++ b/src/core/dbus-util.h
@@ -239,6 +239,7 @@ int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_messag
int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_reboot_parameter(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
@@ -249,7 +250,7 @@ static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, s
static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
return bus_set_transient_usec_internal(u, name, p, true, message, flags, error);
}
-int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_manage_units_async_full(Unit *u, const char *verb, const char *polkit_message, sd_bus_message *call, sd_bus_error *error);
int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator);
diff --git a/src/core/dbus.c b/src/core/dbus.c
index ba2cec4..1c6f6fc 100644
--- a/src/core/dbus.c
+++ b/src/core/dbus.c
@@ -232,6 +232,8 @@ static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_er
return 0;
path = sd_bus_message_get_path(message);
+ if (!path)
+ return 0;
if (object_path_startswith("/org/freedesktop/systemd1", path)) {
r = mac_selinux_access_check(message, verb, error);
@@ -241,25 +243,20 @@ static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_er
return 0;
}
- if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
- pid_t pid;
-
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
- if (r < 0)
- return 0;
+ if (streq(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- r = sd_bus_creds_get_pid(creds, &pid);
+ r = bus_query_sender_pidref(message, &pidref);
if (r < 0)
return 0;
- u = manager_get_unit_by_pid(m, pid);
+ u = manager_get_unit_by_pidref(m, &pidref);
} else {
r = manager_get_job_from_dbus_path(m, path, &j);
if (r >= 0)
u = j->unit;
else
- manager_load_unit_from_dbus_path(m, path, NULL, &u);
+ (void) manager_load_unit_from_dbus_path(m, path, NULL, &u);
}
if (!u)
return 0;
@@ -280,24 +277,19 @@ static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_
assert(bus);
assert(path);
- if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
- _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ if (streq(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
sd_bus_message *message;
- pid_t pid;
message = sd_bus_get_current_message(bus);
if (!message)
return 0;
- r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
- if (r < 0)
- return r;
-
- r = sd_bus_creds_get_pid(creds, &pid);
+ r = bus_query_sender_pidref(message, &pidref);
if (r < 0)
return r;
- u = manager_get_unit_by_pid(m, pid);
+ u = manager_get_unit_by_pidref(m, &pidref);
if (!u)
return 0;
} else {
@@ -739,7 +731,7 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void
log_debug("Accepting direct incoming connection from " PID_FMT " (%s) [%s]", pid, strna(comm), strna(description));
}
- r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC);
if (r < 0) {
log_warning_errno(r, "Failed to attach new connection bus to event loop: %m");
return 0;
@@ -847,7 +839,7 @@ int bus_init_api(Manager *m) {
if (r < 0)
return log_error_errno(r, "Failed to connect to API bus: %m");
- r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC);
if (r < 0)
return log_error_errno(r, "Failed to attach API bus to event loop: %m");
@@ -904,7 +896,7 @@ int bus_init_system(Manager *m) {
if (r < 0)
return log_error_errno(r, "Failed to connect to system bus: %m");
- r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC);
if (r < 0)
return log_error_errno(r, "Failed to attach system bus to event loop: %m");
@@ -1073,7 +1065,7 @@ void bus_done(Manager *m) {
assert(!m->subscribed);
m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
- bus_verify_polkit_async_registry_free(m->polkit_registry);
+ m->polkit_registry = hashmap_free(m->polkit_registry);
}
int bus_fdset_add_all(Manager *m, FDSet *fds) {
@@ -1121,31 +1113,29 @@ int bus_foreach_bus(
int (*send_message)(sd_bus *bus, void *userdata),
void *userdata) {
- sd_bus *b;
- int r, ret = 0;
+ int r = 0;
+
+ assert(m);
+ assert(send_message);
/* Send to all direct buses, unconditionally */
+ sd_bus *b;
SET_FOREACH(b, m->private_buses) {
/* Don't bother with enqueuing these messages to clients that haven't started yet */
if (sd_bus_is_ready(b) <= 0)
continue;
- r = send_message(b, userdata);
- if (r < 0)
- ret = r;
+ RET_GATHER(r, send_message(b, userdata));
}
/* Send to API bus, but only if somebody is subscribed */
if (m->api_bus &&
(sd_bus_track_count(m->subscribed) > 0 ||
- sd_bus_track_count(subscribed2) > 0)) {
- r = send_message(m->api_bus, userdata);
- if (r < 0)
- ret = r;
- }
+ sd_bus_track_count(subscribed2) > 0))
+ RET_GATHER(r, send_message(m->api_bus, userdata));
- return ret;
+ return r;
}
void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) {
@@ -1189,22 +1179,46 @@ int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) {
}
int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
- return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error);
+ return bus_verify_polkit_async(
+ call,
+ "org.freedesktop.systemd1.manage-units",
+ /* details= */ NULL,
+ &m->polkit_registry,
+ error);
}
int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
- return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error);
+ return bus_verify_polkit_async(
+ call,
+ "org.freedesktop.systemd1.manage-unit-files",
+ /* details= */ NULL,
+ &m->polkit_registry,
+ error);
}
int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
- return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error);
+ return bus_verify_polkit_async(
+ call,
+ "org.freedesktop.systemd1.reload-daemon",
+ /* details= */ NULL,
+ &m->polkit_registry, error);
}
int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
- return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error);
+ return bus_verify_polkit_async(
+ call,
+ "org.freedesktop.systemd1.set-environment",
+ /* details= */ NULL,
+ &m->polkit_registry,
+ error);
}
int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
- return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error);
+ return bus_verify_polkit_async(
+ call,
+ "org.freedesktop.systemd1.bypass-dump-ratelimit",
+ /* details= */ NULL,
+ &m->polkit_registry,
+ error);
}
uint64_t manager_bus_n_queued_write(Manager *m) {
diff --git a/src/core/device.c b/src/core/device.c
index 6b2d7c3..d856767 100644
--- a/src/core/device.c
+++ b/src/core/device.c
@@ -119,10 +119,9 @@ static int device_set_sysfs(Device *d, const char *sysfs) {
}
static void device_init(Unit *u) {
- Device *d = DEVICE(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
- assert(d);
- assert(UNIT(d)->load_state == UNIT_STUB);
+ assert(u->load_state == UNIT_STUB);
/* In contrast to all other unit types we timeout jobs waiting
* for devices by default. This is because they otherwise wait
@@ -137,9 +136,7 @@ static void device_init(Unit *u) {
}
static void device_done(Unit *u) {
- Device *d = DEVICE(u);
-
- assert(d);
+ Device *d = ASSERT_PTR(DEVICE(u));
device_unset_sysfs(d);
d->deserialized_sysfs = mfree(d->deserialized_sysfs);
@@ -258,9 +255,8 @@ static void device_update_found_by_name(Manager *m, const char *path, DeviceFoun
}
static int device_coldplug(Unit *u) {
- Device *d = DEVICE(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
- assert(d);
assert(d->state == DEVICE_DEAD);
/* First, let's put the deserialized state and found mask into effect, if we have it. */
@@ -336,9 +332,7 @@ static int device_coldplug(Unit *u) {
}
static void device_catchup(Unit *u) {
- Device *d = DEVICE(u);
-
- assert(d);
+ Device *d = ASSERT_PTR(DEVICE(u));
/* Second, let's update the state with the enumerated state */
device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK);
@@ -405,11 +399,9 @@ static int device_found_from_string_many(const char *name, DeviceFound *ret) {
}
static int device_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Device *d = ASSERT_PTR(DEVICE(u));
_cleanup_free_ char *s = NULL;
- Device *d = DEVICE(u);
- assert(d);
- assert(u);
assert(f);
assert(fds);
@@ -428,11 +420,9 @@ static int device_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Device *d = DEVICE(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
int r;
- assert(d);
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -472,10 +462,11 @@ static int device_deserialize_item(Unit *u, const char *key, const char *value,
}
static void device_dump(Unit *u, FILE *f, const char *prefix) {
- Device *d = DEVICE(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
_cleanup_free_ char *s = NULL;
- assert(d);
+ assert(f);
+ assert(prefix);
(void) device_found_to_string_many(d->found, &s);
@@ -495,15 +486,15 @@ static void device_dump(Unit *u, FILE *f, const char *prefix) {
}
static UnitActiveState device_active_state(Unit *u) {
- assert(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
- return state_translation_table[DEVICE(u)->state];
+ return state_translation_table[d->state];
}
static const char *device_sub_state_to_string(Unit *u) {
- assert(u);
+ Device *d = ASSERT_PTR(DEVICE(u));
- return device_state_to_string(DEVICE(u)->state);
+ return device_state_to_string(d->state);
}
static int device_update_description(Unit *u, sd_device *dev, const char *path) {
@@ -538,12 +529,11 @@ static int device_update_description(Unit *u, sd_device *dev, const char *path)
}
static int device_add_udev_wants(Unit *u, sd_device *dev) {
+ Device *d = ASSERT_PTR(DEVICE(u));
_cleanup_strv_free_ char **added = NULL;
const char *wants, *property;
- Device *d = DEVICE(u);
int r;
- assert(d);
assert(dev);
property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS";
@@ -646,6 +636,8 @@ static void device_upgrade_mount_deps(Unit *u) {
/* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */
+ assert(u);
+
HASHMAP_FOREACH_KEY(v, other, unit_get_dependencies(u, UNIT_REQUIRED_BY)) {
if (other->type != UNIT_MOUNT)
continue;
@@ -706,16 +698,18 @@ static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool
unit_add_to_load_queue(u);
}
- if (!DEVICE(u)->path) {
- DEVICE(u)->path = strdup(path);
- if (!DEVICE(u)->path)
+ Device *d = ASSERT_PTR(DEVICE(u));
+
+ if (!d->path) {
+ d->path = strdup(path);
+ if (!d->path)
return log_oom();
}
/* If this was created via some dependency and has not actually been seen yet ->sysfs will not be
* initialized. Hence initialize it if necessary. */
if (sysfs) {
- r = device_set_sysfs(DEVICE(u), sysfs);
+ r = device_set_sysfs(d, sysfs);
if (r < 0)
return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs);
@@ -730,11 +724,11 @@ static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool
* by systemd before the device appears on its radar. In this case the device unit is partially
* initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't
* present. Fix this up now. */
- if (dev && device_is_bound_by_mounts(DEVICE(u), dev))
+ if (dev && device_is_bound_by_mounts(d, dev))
device_upgrade_mount_deps(u);
if (units) {
- r = set_ensure_put(units, NULL, DEVICE(u));
+ r = set_ensure_put(units, NULL, d);
if (r < 0)
return log_unit_error_errno(u, r, "Failed to store unit: %m");
}
@@ -950,10 +944,7 @@ static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set
}
static Unit *device_following(Unit *u) {
- Device *d = DEVICE(u);
- Device *first = NULL;
-
- assert(d);
+ Device *d = ASSERT_PTR(DEVICE(u)), *first = NULL;
if (startswith(u->id, "sys-"))
return NULL;
@@ -973,16 +964,15 @@ static Unit *device_following(Unit *u) {
return UNIT(first);
}
-static int device_following_set(Unit *u, Set **_set) {
- Device *d = DEVICE(u);
+static int device_following_set(Unit *u, Set **ret) {
+ Device *d = ASSERT_PTR(DEVICE(u));
_cleanup_set_free_ Set *set = NULL;
int r;
- assert(d);
- assert(_set);
+ assert(ret);
if (LIST_JUST_US(same_sysfs, d)) {
- *_set = NULL;
+ *ret = NULL;
return 0;
}
@@ -1002,7 +992,7 @@ static int device_following_set(Unit *u, Set **_set) {
return r;
}
- *_set = TAKE_PTR(set);
+ *ret = TAKE_PTR(set);
return 1;
}
@@ -1061,6 +1051,9 @@ static void device_enumerate(Manager *m) {
_cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
Device *d;
+ if (device_is_processed(dev) <= 0)
+ continue;
+
if (device_setup_units(m, dev, &ready_units, &not_ready_units) < 0)
continue;
diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c
index 2bf9094..11de2ba 100644
--- a/src/core/dynamic-user.c
+++ b/src/core/dynamic-user.c
@@ -20,7 +20,7 @@
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
-#include "uid-alloc-range.h"
+#include "uid-classification.h"
#include "user-util.h"
/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */
@@ -143,7 +143,6 @@ static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret)
}
static int make_uid_symlinks(uid_t uid, const char *name, bool b) {
-
char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1];
const char *path2;
int r = 0, k;
@@ -293,8 +292,8 @@ static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) {
}
/* Some superficial check whether this UID/GID might already be taken by some static user */
- if (getpwuid(candidate) ||
- getgrgid((gid_t) candidate) ||
+ if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0 ||
+ getgrgid_malloc((gid_t) candidate, /* ret= */ NULL) >= 0 ||
search_ipc(candidate, (gid_t) candidate) != 0) {
(void) unlink(lock_path);
continue;
@@ -419,30 +418,26 @@ static int dynamic_user_realize(
/* First, let's parse this as numeric UID */
r = parse_uid(d->name, &num);
if (r < 0) {
- struct passwd *p;
- struct group *g;
+ _cleanup_free_ struct passwd *p = NULL;
+ _cleanup_free_ struct group *g = NULL;
if (is_user) {
/* OK, this is not a numeric UID. Let's see if there's a user by this name */
- p = getpwnam(d->name);
- if (p) {
+ if (getpwnam_malloc(d->name, &p) >= 0) {
num = p->pw_uid;
gid = p->pw_gid;
} else {
/* if the user does not exist but the group with the same name exists, refuse operation */
- g = getgrnam(d->name);
- if (g)
+ if (getgrnam_malloc(d->name, /* ret= */ NULL) >= 0)
return -EILSEQ;
}
} else {
/* Let's see if there's a group by this name */
- g = getgrnam(d->name);
- if (g)
+ if (getgrnam_malloc(d->name, &g) >= 0)
num = (uid_t) g->gr_gid;
else {
/* if the group does not exist but the user with the same name exists, refuse operation */
- p = getpwnam(d->name);
- if (p)
+ if (getpwnam_malloc(d->name, /* ret= */ NULL) >= 0)
return -EILSEQ;
}
}
@@ -484,13 +479,12 @@ static int dynamic_user_realize(
uid_lock_fd = new_uid_lock_fd;
}
} else if (is_user && !uid_is_dynamic(num)) {
- struct passwd *p;
+ _cleanup_free_ struct passwd *p = NULL;
/* Statically allocated user may have different uid and gid. So, let's obtain the gid. */
- errno = 0;
- p = getpwuid(num);
- if (!p)
- return errno_or_else(ESRCH);
+ r = getpwuid_malloc(num, &p);
+ if (r < 0)
+ return r;
gid = p->pw_gid;
}
@@ -658,7 +652,7 @@ void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, Dyn
/* Parse the serialization again, after a daemon reload */
- r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL);
+ r = extract_many_words(&value, NULL, 0, &name, &s0, &s1);
if (r != 3 || !isempty(value)) {
log_debug("Unable to parse dynamic user line.");
return;
@@ -761,7 +755,6 @@ int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) {
int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret) {
_cleanup_(dynamic_creds_unrefp) DynamicCreds *creds = NULL;
- bool acquired = false;
int r;
assert(m);
@@ -784,20 +777,14 @@ int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicC
r = dynamic_user_acquire(m, user, &creds->user);
if (r < 0)
return r;
-
- acquired = true;
}
- if (creds->user && (!group || streq_ptr(user, group)))
- creds->group = dynamic_user_ref(creds->user);
- else if (group) {
+ if (group && !streq_ptr(user, group)) {
r = dynamic_user_acquire(m, group, &creds->group);
- if (r < 0) {
- if (acquired)
- creds->user = dynamic_user_unref(creds->user);
+ if (r < 0)
return r;
- }
- }
+ } else
+ creds->group = ASSERT_PTR(dynamic_user_ref(creds->user));
*ret = TAKE_PTR(creds);
diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c
index e2cd931..dbda6e5 100644
--- a/src/core/emergency-action.c
+++ b/src/core/emergency-action.c
@@ -13,22 +13,22 @@
#include "virt.h"
static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = {
- [EMERGENCY_ACTION_NONE] = "none",
- [EMERGENCY_ACTION_REBOOT] = "reboot",
- [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force",
- [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate",
- [EMERGENCY_ACTION_POWEROFF] = "poweroff",
- [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force",
+ [EMERGENCY_ACTION_NONE] = "none",
+ [EMERGENCY_ACTION_EXIT] = "exit",
+ [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force",
+ [EMERGENCY_ACTION_REBOOT] = "reboot",
+ [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force",
+ [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate",
+ [EMERGENCY_ACTION_POWEROFF] = "poweroff",
+ [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force",
[EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate",
- [EMERGENCY_ACTION_EXIT] = "exit",
- [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force",
- [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot",
- [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force",
- [EMERGENCY_ACTION_KEXEC] = "kexec",
- [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force",
- [EMERGENCY_ACTION_HALT] = "halt",
- [EMERGENCY_ACTION_HALT_FORCE] = "halt-force",
- [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate",
+ [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot",
+ [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force",
+ [EMERGENCY_ACTION_KEXEC] = "kexec",
+ [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force",
+ [EMERGENCY_ACTION_HALT] = "halt",
+ [EMERGENCY_ACTION_HALT_FORCE] = "halt-force",
+ [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate",
};
static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) {
@@ -216,7 +216,7 @@ int parse_emergency_action(
if (x < 0)
return -EINVAL;
- if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION)
+ if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x > _EMERGENCY_ACTION_LAST_USER_ACTION)
return -EOPNOTSUPP;
*ret = x;
diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h
index 33e0ec6..6bec475 100644
--- a/src/core/emergency-action.h
+++ b/src/core/emergency-action.h
@@ -7,15 +7,15 @@
typedef enum EmergencyAction {
EMERGENCY_ACTION_NONE,
+ EMERGENCY_ACTION_EXIT,
+ EMERGENCY_ACTION_EXIT_FORCE,
+ _EMERGENCY_ACTION_LAST_USER_ACTION = EMERGENCY_ACTION_EXIT_FORCE,
EMERGENCY_ACTION_REBOOT,
EMERGENCY_ACTION_REBOOT_FORCE,
EMERGENCY_ACTION_REBOOT_IMMEDIATE,
EMERGENCY_ACTION_POWEROFF,
EMERGENCY_ACTION_POWEROFF_FORCE,
EMERGENCY_ACTION_POWEROFF_IMMEDIATE,
- EMERGENCY_ACTION_EXIT,
- _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT,
- EMERGENCY_ACTION_EXIT_FORCE,
EMERGENCY_ACTION_SOFT_REBOOT,
EMERGENCY_ACTION_SOFT_REBOOT_FORCE,
EMERGENCY_ACTION_KEXEC,
diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c
index 6bcfb68..f4cff57 100644
--- a/src/core/exec-credential.c
+++ b/src/core/exec-credential.c
@@ -9,6 +9,7 @@
#include "fileio.h"
#include "glob-util.h"
#include "io-util.h"
+#include "iovec-util.h"
#include "label-util.h"
#include "mkdir-label.h"
#include "mount-util.h"
@@ -48,6 +49,12 @@ DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
char, string_hash_func, string_compare_func,
ExecLoadCredential, exec_load_credential_free);
+bool exec_params_need_credentials(const ExecParameters *p) {
+ assert(p);
+
+ return p->flags & (EXEC_SETUP_CREDENTIALS|EXEC_SETUP_CREDENTIALS_FRESH);
+}
+
bool exec_context_has_credentials(const ExecContext *c) {
assert(c);
@@ -56,16 +63,15 @@ bool exec_context_has_credentials(const ExecContext *c) {
!set_isempty(c->import_credentials);
}
-bool exec_context_has_encrypted_credentials(ExecContext *c) {
- ExecLoadCredential *load_cred;
- ExecSetCredential *set_cred;
-
+bool exec_context_has_encrypted_credentials(const ExecContext *c) {
assert(c);
+ const ExecLoadCredential *load_cred;
HASHMAP_FOREACH(load_cred, c->load_credentials)
if (load_cred->encrypted)
return true;
+ const ExecSetCredential *set_cred;
HASHMAP_FOREACH(set_cred, c->set_credentials)
if (set_cred->encrypted)
return true;
@@ -106,7 +112,7 @@ int exec_context_get_credential_directory(
assert(unit);
assert(ret);
- if (!exec_context_has_credentials(context)) {
+ if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context)) {
*ret = NULL;
return 0;
}
@@ -172,6 +178,10 @@ static int write_credential(
_cleanup_close_ int fd = -EBADF;
int r;
+ assert(dfd >= 0);
+ assert(id);
+ assert(data || size == 0);
+
r = tempfn_random_child("", "cred", &tmp);
if (r < 0)
return r;
@@ -224,7 +234,6 @@ typedef enum CredentialSearchPath {
} CredentialSearchPath;
static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
-
_cleanup_strv_free_ char **l = NULL;
assert(params);
@@ -243,9 +252,8 @@ static char **credential_search_path(const ExecParameters *params, CredentialSea
}
if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
- if (params->received_credentials_directory)
- if (strv_extend(&l, params->received_credentials_directory) < 0)
- return NULL;
+ if (strv_extend(&l, params->received_credentials_directory) < 0)
+ return NULL;
if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
return NULL;
@@ -271,20 +279,29 @@ static int maybe_decrypt_and_write_credential(
size_t size,
uint64_t *left) {
- _cleanup_free_ void *plaintext = NULL;
+ _cleanup_(iovec_done_erase) struct iovec plaintext = {};
size_t add;
int r;
- if (encrypted) {
- size_t plaintext_size = 0;
+ assert(dir_fd >= 0);
+ assert(id);
+ assert(left);
- r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
- &plaintext, &plaintext_size);
+ if (encrypted) {
+ r = decrypt_credential_and_warn(
+ id,
+ now(CLOCK_REALTIME),
+ /* tpm2_device= */ NULL,
+ /* tpm2_signature_path= */ NULL,
+ getuid(),
+ &IOVEC_MAKE(data, size),
+ CREDENTIAL_ANY_SCOPE,
+ &plaintext);
if (r < 0)
return r;
- data = plaintext;
- size = plaintext_size;
+ data = plaintext.iov_base;
+ size = plaintext.iov_len;
}
add = strlen(id) + size;
@@ -302,7 +319,7 @@ static int maybe_decrypt_and_write_credential(
static int load_credential_glob(
const char *path,
bool encrypted,
- char **search_path,
+ char * const *search_path,
ReadFullFileFlags flags,
int write_dfd,
uid_t uid,
@@ -312,6 +329,11 @@ static int load_credential_glob(
int r;
+ assert(path);
+ assert(search_path);
+ assert(write_dfd >= 0);
+ assert(left);
+
STRV_FOREACH(d, search_path) {
_cleanup_globfree_ glob_t pglob = {};
_cleanup_free_ char *j = NULL;
@@ -326,38 +348,36 @@ static int load_credential_glob(
if (r < 0)
return r;
- for (size_t n = 0; n < pglob.gl_pathc; n++) {
+ FOREACH_ARRAY(p, pglob.gl_pathv, pglob.gl_pathc) {
_cleanup_free_ char *fn = NULL;
_cleanup_(erase_and_freep) char *data = NULL;
size_t size;
/* path is absolute, hence pass AT_FDCWD as nop dir fd here */
r = read_full_file_full(
- AT_FDCWD,
- pglob.gl_pathv[n],
- UINT64_MAX,
- encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
- flags,
- NULL,
- &data, &size);
+ AT_FDCWD,
+ *p,
+ UINT64_MAX,
+ encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
+ flags,
+ NULL,
+ &data, &size);
if (r < 0)
- return log_debug_errno(r, "Failed to read credential '%s': %m",
- pglob.gl_pathv[n]);
+ return log_debug_errno(r, "Failed to read credential '%s': %m", *p);
- r = path_extract_filename(pglob.gl_pathv[n], &fn);
+ r = path_extract_filename(*p, &fn);
if (r < 0)
- return log_debug_errno(r, "Failed to extract filename from '%s': %m",
- pglob.gl_pathv[n]);
+ return log_debug_errno(r, "Failed to extract filename from '%s': %m", *p);
r = maybe_decrypt_and_write_credential(
- write_dfd,
- fn,
- encrypted,
- uid,
- gid,
- ownership_ok,
- data, size,
- left);
+ write_dfd,
+ fn,
+ encrypted,
+ uid,
+ gid,
+ ownership_ok,
+ data, size,
+ left);
if (r == -EEXIST)
continue;
if (r < 0)
@@ -423,7 +443,7 @@ static int load_credential(
/* Pass some minimal info about the unit and the credential name we are looking to acquire
* via the source socket address in case we read off an AF_UNIX socket. */
- if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
+ if (asprintf(&bindname, "@%" PRIx64 "/unit/%s/%s", random_u64(), unit, id) < 0)
return -ENOMEM;
missing_ok = false;
@@ -447,7 +467,7 @@ static int load_credential(
maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
- if (search_path) {
+ if (search_path)
STRV_FOREACH(d, search_path) {
_cleanup_free_ char *j = NULL;
@@ -465,7 +485,7 @@ static int load_credential(
if (r != -ENOENT)
break;
}
- } else if (source)
+ else if (source)
r = read_full_file_full(
read_dfd, source,
UINT64_MAX,
@@ -484,7 +504,8 @@ static int load_credential(
*
* Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
* we are fine, too. */
- log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
+ log_full_errno(hashmap_contains(context->set_credentials, id) ? LOG_DEBUG : LOG_INFO,
+ r, "Couldn't read inherited credential '%s', skipping: %m", path);
return 0;
}
if (r < 0)
@@ -518,6 +539,9 @@ static int load_cred_recurse_dir_cb(
_cleanup_free_ char *sub_id = NULL;
int r;
+ assert(path);
+ assert(de);
+
if (event != RECURSE_DIR_ENTRY)
return RECURSE_DIR_CONTINUE;
@@ -574,6 +598,8 @@ static int acquire_credentials(
int r;
assert(context);
+ assert(params);
+ assert(unit);
assert(p);
dfd = open(p, O_DIRECTORY|O_CLOEXEC);
@@ -618,8 +644,7 @@ static int acquire_credentials(
&left);
else
/* Directory */
- r = recurse_dir(
- sub_fd,
+ r = recurse_dir(sub_fd,
/* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
/* statx_mask= */ 0,
/* n_depth_max= */ UINT_MAX,
@@ -684,7 +709,7 @@ static int acquire_credentials(
/* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
* add them, so that they can act as a "default" if the same credential is specified multiple times. */
HASHMAP_FOREACH(sc, context->set_credentials) {
- _cleanup_(erase_and_freep) void *plaintext = NULL;
+ _cleanup_(iovec_done_erase) struct iovec plaintext = {};
const char *data;
size_t size, add;
@@ -698,11 +723,20 @@ static int acquire_credentials(
return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
if (sc->encrypted) {
- r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
+ r = decrypt_credential_and_warn(
+ sc->id,
+ now(CLOCK_REALTIME),
+ /* tpm2_device= */ NULL,
+ /* tpm2_signature_path= */ NULL,
+ getuid(),
+ &IOVEC_MAKE(sc->data, sc->size),
+ CREDENTIAL_ANY_SCOPE,
+ &plaintext);
if (r < 0)
return r;
- data = plaintext;
+ data = plaintext.iov_base;
+ size = plaintext.iov_len;
} else {
data = sc->data;
size = sc->size;
@@ -754,17 +788,42 @@ static int setup_credentials_internal(
uid_t uid,
gid_t gid) {
+ bool final_mounted;
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
- bool final_mounted;
- const char *where;
assert(context);
+ assert(params);
+ assert(unit);
assert(final);
assert(workspace);
+ r = path_is_mount_point(final);
+ if (r < 0)
+ return r;
+ final_mounted = r > 0;
+
+ if (final_mounted) {
+ if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
+ r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ return r;
+
+ final_mounted = false;
+ } else {
+ /* We can reuse the previous credential dir */
+ r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
+ return 0;
+ }
+ }
+ }
+
if (reuse_workspace) {
- r = path_is_mount_point(workspace, NULL, 0);
+ r = path_is_mount_point(workspace);
if (r < 0)
return r;
if (r > 0)
@@ -775,40 +834,19 @@ static int setup_credentials_internal(
} else
workspace_mounted = -1; /* ditto */
- r = path_is_mount_point(final, NULL, 0);
- if (r < 0)
- return r;
- if (r > 0) {
- /* If the final place already has something mounted, we use that. If the workspace also has
- * something mounted we assume it's actually the same mount (but with MS_RDONLY
- * different). */
- final_mounted = true;
-
- if (workspace_mounted < 0) {
- /* If the final place is mounted, but the workspace isn't, then let's bind mount
- * the final version to the workspace, and make it writable, so that we can make
- * changes */
-
- r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
- if (r < 0)
- return r;
-
- workspace_mounted = true;
- }
- } else
- final_mounted = false;
+ /* If both the final place and the workspace are mounted, we have no mounts to set up, based on
+ * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
+ * If the workspace is not mounted, we just bind the final place over and make it writable. */
+ must_mount = must_mount || final_mounted;
if (workspace_mounted < 0) {
- /* Nothing is mounted on the workspace yet, let's try to mount something now */
-
- r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
- if (r < 0) {
- /* If that didn't work, try to make a bind mount from the final to the workspace, so
- * that we can make it writable there. */
+ if (!final_mounted)
+ /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
+ * not using the final place. */
+ r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+ if (final_mounted || r < 0) {
+ /* If using final place or failed to mount new tmpfs, make a bind mount from
+ * the final to the workspace, so that we can make it writable there. */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0) {
if (!ERRNO_IS_PRIVILEGE(r))
@@ -821,12 +859,19 @@ static int setup_credentials_internal(
return r;
/* If we lack privileges to bind mount stuff, then let's gracefully proceed
- * for compat with container envs, and just use the final dir as is. */
+ * for compat with container envs, and just use the final dir as is.
+ * Final place must not be mounted in this case (refused by must_mount
+ * above) */
workspace_mounted = false;
} else {
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
- r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+ r = mount_nofollow_verbose(LOG_DEBUG,
+ NULL,
+ workspace,
+ NULL,
+ MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
+ NULL);
if (r < 0)
return r;
@@ -836,34 +881,26 @@ static int setup_credentials_internal(
workspace_mounted = true;
}
- assert(!must_mount || workspace_mounted > 0);
- where = workspace_mounted ? workspace : final;
+ assert(workspace_mounted >= 0);
+ assert(!must_mount || workspace_mounted);
+
+ const char *where = workspace_mounted ? workspace : final;
(void) label_fix_full(AT_FDCWD, where, final, 0);
r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
- if (r < 0)
- return r;
-
- if (workspace_mounted) {
- bool install;
-
- /* Determine if we should actually install the prepared mount in the final location by bind
- * mounting it there. We do so only if the mount is not established there already, and if the
- * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
- * case we are doing all this in a mount namespace, thus no one else will see that we
- * allocated a file system we are getting rid of again here. */
+ if (r < 0) {
+ /* If we're using final place as workspace, and failed to acquire credentials, we might
+ * have left half-written creds there. Let's get rid of the whole mount, so future
+ * calls won't reuse it. */
if (final_mounted)
- install = false; /* already installed */
- else {
- r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
- if (r < 0)
- return r;
+ (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
- install = r == 0; /* install only if non-empty */
- }
+ return r;
+ }
- if (install) {
+ if (workspace_mounted) {
+ if (!final_mounted) {
/* Make workspace read-only now, so that any bind mount we make from it defaults to
* read-only too */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
@@ -873,7 +910,7 @@ static int setup_credentials_internal(
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
- /* Otherwise get rid of it */
+ /* Otherwise we just get rid of the bind mount of final place */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
@@ -905,15 +942,16 @@ int exec_setup_credentials(
assert(context);
assert(params);
+ assert(unit);
- if (!exec_context_has_credentials(context))
+ if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context))
return 0;
if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
return -EINVAL;
- /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
- * and the subdir we mount over with a read-only file system readable by the service's user */
+ /* This is where we'll place stuff when we are done; the main credentials directory is world-readable,
+ * and the subdir we mount over with a read-only file system readable by the service's user. */
q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
if (!q)
return -ENOMEM;
diff --git a/src/core/exec-credential.h b/src/core/exec-credential.h
index 6f836fb..70bb46b 100644
--- a/src/core/exec-credential.h
+++ b/src/core/exec-credential.h
@@ -34,8 +34,10 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free);
extern const struct hash_ops exec_set_credential_hash_ops;
extern const struct hash_ops exec_load_credential_hash_ops;
-bool exec_context_has_encrypted_credentials(ExecContext *c);
+bool exec_params_need_credentials(const ExecParameters *p);
+
bool exec_context_has_credentials(const ExecContext *c);
+bool exec_context_has_encrypted_credentials(const ExecContext *c);
int exec_context_get_credential_directory(
const ExecContext *context,
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 8e6de15..ee8db04 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -22,7 +22,7 @@
#include "argv-util.h"
#include "barrier.h"
#include "bpf-dlopen.h"
-#include "bpf-lsm.h"
+#include "bpf-restrict-fs.h"
#include "btrfs-util.h"
#include "capability-util.h"
#include "cgroup-setup.h"
@@ -41,6 +41,7 @@
#include "hexdecoct.h"
#include "io-util.h"
#include "iovec-util.h"
+#include "journal-send.h"
#include "missing_ioprio.h"
#include "missing_prctl.h"
#include "missing_securebits.h"
@@ -59,52 +60,13 @@
#include "strv.h"
#include "terminal-util.h"
#include "utmp-wtmp.h"
+#include "vpick.h"
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
#define SNDBUF_SIZE (8*1024*1024)
-static int shift_fds(int fds[], size_t n_fds) {
- if (n_fds <= 0)
- return 0;
-
- /* Modifies the fds array! (sorts it) */
-
- assert(fds);
-
- for (int start = 0;;) {
- int restart_from = -1;
-
- for (int i = start; i < (int) n_fds; i++) {
- int nfd;
-
- /* Already at right index? */
- if (fds[i] == i+3)
- continue;
-
- nfd = fcntl(fds[i], F_DUPFD, i + 3);
- if (nfd < 0)
- return -errno;
-
- safe_close(fds[i]);
- fds[i] = nfd;
-
- /* Hmm, the fd we wanted isn't free? Then
- * let's remember that and try again from here */
- if (nfd != i+3 && restart_from < 0)
- restart_from = i;
- }
-
- if (restart_from < 0)
- break;
-
- start = restart_from;
- }
-
- return 0;
-}
-
static int flag_fds(
const int fds[],
size_t n_socket_fds,
@@ -198,9 +160,11 @@ static int connect_journal_socket(
const char *j;
int r;
- j = log_namespace ?
- strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
- "/run/systemd/journal/stdout";
+ assert(fd >= 0);
+
+ j = journal_stream_path(log_namespace);
+ if (!j)
+ return -EINVAL;
if (gid_is_valid(gid)) {
oldgid = getgid();
@@ -449,7 +413,7 @@ static int setup_input(
case EXEC_INPUT_DATA: {
int fd;
- fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
+ fd = acquire_data_fd_full(context->stdin_data, context->stdin_data_size, /* flags = */ 0);
if (fd < 0)
return fd;
@@ -670,12 +634,8 @@ static int chown_terminal(int fd, uid_t uid) {
assert(fd >= 0);
/* Before we chown/chmod the TTY, let's ensure this is actually a tty */
- if (isatty(fd) < 1) {
- if (IN_SET(errno, EINVAL, ENOTTY))
- return 0; /* not a tty */
-
- return -errno;
- }
+ if (!isatty_safe(fd))
+ return 0;
/* This might fail. What matters are the results. */
r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
@@ -1126,7 +1086,8 @@ static int setup_pam(
gid_t gid,
const char *tty,
char ***env, /* updated on success */
- const int fds[], size_t n_fds) {
+ const int fds[], size_t n_fds,
+ int exec_fd) {
#if HAVE_PAM
@@ -1141,7 +1102,7 @@ static int setup_pam(
sigset_t old_ss;
int pam_code = PAM_SUCCESS, r;
bool close_session = false;
- pid_t pam_pid = 0, parent_pid;
+ pid_t parent_pid;
int flags = 0;
assert(name);
@@ -1196,7 +1157,7 @@ static int setup_pam(
pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
if (pam_code != PAM_SUCCESS)
- log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
+ log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
pam_code = pam_open_session(handle, flags);
if (pam_code != PAM_SUCCESS)
@@ -1212,15 +1173,15 @@ static int setup_pam(
/* Block SIGTERM, so that we know that it won't get lost in the child */
- assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
+ assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
parent_pid = getpid_cached();
- r = safe_fork("(sd-pam)", 0, &pam_pid);
+ r = safe_fork("(sd-pam)", 0, NULL);
if (r < 0)
goto fail;
if (r == 0) {
- int sig, ret = EXIT_PAM;
+ int ret = EXIT_PAM;
/* The child's job is to reset the PAM session on termination */
barrier_set_role(&barrier, BARRIER_CHILD);
@@ -1229,17 +1190,18 @@ static int setup_pam(
* those fds are open here that have been opened by PAM. */
(void) close_many(fds, n_fds);
+ /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
+ * by the execve() to wait for completion, and if we'd keep the fd open here in the child
+ * we'd never signal completion. */
+ exec_fd = safe_close(exec_fd);
+
/* Drop privileges - we don't need any to pam_close_session and this will make
* PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
* threads to fail to exit normally */
- r = maybe_setgroups(0, NULL);
+ r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
if (r < 0)
- log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
- if (setresgid(gid, gid, gid) < 0)
- log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
- if (setresuid(uid, uid, uid) < 0)
- log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
+ log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
(void) ignore_signals(SIGPIPE);
@@ -1258,21 +1220,13 @@ static int setup_pam(
/* Check if our parent process might already have died? */
if (getppid() == parent_pid) {
sigset_t ss;
+ int sig;
assert_se(sigemptyset(&ss) >= 0);
assert_se(sigaddset(&ss, SIGTERM) >= 0);
- for (;;) {
- if (sigwait(&ss, &sig) < 0) {
- if (errno == EINTR)
- continue;
-
- goto child_finish;
- }
-
- assert(sig == SIGTERM);
- break;
- }
+ assert_se(sigwait(&ss, &sig) == 0);
+ assert(sig == SIGTERM);
}
/* If our parent died we'll end the session */
@@ -1361,7 +1315,7 @@ static void rename_process_from_path(const char *path) {
process_name[1+l] = ')';
process_name[1+l+1] = 0;
- rename_process(process_name);
+ (void) rename_process(process_name);
}
static bool context_has_address_families(const ExecContext *c) {
@@ -1725,7 +1679,7 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters
if (!exec_context_restrict_filesystems_set(c))
return 0;
- if (p->bpf_outer_map_fd < 0) {
+ if (p->bpf_restrict_fs_map_fd < 0) {
/* LSM BPF is unsupported or lsm_bpf_setup failed */
log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
return 0;
@@ -1736,7 +1690,7 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters
if (r < 0)
return r;
- return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
+ return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
}
#endif
@@ -1817,10 +1771,10 @@ static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
* the service payload in. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
- [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
- [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
- [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
+ [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
+ [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
+ [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
+ [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
[EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};
@@ -1907,7 +1861,7 @@ static int build_environment(
"Failed to determine user credentials for root: %m");
}
- bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
+ bool set_user_login_env = exec_context_get_set_login_environment(c);
if (username) {
x = strjoin("USER=", username);
@@ -1961,7 +1915,7 @@ static int build_environment(
* to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
* container manager passes to PID 1 ends up all the way in the console login shown. */
- if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
+ if (path_equal(tty_path, "/dev/console") && getppid() == 1)
term = getenv("TERM");
else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
_cleanup_free_ char *key = NULL;
@@ -2315,10 +2269,10 @@ static int setup_exec_directory(
int *exit_status) {
static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
- [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
- [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
- [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
+ [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
+ [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
+ [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
+ [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
[EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
};
int r;
@@ -2338,10 +2292,10 @@ static int setup_exec_directory(
gid = 0;
}
- for (size_t i = 0; i < context->directories[type].n_items; i++) {
+ FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
_cleanup_free_ char *p = NULL, *pp = NULL;
- p = path_join(params->prefix[type], context->directories[type].items[i].path);
+ p = path_join(params->prefix[type], i->path);
if (!p) {
r = -ENOMEM;
goto fail;
@@ -2357,7 +2311,7 @@ static int setup_exec_directory(
* doesn't exist, then we likely are upgrading from an older systemd version that
* didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
* directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
- * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
+ * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
* separated. If a service has both dirs configured but only the configuration dir
* exists and the state dir does not, we assume we are looking at an update
* situation. Hence, create a compatibility symlink, so that all expectations are
@@ -2378,9 +2332,9 @@ static int setup_exec_directory(
* under the configuration hierarchy. */
if (type == EXEC_DIRECTORY_STATE)
- q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
+ q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
else if (type == EXEC_DIRECTORY_LOGS)
- q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
+ q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
else
assert_not_reached();
if (!q) {
@@ -2443,7 +2397,7 @@ static int setup_exec_directory(
if (r < 0)
goto fail;
- if (!path_extend(&pp, context->directories[type].items[i].path)) {
+ if (!path_extend(&pp, i->path)) {
r = -ENOMEM;
goto fail;
}
@@ -2477,7 +2431,7 @@ static int setup_exec_directory(
goto fail;
}
- if (!context->directories[type].items[i].only_create) {
+ if (!i->only_create) {
/* And link it up from the original place.
* Notes
* 1) If a mount namespace is going to be used, then this symlink remains on
@@ -2514,7 +2468,7 @@ static int setup_exec_directory(
if (r < 0)
goto fail;
- q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
+ q = path_join(params->prefix[type], "private", i->path);
if (!q) {
r = -ENOMEM;
goto fail;
@@ -2568,7 +2522,7 @@ static int setup_exec_directory(
params,
"%s \'%s\' already exists but the mode is different. "
"(File system: %o %sMode: %o)",
- exec_directory_type_to_string(type), context->directories[type].items[i].path,
+ exec_directory_type_to_string(type), i->path,
st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
continue;
@@ -2599,10 +2553,8 @@ static int setup_exec_directory(
/* If we are not going to run in a namespace, set up the symlinks - otherwise
* they are set up later, to allow configuring empty var/run/etc. */
if (!needs_mount_namespace)
- for (size_t i = 0; i < context->directories[type].n_items; i++) {
- r = create_many_symlinks(params->prefix[type],
- context->directories[type].items[i].path,
- context->directories[type].items[i].symlinks);
+ FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
+ r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
if (r < 0)
goto fail;
}
@@ -2669,8 +2621,8 @@ static int compile_bind_mounts(
if (!params->prefix[t])
continue;
- for (size_t i = 0; i < context->directories[t].n_items; i++)
- n += !context->directories[t].items[i].only_create;
+ FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
+ n += !i->only_create;
}
if (n <= 0) {
@@ -2684,8 +2636,7 @@ static int compile_bind_mounts(
if (!bind_mounts)
return -ENOMEM;
- for (size_t i = 0; i < context->n_bind_mounts; i++) {
- BindMount *item = context->bind_mounts + i;
+ FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
_cleanup_free_ char *s = NULL, *d = NULL;
s = strdup(item->source);
@@ -2729,18 +2680,18 @@ static int compile_bind_mounts(
return r;
}
- for (size_t i = 0; i < context->directories[t].n_items; i++) {
+ FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
_cleanup_free_ char *s = NULL, *d = NULL;
/* When one of the parent directories is in the list, we cannot create the symlink
* for the child directory. See also the comments in setup_exec_directory(). */
- if (context->directories[t].items[i].only_create)
+ if (i->only_create)
continue;
if (exec_directory_is_private(context, t))
- s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
+ s = path_join(params->prefix[t], "private", i->path);
else
- s = path_join(params->prefix[t], context->directories[t].items[i].path);
+ s = path_join(params->prefix[t], i->path);
if (!s)
return -ENOMEM;
@@ -2749,7 +2700,7 @@ static int compile_bind_mounts(
/* When RootDirectory= or RootImage= are set, then the symbolic link to the private
* directory is not created on the root directory. So, let's bind-mount the directory
* on the 'non-private' place. */
- d = path_join(params->prefix[t], context->directories[t].items[i].path);
+ d = path_join(params->prefix[t], i->path);
else
d = strdup(s);
if (!d)
@@ -2758,10 +2709,8 @@ static int compile_bind_mounts(
bind_mounts[h++] = (BindMount) {
.source = TAKE_PTR(s),
.destination = TAKE_PTR(d),
- .read_only = false,
.nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
.recursive = true,
- .ignore_enoent = false,
};
}
}
@@ -2791,14 +2740,14 @@ static int compile_symlinks(
assert(params);
assert(ret_symlinks);
- for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
- for (size_t i = 0; i < context->directories[dt].n_items; i++) {
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+ FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
_cleanup_free_ char *private_path = NULL, *path = NULL;
- STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
+ STRV_FOREACH(symlink, i->symlinks) {
_cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
- src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+ src_abs = path_join(params->prefix[dt], i->path);
dst_abs = path_join(params->prefix[dt], *symlink);
if (!src_abs || !dst_abs)
return -ENOMEM;
@@ -2810,14 +2759,14 @@ static int compile_symlinks(
if (!exec_directory_is_private(context, dt) ||
exec_context_with_rootfs(context) ||
- context->directories[dt].items[i].only_create)
+ i->only_create)
continue;
- private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
+ private_path = path_join(params->prefix[dt], "private", i->path);
if (!private_path)
return -ENOMEM;
- path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+ path = path_join(params->prefix[dt], i->path);
if (!path)
return -ENOMEM;
@@ -2825,18 +2774,16 @@ static int compile_symlinks(
if (r < 0)
return r;
}
- }
/* We make the host's os-release available via a symlink, so that we can copy it atomically
* and readers will never get a half-written version. Note that, while the paths specified here are
* absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
* 'os-release -> .os-release-stage/os-release' is what will be created. */
if (setup_os_release_symlink) {
- r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
- if (r < 0)
- return r;
-
- r = strv_extend(&symlinks, "/run/host/os-release");
+ r = strv_extend_many(
+ &symlinks,
+ "/run/host/.os-release-stage/os-release",
+ "/run/host/os-release");
if (r < 0)
return r;
}
@@ -2877,8 +2824,8 @@ static bool insist_on_sandboxing(
/* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
* essential. */
- for (size_t i = 0; i < n_bind_mounts; i++)
- if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
+ FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
+ if (!path_equal(i->source, i->destination))
return true;
if (context->log_namespace)
@@ -2887,13 +2834,33 @@ static bool insist_on_sandboxing(
return false;
}
-static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+static int setup_ephemeral(
+ const ExecContext *context,
+ ExecRuntime *runtime,
+ char **root_image, /* both input and output! modified if ephemeral logic enabled */
+ char **root_directory) { /* ditto */
+
_cleanup_close_ int fd = -EBADF;
+ _cleanup_free_ char *new_root = NULL;
int r;
+ assert(context);
+ assert(root_image);
+ assert(root_directory);
+
+ if (!*root_image && !*root_directory)
+ return 0;
+
if (!runtime || !runtime->ephemeral_copy)
return 0;
+ assert(runtime->ephemeral_storage_socket[0] >= 0);
+ assert(runtime->ephemeral_storage_socket[1] >= 0);
+
+ new_root = strdup(runtime->ephemeral_copy);
+ if (!new_root)
+ return log_oom_debug();
+
r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
if (r < 0)
return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
@@ -2904,28 +2871,23 @@ static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
if (fd >= 0)
/* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
return 0;
-
if (fd != -EAGAIN)
return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
- log_debug("Making ephemeral snapshot of %s to %s",
- context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+ if (*root_image) {
+ log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
- if (context->root_image)
- fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
- COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
- else
- fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
- AT_FDCWD, runtime->ephemeral_copy,
- BTRFS_SNAPSHOT_FALLBACK_COPY |
- BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
- BTRFS_SNAPSHOT_RECURSIVE |
- BTRFS_SNAPSHOT_LOCK_BSD);
- if (fd < 0)
- return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
- context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+ fd = copy_file(*root_image,
+ new_root,
+ O_EXCL,
+ 0600,
+ COPY_LOCK_BSD|
+ COPY_REFLINK|
+ COPY_CRTIME);
+ if (fd < 0)
+ return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
+ *root_image, new_root);
- if (context->root_image) {
/* A root image might be subject to lots of random writes so let's try to disable COW on it
* which tends to not perform well in combination with lots of random writes.
*
@@ -2934,13 +2896,35 @@ static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
*/
r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
if (r < 0)
- log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
+ log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
+ } else {
+ assert(*root_directory);
+
+ log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
+
+ fd = btrfs_subvol_snapshot_at(
+ AT_FDCWD, *root_directory,
+ AT_FDCWD, new_root,
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_LOCK_BSD);
+ if (fd < 0)
+ return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
+ *root_directory, new_root);
}
r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
if (r < 0)
return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+ if (*root_image)
+ free_and_replace(*root_image, new_root);
+ else {
+ assert(*root_directory);
+ free_and_replace(*root_directory, new_root);
+ }
+
return 1;
}
@@ -3000,22 +2984,80 @@ static int verity_settings_prepare(
return 0;
}
+static int pick_versions(
+ const ExecContext *context,
+ const ExecParameters *params,
+ char **ret_root_image,
+ char **ret_root_directory) {
+ /* Runs path_pick() on context->root_image or, if that is unset, on context->root_directory, and
+ * hands the picked path back through the matching ret_* pointer (the other one is set to NULL).
+ * The string is moved out of the PickResult with TAKE_PTR(), so the caller becomes responsible
+ * for it. If neither field is set, both outputs are NULL and 0 is returned. Otherwise the return
+ * value is whatever path_pick() returned; a negative value means failure, and a result without a
+ * path is reported as ENOENT. On any error the outputs are left untouched. */
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ret_root_image);
+ assert(ret_root_directory);
+ /* The image is checked first; if it is set, the directory branch below is never reached. */
+ if (context->root_image) {
+ _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+
+ r = path_pick(/* toplevel_path= */ NULL,
+ /* toplevel_fd= */ AT_FDCWD,
+ context->root_image,
+ &pick_filter_image_raw,
+ PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
+ &result);
+ if (r < 0)
+ return r;
+ /* path_pick() did not fail, yet left result.path unset: report that as ENOENT. */
+ if (!result.path)
+ return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
+
+ *ret_root_image = TAKE_PTR(result.path);
+ *ret_root_directory = NULL;
+ return r;
+ }
+ /* No image configured, so try the directory, this time with the directory filter. */
+ if (context->root_directory) {
+ _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+
+ r = path_pick(/* toplevel_path= */ NULL,
+ /* toplevel_fd= */ AT_FDCWD,
+ context->root_directory,
+ &pick_filter_image_dir,
+ PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
+ &result);
+ if (r < 0)
+ return r;
+
+ if (!result.path)
+ return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
+
+ *ret_root_image = NULL;
+ *ret_root_directory = TAKE_PTR(result.path);
+ return r;
+ }
+ /* Neither an image nor a directory is configured, so there is nothing to pick. */
+ *ret_root_image = *ret_root_directory = NULL;
+ return 0;
+}
+
static int apply_mount_namespace(
ExecCommandFlags command_flags,
const ExecContext *context,
const ExecParameters *params,
ExecRuntime *runtime,
const char *memory_pressure_path,
+ bool needs_sandboxing,
char **error_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
- *extension_dir = NULL, *host_os_release_stage = NULL;
- const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
+ *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
+ const char *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
- bool needs_sandboxing, setup_os_release_symlink;
+ bool setup_os_release_symlink;
BindMount *bind_mounts = NULL;
size_t n_bind_mounts = 0;
int r;
@@ -3025,14 +3067,21 @@ static int apply_mount_namespace(
CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
if (params->flags & EXEC_APPLY_CHROOT) {
- r = setup_ephemeral(context, runtime);
+ r = pick_versions(
+ context,
+ params,
+ &root_image,
+ &root_dir);
if (r < 0)
return r;
- if (context->root_image)
- root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
- else
- root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
+ r = setup_ephemeral(
+ context,
+ runtime,
+ &root_image,
+ &root_dir);
+ if (r < 0)
+ return r;
}
r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
@@ -3054,7 +3103,6 @@ static int apply_mount_namespace(
} else
read_write_paths = context->read_write_paths;
- needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
if (needs_sandboxing) {
/* The runtime struct only contains the parent of the private /tmp, which is non-accessible
* to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
@@ -3084,11 +3132,9 @@ static int apply_mount_namespace(
params,
"shared mount propagation hidden by other fs namespacing unit settings: ignoring");
- if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
- r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
- if (r < 0)
- return r;
- }
+ r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
+ if (r < 0)
+ return r;
if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
@@ -3246,31 +3292,39 @@ static int apply_working_directory(
const char *home,
int *exit_status) {
- const char *d, *wd;
+ const char *wd;
+ int r;
assert(context);
assert(exit_status);
if (context->working_directory_home) {
-
if (!home) {
*exit_status = EXIT_CHDIR;
return -ENXIO;
}
wd = home;
-
} else
wd = empty_to_root(context->working_directory);
if (params->flags & EXEC_APPLY_CHROOT)
- d = wd;
- else
- d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
+ r = RET_NERRNO(chdir(wd));
+ else {
+ _cleanup_close_ int dfd = -EBADF;
+
+ r = chase(wd,
+ (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
+ CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
+ /* ret_path= */ NULL,
+ &dfd);
+ if (r >= 0)
+ r = RET_NERRNO(fchdir(dfd));
+ }
- if (chdir(d) < 0 && !context->working_directory_missing_ok) {
+ if (r < 0 && !context->working_directory_missing_ok) {
*exit_status = EXIT_CHDIR;
- return -errno;
+ return r;
}
return 0;
@@ -3459,7 +3513,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
- int dont_close[n_fds + 15];
+ int dont_close[n_fds + 16];
assert(params);
@@ -3495,6 +3549,9 @@ static int close_remaining_fds(
if (params->user_lookup_fd >= 0)
dont_close[n_dont_close++] = params->user_lookup_fd;
+ if (params->handoff_timestamp_fd >= 0)
+ dont_close[n_dont_close++] = params->handoff_timestamp_fd;
+
assert(n_dont_close <= ELEMENTSOF(dont_close));
return close_all_fds(dont_close, n_dont_close);
@@ -3528,26 +3585,29 @@ static int send_user_lookup(
return 0;
}
-static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
+static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) {
int r;
assert(c);
assert(home);
- assert(buf);
+ assert(ret_buf);
/* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
- if (*home)
+ if (*home) /* Already acquired from get_fixed_user()? */
return 0;
if (!c->working_directory_home)
return 0;
- r = get_home_dir(buf);
+ if (c->dynamic_user)
+ return -EADDRNOTAVAIL;
+
+ r = get_home_dir(ret_buf);
if (r < 0)
return r;
- *home = *buf;
+ *home = *ret_buf;
return 1;
}
@@ -3641,11 +3701,12 @@ static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
}
static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
+ static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
+
union sockaddr_union addr = {
.un.sun_family = AF_UNIX,
};
socklen_t sa_len;
- static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
int r;
assert(c);
@@ -3655,43 +3716,35 @@ static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, co
r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
if (r < 0)
- return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
-
+ return log_exec_error_errno(c, p, r, "Failed to set sockaddr for '%s': %m", of->path);
sa_len = r;
- for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
+ FOREACH_ELEMENT(i, socket_types) {
_cleanup_close_ int fd = -EBADF;
- fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
+ fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
if (fd < 0)
- return log_exec_error_errno(c,
- p,
- errno,
- "Failed to create socket for %s: %m",
+ return log_exec_error_errno(c, p,
+ errno, "Failed to create socket for '%s': %m",
of->path);
r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
- if (r == -EPROTOTYPE)
- continue;
- if (r < 0)
- return log_exec_error_errno(c,
- p,
- r,
- "Failed to connect socket for %s: %m",
+ if (r >= 0)
+ return TAKE_FD(fd);
+ if (r != -EPROTOTYPE)
+ return log_exec_error_errno(c, p,
+ r, "Failed to connect to socket for '%s': %m",
of->path);
-
- return TAKE_FD(fd);
}
- return log_exec_error_errno(c,
- p,
- SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
+ return log_exec_error_errno(c, p,
+ SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.",
of->path);
}
static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
- struct stat st;
_cleanup_close_ int fd = -EBADF, ofd = -EBADF;
+ struct stat st;
assert(c);
assert(p);
@@ -3699,10 +3752,10 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const
ofd = open(of->path, O_PATH | O_CLOEXEC);
if (ofd < 0)
- return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
+ return log_exec_error_errno(c, p, errno, "Failed to open '%s' as O_PATH: %m", of->path);
if (fstat(ofd, &st) < 0)
- return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
+ return log_exec_error_errno(c, p, errno, "Failed to stat '%s': %m", of->path);
if (S_ISSOCK(st.st_mode)) {
fd = connect_unix_harder(c, p, of, ofd);
@@ -3710,10 +3763,11 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const
return fd;
if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
- return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
+ return log_exec_error_errno(c, p,
+ errno, "Failed to shutdown send for socket '%s': %m",
of->path);
- log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
+ log_exec_debug(c, p, "Opened socket '%s' as fd %d.", of->path, fd);
} else {
int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
if (FLAGS_SET(of->flags, OPENFILE_APPEND))
@@ -3723,9 +3777,9 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const
fd = fd_reopen(ofd, flags | O_CLOEXEC);
if (fd < 0)
- return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
+ return log_exec_error_errno(c, p, fd, "Failed to reopen file '%s': %m", of->path);
- log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
+ log_exec_debug(c, p, "Opened file '%s' as fd %d.", of->path, fd);
}
return TAKE_FD(fd);
@@ -3744,7 +3798,9 @@ static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t
fd = get_open_file_fd(c, p, of);
if (fd < 0) {
if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
- log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
+ log_exec_warning_errno(c, p, fd,
+ "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
+ of->path);
continue;
}
@@ -3758,9 +3814,7 @@ static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t
if (r < 0)
return r;
- p->fds[*n_fds] = TAKE_FD(fd);
-
- (*n_fds)++;
+ p->fds[(*n_fds)++] = TAKE_FD(fd);
}
return 0;
@@ -3810,7 +3864,7 @@ static bool exec_context_need_unprivileged_private_users(
context->private_ipc ||
context->ipc_namespace_path ||
context->private_mounts > 0 ||
- context->mount_apivfs ||
+ context->mount_apivfs > 0 ||
context->n_bind_mounts > 0 ||
context->n_temporary_filesystems > 0 ||
context->root_directory ||
@@ -3920,6 +3974,52 @@ static void exec_params_close(ExecParameters *p) {
p->stderr_fd = safe_close(p->stderr_fd);
}
+static int exec_fd_mark_hot(
+ const ExecContext *c,
+ ExecParameters *p,
+ bool hot,
+ int *reterr_exit_status) {
+ /* Writes one byte to p->exec_fd: 1 when hot is true, 0 when it is false. Returns 0 without
+ * writing anything if p->exec_fd is negative, 1 once the byte has been written, and the result
+ * of log_exec_error_errno() if write() fails. In the failure case *reterr_exit_status is set to
+ * EXIT_EXEC, unless the caller passed NULL for it. */
+ assert(c);
+ assert(p);
+
+ if (p->exec_fd < 0)
+ return 0;
+ /* A bool converts to exactly 0 or 1, so x is the byte value that goes over the fd. */
+ uint8_t x = hot;
+
+ if (write(p->exec_fd, &x, sizeof(x)) < 0) {
+ if (reterr_exit_status)
+ *reterr_exit_status = EXIT_EXEC;
+ return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
+ }
+
+ return 1;
+}
+
+static int send_handoff_timestamp(
+ const ExecContext *c,
+ ExecParameters *p,
+ int *reterr_exit_status) {
+ /* Takes the current time with dual_timestamp_now() and sends it over p->handoff_timestamp_fd.
+ * Returns 0 without sending anything if that fd is negative, 1 once send() has succeeded, and
+ * the result of log_exec_error_errno() if send() fails. In the failure case
+ * *reterr_exit_status is set to EXIT_EXEC, unless the caller passed NULL for it. */
+ assert(c);
+ assert(p);
+
+ if (p->handoff_timestamp_fd < 0)
+ return 0;
+
+ dual_timestamp dt;
+ dual_timestamp_now(&dt);
+ /* The payload is a pair of usec_t values: the realtime stamp first, the monotonic one second. */
+ if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) {
+ if (reterr_exit_status)
+ *reterr_exit_status = EXIT_EXEC;
+ return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m");
+ }
+
+ return 1;
+}
+
int exec_invoke(
const ExecCommand *command,
const ExecContext *context,
@@ -3974,6 +4074,8 @@ int exec_invoke(
assert(params);
assert(exit_status);
+ /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
+ * and is already applied earlier. Just for safety. */
if (context->log_level_max >= 0)
log_set_max_level(context->log_level_max);
@@ -4049,7 +4151,7 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
}
- int keep_fds[n_fds + 3];
+ int keep_fds[n_fds + 4];
memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
n_keep_fds = n_fds;
@@ -4059,8 +4161,14 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
}
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+ }
+
#if HAVE_LIBBPF
- r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_outer_map_fd);
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
@@ -4099,7 +4207,7 @@ int exec_invoke(
*exit_status = EXIT_CONFIRM;
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
- "Execution cancelled by the user");
+ "Execution cancelled by the user.");
}
}
@@ -4141,12 +4249,12 @@ int exec_invoke(
if (!uid_is_valid(uid)) {
*exit_status = EXIT_USER;
- return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
}
if (!gid_is_valid(gid)) {
*exit_status = EXIT_USER;
- return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
}
if (runtime->dynamic_creds->user)
@@ -4186,7 +4294,7 @@ int exec_invoke(
params->user_lookup_fd = safe_close(params->user_lookup_fd);
- r = acquire_home(context, uid, &home, &home_buffer);
+ r = acquire_home(context, &home, &home_buffer);
if (r < 0) {
*exit_status = EXIT_CHDIR;
return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
@@ -4210,9 +4318,10 @@ int exec_invoke(
r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
if (r == -EUCLEAN) {
*exit_status = EXIT_CGROUP;
- return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
+ return log_exec_error_errno(context, params, r,
+ "Failed to attach process to cgroup '%s', "
"because the cgroup or one of its parents or "
- "siblings is in the threaded mode: %m", p);
+ "siblings is in the threaded mode.", p);
}
if (r < 0) {
*exit_status = EXIT_CGROUP;
@@ -4242,13 +4351,20 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
}
- r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ _cleanup_free_ char *fname = NULL;
+ r = path_extract_filename(command->path, &fname);
+ if (r < 0) {
+ *exit_status = EXIT_STDOUT;
+ return log_exec_error_errno(context, params, r, "Failed to extract filename from path %s: %m", command->path);
+ }
+
+ r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
if (r < 0) {
*exit_status = EXIT_STDOUT;
return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
}
- r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
if (r < 0) {
*exit_status = EXIT_STDERR;
return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
@@ -4445,12 +4561,10 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
}
- if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
- r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
- if (r < 0) {
- *exit_status = EXIT_CREDENTIALS;
- return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
- }
+ r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CREDENTIALS;
+ return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
}
r = build_environment(
@@ -4567,7 +4681,7 @@ int exec_invoke(
* wins here. (See above.) */
/* All fds passed in the fds array will be closed in the pam child process. */
- r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
+ r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
if (r < 0) {
*exit_status = EXIT_PAM;
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
@@ -4639,7 +4753,7 @@ int exec_invoke(
if (ns_type_supported(NAMESPACE_IPC)) {
r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
- if (r == -EPERM)
+ if (ERRNO_IS_NEG_PRIVILEGE(r))
log_exec_warning_errno(context, params, r,
"PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
else if (r < 0) {
@@ -4657,7 +4771,13 @@ int exec_invoke(
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
- r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
+ r = apply_mount_namespace(command->flags,
+ context,
+ params,
+ runtime,
+ memory_pressure_path,
+ needs_sandboxing,
+ &error_path);
if (r < 0) {
*exit_status = EXIT_NAMESPACE;
return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
@@ -4672,7 +4792,7 @@ int exec_invoke(
}
if (context->memory_ksm >= 0)
- if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
+ if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
if (ERRNO_IS_NOT_SUPPORTED(errno))
log_exec_debug_errno(context,
params,
@@ -4731,26 +4851,16 @@ int exec_invoke(
_cleanup_close_ int executable_fd = -EBADF;
r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
if (r < 0) {
- if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
- log_exec_struct_errno(context, params, LOG_INFO, r,
- "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
- LOG_EXEC_INVOCATION_ID(params),
- LOG_EXEC_MESSAGE(params,
- "Executable %s missing, skipping: %m",
- command->path),
- "EXECUTABLE=%s", command->path);
- *exit_status = EXIT_SUCCESS;
- return 0;
- }
-
*exit_status = EXIT_EXEC;
- return log_exec_struct_errno(context, params, LOG_INFO, r,
- "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
- LOG_EXEC_INVOCATION_ID(params),
- LOG_EXEC_MESSAGE(params,
- "Failed to locate executable %s: %m",
- command->path),
- "EXECUTABLE=%s", command->path);
+ log_exec_struct_errno(context, params, LOG_NOTICE, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_EXEC_MESSAGE(params,
+ "Unable to locate executable '%s': %m",
+ command->path),
+ "EXECUTABLE=%s", command->path);
+ /* If the error will be ignored by manager, tune down the log level here. Missing executable
+ * is very much expected in this case. */
+ return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
}
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
@@ -4791,15 +4901,16 @@ int exec_invoke(
/* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
* we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
- * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
- * execve(). But first, close the remaining sockets in the context objects. */
+ * more. We do keep exec_fd and handoff_timestamp_fd however, if we have them, since we need to keep
+ * them open until the final execve(). But first, close the remaining sockets in the context
+ * objects. */
exec_runtime_close(runtime);
exec_params_close(params);
r = close_all_fds(keep_fds, n_keep_fds);
if (r >= 0)
- r = shift_fds(params->fds, n_fds);
+ r = pack_fds(params->fds, n_fds);
if (r >= 0)
r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
if (r < 0) {
@@ -4945,8 +5056,10 @@ int exec_invoke(
}
}
- /* Apply working directory here, because the working directory might be on NFS and only the user running
- * this service might have the correct privilege to change to the working directory */
+ /* Apply working directory here, because the working directory might be on NFS and only the user
+ * running this service might have the correct privilege to change to the working directory. Also, it
+ * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
+ * the cwd cannot be used to pin directories outside of the sandbox. */
r = apply_working_directory(context, params, runtime, home, exit_status);
if (r < 0)
return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
@@ -5206,31 +5319,29 @@ int exec_invoke(
log_command_line(context, params, "Executing", executable, final_argv);
- if (params->exec_fd >= 0) {
- uint8_t hot = 1;
+ /* We have finished with all our initializations. Let's now let the manager know that. From this
+ * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
- /* We have finished with all our initializations. Let's now let the manager know that. From this point
- * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+ r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
+ if (r < 0)
+ return r;
- if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
- *exit_status = EXIT_EXEC;
- return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
- }
+ /* As the last thing before the execve(), let's send the handoff timestamp */
+ r = send_handoff_timestamp(context, params, exit_status);
+ if (r < 0) {
+ /* If this handoff timestamp failed, let's undo the marking as hot */
+ (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
+ return r;
}
- r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
-
- if (params->exec_fd >= 0) {
- uint8_t hot = 0;
+ /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
+ * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
+ * exec_fd this is pretty much the whole raison d'etre. */
- /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
- * that POLLHUP on it no longer means execve() succeeded. */
+ r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
- if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
- *exit_status = EXIT_EXEC;
- return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
- }
- }
+ /* The execve() failed, let's undo the marking as hot */
+ (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c
index b1e716e..ecd1e70 100644
--- a/src/core/execute-serialize.c
+++ b/src/core/execute-serialize.c
@@ -230,6 +230,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
return r;
}
+ r = serialize_bool(f, "exec-cgroup-context-memory-zswap-writeback", c->memory_zswap_writeback);
+ if (r < 0)
+ return r;
+
if (c->memory_limit != CGROUP_LIMIT_MAX) {
r = serialize_item_format(f, "exec-cgroup-context-memory-limit", "%" PRIu64, c->memory_limit);
if (r < 0)
@@ -373,8 +377,7 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
if (il->limits[type] == cgroup_io_limit_defaults[type])
continue;
- key = strjoin("exec-cgroup-context-io-device-limit-",
- cgroup_io_limit_type_to_string(type));
+ key = strjoin("exec-cgroup-context-io-device-limit-", cgroup_io_limit_type_to_string(type));
if (!key)
return -ENOMEM;
@@ -678,6 +681,11 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
r = safe_atou64(val, &c->startup_memory_zswap_max);
if (r < 0)
return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-zswap-writeback="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_zswap_writeback = r;
} else if ((val = startswith(l, "exec-cgroup-context-memory-limit="))) {
r = safe_atou64(val, &c->memory_limit);
if (r < 0)
@@ -789,7 +797,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *rwm = NULL;
CGroupDevicePermissions p;
- r = extract_many_words(&val, " ", 0, &path, &rwm, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &rwm);
if (r < 0)
return r;
if (r == 0)
@@ -806,7 +814,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *weight = NULL;
CGroupIODeviceWeight *a = NULL;
- r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &weight);
if (r < 0)
return r;
if (r != 2)
@@ -835,7 +843,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *target = NULL;
CGroupIODeviceLatency *a = NULL;
- r = extract_many_words(&val, " ", 0, &path, &target, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &target);
if (r < 0)
return r;
if (r != 2)
@@ -865,7 +873,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
CGroupIODeviceLimit *limit = NULL;
CGroupIOLimitType t;
- r = extract_many_words(&val, "= ", 0, &type, &path, &limits, NULL);
+ r = extract_many_words(&val, "= ", 0, &type, &path, &limits);
if (r < 0)
return r;
if (r != 3)
@@ -900,7 +908,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *weight = NULL;
CGroupBlockIODeviceWeight *a = NULL;
- r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &weight);
if (r < 0)
return r;
if (r != 2)
@@ -921,7 +929,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *bw = NULL;
CGroupBlockIODeviceBandwidth *a = NULL;
- r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &bw);
if (r < 0)
return r;
if (r != 2)
@@ -951,7 +959,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *path = NULL, *bw = NULL;
CGroupBlockIODeviceBandwidth *a = NULL;
- r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+ r = extract_many_words(&val, " ", 0, &path, &bw);
if (r < 0)
return r;
if (r != 2)
@@ -1019,7 +1027,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
_cleanup_free_ char *type = NULL, *path = NULL;
uint32_t t;
- r = extract_many_words(&val, " ", 0, &type, &path, NULL);
+ r = extract_many_words(&val, " ", 0, &type, &path);
if (r < 0)
return r;
if (r != 2)
@@ -1365,8 +1373,12 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
if (r < 0)
return r;
+ r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd);
+ if (r < 0)
+ return r;
+
if (c && exec_context_restrict_filesystems_set(c)) {
- r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_outer_map_fd);
+ r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
if (r < 0)
return r;
}
@@ -1479,8 +1491,8 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
return log_oom_debug();
/* Ensure we don't leave any FD uninitialized on error, it makes the fuzzer sad */
- for (size_t i = 0; i < p->n_socket_fds + p->n_storage_fds; ++i)
- p->fds[i] = -EBADF;
+ FOREACH_ARRAY(i, p->fds, p->n_socket_fds + p->n_storage_fds)
+ *i = -EBADF;
r = deserialize_fd_many(fds, val, p->n_socket_fds + p->n_storage_fds, p->fds);
if (r < 0)
@@ -1522,7 +1534,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
_cleanup_free_ char *type = NULL, *prefix = NULL;
ExecDirectoryType dt;
- r = extract_many_words(&val, "= ", 0, &type, &prefix, NULL);
+ r = extract_many_words(&val, "= ", 0, &type, &prefix);
if (r < 0)
return r;
if (r == 0)
@@ -1585,7 +1597,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->stdin_fd = fd;
+ close_and_replace(p->stdin_fd, fd);
} else if ((val = startswith(l, "exec-parameters-stdout-fd="))) {
int fd;
@@ -1594,7 +1606,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->stdout_fd = fd;
+ close_and_replace(p->stdout_fd, fd);
} else if ((val = startswith(l, "exec-parameters-stderr-fd="))) {
int fd;
@@ -1603,7 +1615,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->stderr_fd = fd;
+ close_and_replace(p->stderr_fd, fd);
} else if ((val = startswith(l, "exec-parameters-exec-fd="))) {
int fd;
@@ -1611,7 +1623,15 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->exec_fd = fd;
+ close_and_replace(p->exec_fd, fd);
+ } else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ close_and_replace(p->handoff_timestamp_fd, fd);
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
int fd;
@@ -1619,13 +1639,13 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->bpf_outer_map_fd = fd;
+ close_and_replace(p->bpf_restrict_fs_map_fd, fd);
} else if ((val = startswith(l, "exec-parameters-notify-socket="))) {
r = free_and_strdup(&p->notify_socket, val);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-parameters-open-file="))) {
- OpenFile *of = NULL;
+ OpenFile *of;
r = open_file_parse(val, &of);
if (r < 0)
@@ -1643,7 +1663,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
if (fd < 0)
continue;
- p->user_lookup_fd = fd;
+ close_and_replace(p->user_lookup_fd, fd);
} else if ((val = startswith(l, "exec-parameters-files-env="))) {
r = deserialize_strv(val, &p->files_env);
if (r < 0)
@@ -1812,6 +1832,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
+ r = serialize_item_tristate(f, "exec-context-mount-api-vfs", c->mount_apivfs);
+ if (r < 0)
+ return r;
+
r = serialize_item_tristate(f, "exec-context-memory-ksm", c->memory_ksm);
if (r < 0)
return r;
@@ -1868,20 +1892,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
- if (c->mount_apivfs_set) {
- r = serialize_bool(f, "exec-context-mount-api-vfs", c->mount_apivfs);
- if (r < 0)
- return r;
- }
-
r = serialize_bool_elide(f, "exec-context-same-pgrp", c->same_pgrp);
if (r < 0)
return r;
- r = serialize_bool_elide(f, "exec-context-cpu-sched-reset-on-fork", c->cpu_sched_reset_on_fork);
- if (r < 0)
- return r;
-
r = serialize_bool(f, "exec-context-ignore-sigpipe", c->ignore_sigpipe);
if (r < 0)
return r;
@@ -2154,6 +2168,8 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
+ /* This is also passed to executor as an argument. So, the information should be redundant in general.
+ * But, let's keep this as is for consistency with other elements of ExecContext. See exec_spawn(). */
r = serialize_item_format(f, "exec-context-log-level-max", "%d", c->log_level_max);
if (r < 0)
return r;
@@ -2538,14 +2554,14 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (base64mem(sc->data, sc->size, &data) < 0)
return log_oom_debug();
- r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, yes_no(sc->encrypted), data);
+ r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, data, yes_no(sc->encrypted));
if (r < 0)
return r;
}
ExecLoadCredential *lc;
HASHMAP_FOREACH(lc, c->load_credentials) {
- r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, yes_no(lc->encrypted), lc->path);
+ r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, lc->path, yes_no(lc->encrypted));
if (r < 0)
return r;
}
@@ -2636,7 +2652,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
break;
p = word;
- r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options);
if (r < 0)
return r;
if (r == 0)
@@ -2669,12 +2685,12 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
return r;
} else if ((val = startswith(l, "exec-context-root-hash="))) {
c->root_hash = mfree(c->root_hash);
- r = unhexmem(val, strlen(val), &c->root_hash, &c->root_hash_size);
+ r = unhexmem(val, &c->root_hash, &c->root_hash_size);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-root-hash-sig="))) {
c->root_hash_sig = mfree(c->root_hash_sig);
- r= unbase64mem(val, strlen(val), &c->root_hash_sig, &c->root_hash_sig_size);
+ r= unbase64mem(val, &c->root_hash_sig, &c->root_hash_sig_size);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-root-ephemeral="))) {
@@ -2695,6 +2711,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
r = safe_atoi(val, &c->private_mounts);
if (r < 0)
return r;
+ } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) {
+ r = safe_atoi(val, &c->mount_apivfs);
+ if (r < 0)
+ return r;
} else if ((val = startswith(l, "exec-context-memory-ksm="))) {
r = safe_atoi(val, &c->memory_ksm);
if (r < 0)
@@ -2762,22 +2782,11 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
c->protect_system = protect_system_from_string(val);
if (c->protect_system < 0)
return -EINVAL;
- } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) {
- r = parse_boolean(val);
- if (r < 0)
- return r;
- c->mount_apivfs = r;
- c->mount_apivfs_set = true;
} else if ((val = startswith(l, "exec-context-same-pgrp="))) {
r = parse_boolean(val);
if (r < 0)
return r;
c->same_pgrp = r;
- } else if ((val = startswith(l, "exec-context-cpu-sched-reset-on-fork="))) {
- r = parse_boolean(val);
- if (r < 0)
- return r;
- c->cpu_sched_reset_on_fork = r;
} else if ((val = startswith(l, "exec-context-non-blocking="))) {
r = parse_boolean(val);
if (r < 0)
@@ -2828,7 +2837,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
_cleanup_free_ char *type = NULL, *mode = NULL;
ExecDirectoryType dt;
- r = extract_many_words(&val, "= ", 0, &type, &mode, NULL);
+ r = extract_many_words(&val, "= ", 0, &type, &mode);
if (r < 0)
return r;
if (r == 0 || !mode)
@@ -2854,7 +2863,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
break;
p = tuple;
- r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create, NULL);
+ r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create);
if (r < 0)
return r;
if (r < 2)
@@ -3054,7 +3063,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
if (c->stdin_data)
return -EINVAL; /* duplicated */
- r = unbase64mem(val, strlen(val), &c->stdin_data, &c->stdin_data_size);
+ r = unbase64mem(val, &c->stdin_data, &c->stdin_data_size);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-tty-path="))) {
@@ -3098,6 +3107,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-log-level-max="))) {
+ /* See comment in serialization. */
r = safe_atoi(val, &c->log_level_max);
if (r < 0)
return r;
@@ -3314,7 +3324,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
} else if ((val = startswith(l, "exec-context-temporary-filesystems="))) {
_cleanup_free_ char *path = NULL, *options = NULL;
- r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options, NULL);
+ r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options);
if (r < 0)
return r;
if (r < 1)
@@ -3392,7 +3402,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
_cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
int id, errno_num;
- r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num, NULL);
+ r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num);
if (r < 0)
return r;
if (r != 2)
@@ -3432,7 +3442,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
_cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
int id, errno_num;
- r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num, NULL);
+ r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num);
if (r < 0)
return r;
if (r != 2)
@@ -3505,8 +3515,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
NULL,
EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
&source,
- &destination,
- NULL);
+ &destination);
if (r < 0)
return r;
if (r == 0)
@@ -3538,8 +3547,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
":",
EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
&partition,
- &opts,
- NULL);
+ &opts);
if (r < 0)
return r;
if (r == 0)
@@ -3619,8 +3627,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
":",
EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
&partition,
- &opts,
- NULL);
+ &opts);
if (r < 0)
return r;
if (r == 0)
@@ -3669,7 +3676,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
_cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
_cleanup_free_ char *id = NULL, *encrypted = NULL, *data = NULL;
- r = extract_many_words(&val, " ", 0, &id, &encrypted, &data, NULL);
+ r = extract_many_words(&val, " ", EXTRACT_DONT_COALESCE_SEPARATORS, &id, &data, &encrypted);
if (r < 0)
return r;
if (r != 3)
@@ -3688,7 +3695,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
.encrypted = r,
};
- r = unbase64mem(data, strlen(data), &sc->data, &sc->size);
+ r = unbase64mem(data, &sc->data, &sc->size);
if (r < 0)
return r;
@@ -3701,7 +3708,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
_cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
_cleanup_free_ char *id = NULL, *encrypted = NULL, *path = NULL;
- r = extract_many_words(&val, " ", 0, &id, &encrypted, &path, NULL);
+ r = extract_many_words(&val, " ", EXTRACT_DONT_COALESCE_SEPARATORS, &id, &path, &encrypted);
if (r < 0)
return r;
if (r != 3)
diff --git a/src/core/execute.c b/src/core/execute.c
index 8dbdfcf..513e95e 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -147,7 +147,7 @@ void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p)
const char *path = exec_context_tty_path(context);
- if (p && p->stdin_fd >= 0 && isatty(p->stdin_fd))
+ if (p && p->stdin_fd >= 0 && isatty_safe(p->stdin_fd))
fd = p->stdin_fd;
else if (path && (context->tty_path || is_terminal_input(context->std_input) ||
is_terminal_output(context->std_output) || is_terminal_output(context->std_error))) {
@@ -162,9 +162,11 @@ void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p)
* that will be closed automatically, and operate on it for convenience. */
lock_fd = lock_dev_console();
if (ERRNO_IS_NEG_PRIVILEGE(lock_fd))
- log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without: %m");
+ log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without lock: %m");
+ else if (ERRNO_IS_NEG_DEVICE_ABSENT(lock_fd))
+ log_debug_errno(lock_fd, "Device /dev/console does not exist, proceeding without lock: %m");
else if (lock_fd < 0)
- return (void) log_debug_errno(lock_fd, "Failed to lock /dev/console: %m");
+ log_warning_errno(lock_fd, "Failed to lock /dev/console, proceeding without lock: %m");
if (context->tty_vhangup)
(void) terminal_vhangup_fd(fd);
@@ -351,19 +353,18 @@ static void log_command_line(Unit *unit, const char *msg, const char *executable
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
-int exec_spawn(Unit *unit,
- ExecCommand *command,
- const ExecContext *context,
- ExecParameters *params,
- ExecRuntime *runtime,
- const CGroupContext *cgroup_context,
- pid_t *ret) {
+int exec_spawn(
+ Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ PidRef *ret) {
- char serialization_fd_number[DECIMAL_STR_MAX(int) + 1];
- _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL;
+ _cleanup_free_ char *subcgroup_path = NULL, *max_log_levels = NULL, *executor_path = NULL;
_cleanup_fdset_free_ FDSet *fdset = NULL;
_cleanup_fclose_ FILE *f = NULL;
- pid_t pid;
int r;
assert(unit);
@@ -371,10 +372,11 @@ int exec_spawn(Unit *unit,
assert(unit->manager->executor_fd >= 0);
assert(command);
assert(context);
- assert(ret);
assert(params);
- assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
+ assert(!params->fds || FLAGS_SET(params->flags, EXEC_PASS_FDS));
+ assert(params->fds || (params->n_socket_fds + params->n_storage_fds == 0));
assert(!params->files_env); /* We fill this field, ensure it comes NULL-initialized to us */
+ assert(ret);
LOG_CONTEXT_PUSH_UNIT(unit);
@@ -404,8 +406,8 @@ int exec_spawn(Unit *unit,
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
* and ensure all memory is shared. The child immediately execs the new binary so the delay should
- * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
- * target cgroup. */
+ * be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd
+ * and to clone directly into the target cgroup (if we booted with cgroupv2). */
r = open_serialization_file("sd-executor-state", &f);
if (r < 0)
@@ -430,39 +432,57 @@ int exec_spawn(Unit *unit,
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m");
- r = log_level_to_string_alloc(log_get_max_level(), &log_level);
+ /* If LogLevelMax= is specified, then let's use the specified log level at the beginning of the
+ * executor process. To achieve that the specified log level is passed as an argument, rather than
+ * the one for the manager process. */
+ r = log_max_levels_to_string(context->log_level_max >= 0 ? context->log_level_max : log_get_max_level(), &max_log_levels);
if (r < 0)
- return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m");
+ return log_unit_error_errno(unit, r, "Failed to convert max log levels to string: %m");
r = fd_get_path(unit->manager->executor_fd, &executor_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m");
+ char serialization_fd_number[DECIMAL_STR_MAX(int)];
xsprintf(serialization_fd_number, "%i", fileno(f));
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ dual_timestamp start_timestamp;
+
+ /* Record the start timestamp before we fork so that it is guaranteed to be earlier than the
+ * handoff timestamp. */
+ dual_timestamp_now(&start_timestamp);
+
/* The executor binary is pinned, to avoid compatibility problems during upgrades. */
r = posix_spawn_wrapper(
FORMAT_PROC_FD_PATH(unit->manager->executor_fd),
STRV_MAKE(executor_path,
"--deserialize", serialization_fd_number,
- "--log-level", log_level,
+ "--log-level", max_log_levels,
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
environ,
- &pid);
+ cg_unified() > 0 ? subcgroup_path : NULL,
+ &pidref);
+ if (r == -EUCLEAN && subcgroup_path)
+ return log_unit_error_errno(unit, r,
+ "Failed to spawn process into cgroup '%s', because the cgroup "
+ "or one of its parents or siblings is in the threaded mode.",
+ subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
-
- log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
-
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
* process will be killed too). */
- if (subcgroup_path)
- (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
+ if (r == 0 && subcgroup_path)
+ (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid);
+ /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
+
+ log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
+ command->path, pidref.pid, r > 0 ? "via" : "without");
- exec_status_start(&command->exec_status, pid);
+ exec_status_start(&command->exec_status, pidref.pid, &start_timestamp);
- *ret = pid;
+ *ret = TAKE_PIDREF(pidref);
return 0;
}
@@ -491,6 +511,7 @@ void exec_context_init(ExecContext *c) {
.tty_rows = UINT_MAX,
.tty_cols = UINT_MAX,
.private_mounts = -1,
+ .mount_apivfs = -1,
.memory_ksm = -1,
.set_login_environment = -1,
};
@@ -664,13 +685,19 @@ void exec_command_done_array(ExecCommand *c, size_t n) {
exec_command_done(i);
}
+ExecCommand* exec_command_free(ExecCommand *c) {
+ if (!c)
+ return NULL;
+
+ exec_command_done(c);
+ return mfree(c);
+}
+
ExecCommand* exec_command_free_list(ExecCommand *c) {
ExecCommand *i;
- while ((i = LIST_POP(command, c))) {
- exec_command_done(i);
- free(i);
- }
+ while ((i = LIST_POP(command, c)))
+ exec_command_free(i);
return NULL;
}
@@ -1396,7 +1423,7 @@ bool exec_context_maintains_privileges(const ExecContext *c) {
if (!c->user)
return true;
- if (streq(c->user, "root") || streq(c->user, "0"))
+ if (STR_IN_SET(c->user, "root", "0"))
return true;
return false;
@@ -1421,8 +1448,8 @@ bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
assert(c);
/* Explicit setting wins */
- if (c->mount_apivfs_set)
- return c->mount_apivfs;
+ if (c->mount_apivfs >= 0)
+ return c->mount_apivfs > 0;
/* Default to "yes" if root directory or image are specified */
if (exec_context_with_rootfs(c))
@@ -1657,6 +1684,15 @@ uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c) {
return (uint64_t) MAX(r, 0);
}
+bool exec_context_get_set_login_environment(const ExecContext *c) {
+ assert(c);
+
+ if (c->set_login_environment >= 0)
+ return c->set_login_environment;
+
+ return c->user || c->dynamic_user || c->pam_name;
+}
+
char** exec_context_get_syscall_filter(const ExecContext *c) {
_cleanup_strv_free_ char **l = NULL;
@@ -1787,14 +1823,17 @@ char** exec_context_get_restrict_filesystems(const ExecContext *c) {
return l ? TAKE_PTR(l) : strv_new(NULL);
}
-void exec_status_start(ExecStatus *s, pid_t pid) {
+void exec_status_start(ExecStatus *s, pid_t pid, const dual_timestamp *ts) {
assert(s);
*s = (ExecStatus) {
.pid = pid,
};
- dual_timestamp_now(&s->start_timestamp);
+ if (ts)
+ s->start_timestamp = *ts;
+ else
+ dual_timestamp_now(&s->start_timestamp);
}
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
@@ -1814,6 +1853,19 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int
(void) utmp_put_dead_process(context->utmp_id, pid, code, status);
}
+void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts) {
+ assert(s);
+ assert(ucred);
+ assert(ts);
+
+ if (ucred->pid != s->pid)
+ *s = (ExecStatus) {
+ .pid = ucred->pid,
+ };
+
+ s->handoff_timestamp = *ts;
+}
+
void exec_status_reset(ExecStatus *s) {
assert(s);
@@ -1836,19 +1888,45 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
if (dual_timestamp_is_set(&s->start_timestamp))
fprintf(f,
"%sStart Timestamp: %s\n",
- prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
+ prefix, FORMAT_TIMESTAMP_STYLE(s->start_timestamp.realtime, TIMESTAMP_US));
+
+ if (dual_timestamp_is_set(&s->handoff_timestamp) && dual_timestamp_is_set(&s->start_timestamp) &&
+ s->handoff_timestamp.monotonic > s->start_timestamp.monotonic)
+ fprintf(f,
+ "%sHandoff Timestamp: %s since start\n",
+ prefix,
+ FORMAT_TIMESPAN(usec_sub_unsigned(s->handoff_timestamp.monotonic, s->start_timestamp.monotonic), 1));
+ else
+ fprintf(f,
+ "%sHandoff Timestamp: %s\n",
+ prefix, FORMAT_TIMESTAMP_STYLE(s->handoff_timestamp.realtime, TIMESTAMP_US));
+
+ if (dual_timestamp_is_set(&s->exit_timestamp)) {
+
+ if (dual_timestamp_is_set(&s->handoff_timestamp) && s->exit_timestamp.monotonic > s->handoff_timestamp.monotonic)
+ fprintf(f,
+ "%sExit Timestamp: %s since handoff\n",
+ prefix,
+ FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->handoff_timestamp.monotonic), 1));
+ else if (dual_timestamp_is_set(&s->start_timestamp) && s->exit_timestamp.monotonic > s->start_timestamp.monotonic)
+ fprintf(f,
+ "%sExit Timestamp: %s since start\n",
+ prefix,
+ FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->start_timestamp.monotonic), 1));
+ else
+ fprintf(f,
+ "%sExit Timestamp: %s\n",
+ prefix, FORMAT_TIMESTAMP_STYLE(s->exit_timestamp.realtime, TIMESTAMP_US));
- if (dual_timestamp_is_set(&s->exit_timestamp))
fprintf(f,
- "%sExit Timestamp: %s\n"
"%sExit Code: %s\n"
"%sExit Status: %i\n",
- prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
prefix, sigchld_code_to_string(s->code),
prefix, s->status);
+ }
}
-static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
+void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
_cleanup_free_ char *cmd = NULL;
const char *prefix2;
@@ -1951,8 +2029,7 @@ static char *destroy_tree(char *path) {
}
void exec_shared_runtime_done(ExecSharedRuntime *rt) {
- if (!rt)
- return;
+ assert(rt);
if (rt->manager)
(void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
@@ -1965,8 +2042,10 @@ void exec_shared_runtime_done(ExecSharedRuntime *rt) {
}
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
- exec_shared_runtime_done(rt);
+ if (!rt)
+ return NULL;
+ exec_shared_runtime_done(rt);
return mfree(rt);
}
@@ -2090,15 +2169,13 @@ static int exec_shared_runtime_make(
return r;
}
- if (exec_needs_network_namespace(c)) {
+ if (exec_needs_network_namespace(c))
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
return -errno;
- }
- if (exec_needs_ipc_namespace(c)) {
+ if (exec_needs_ipc_namespace(c))
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
return -errno;
- }
r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
if (r < 0)
@@ -2488,7 +2565,7 @@ void exec_params_shallow_clear(ExecParameters *p) {
p->fds = mfree(p->fds);
p->exec_fd = safe_close(p->exec_fd);
p->user_lookup_fd = -EBADF;
- p->bpf_outer_map_fd = -EBADF;
+ p->bpf_restrict_fs_map_fd = -EBADF;
p->unit_id = mfree(p->unit_id);
p->invocation_id = SD_ID128_NULL;
p->invocation_id_string[0] = '\0';
@@ -2643,46 +2720,46 @@ ExecCleanMask exec_clean_mask_from_string(const char *s) {
}
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
- [EXEC_INPUT_NULL] = "null",
- [EXEC_INPUT_TTY] = "tty",
+ [EXEC_INPUT_NULL] = "null",
+ [EXEC_INPUT_TTY] = "tty",
[EXEC_INPUT_TTY_FORCE] = "tty-force",
- [EXEC_INPUT_TTY_FAIL] = "tty-fail",
- [EXEC_INPUT_SOCKET] = "socket",
- [EXEC_INPUT_NAMED_FD] = "fd",
- [EXEC_INPUT_DATA] = "data",
- [EXEC_INPUT_FILE] = "file",
+ [EXEC_INPUT_TTY_FAIL] = "tty-fail",
+ [EXEC_INPUT_SOCKET] = "socket",
+ [EXEC_INPUT_NAMED_FD] = "fd",
+ [EXEC_INPUT_DATA] = "data",
+ [EXEC_INPUT_FILE] = "file",
};
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
- [EXEC_OUTPUT_INHERIT] = "inherit",
- [EXEC_OUTPUT_NULL] = "null",
- [EXEC_OUTPUT_TTY] = "tty",
- [EXEC_OUTPUT_KMSG] = "kmsg",
- [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
- [EXEC_OUTPUT_JOURNAL] = "journal",
+ [EXEC_OUTPUT_INHERIT] = "inherit",
+ [EXEC_OUTPUT_NULL] = "null",
+ [EXEC_OUTPUT_TTY] = "tty",
+ [EXEC_OUTPUT_KMSG] = "kmsg",
+ [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
+ [EXEC_OUTPUT_JOURNAL] = "journal",
[EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
- [EXEC_OUTPUT_SOCKET] = "socket",
- [EXEC_OUTPUT_NAMED_FD] = "fd",
- [EXEC_OUTPUT_FILE] = "file",
- [EXEC_OUTPUT_FILE_APPEND] = "append",
- [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
+ [EXEC_OUTPUT_SOCKET] = "socket",
+ [EXEC_OUTPUT_NAMED_FD] = "fd",
+ [EXEC_OUTPUT_FILE] = "file",
+ [EXEC_OUTPUT_FILE_APPEND] = "append",
+ [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
- [EXEC_UTMP_INIT] = "init",
+ [EXEC_UTMP_INIT] = "init",
[EXEC_UTMP_LOGIN] = "login",
- [EXEC_UTMP_USER] = "user",
+ [EXEC_UTMP_USER] = "user",
};
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
- [EXEC_PRESERVE_NO] = "no",
- [EXEC_PRESERVE_YES] = "yes",
+ [EXEC_PRESERVE_NO] = "no",
+ [EXEC_PRESERVE_YES] = "yes",
[EXEC_PRESERVE_RESTART] = "restart",
};
@@ -2690,10 +2767,10 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EX
/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
- [EXEC_DIRECTORY_STATE] = "StateDirectory",
- [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
- [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
+ [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
+ [EXEC_DIRECTORY_STATE] = "StateDirectory",
+ [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
+ [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
[EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};
@@ -2724,10 +2801,10 @@ DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_mode, ExecDirectoryType);
* one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
* directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = "runtime",
- [EXEC_DIRECTORY_STATE] = "state",
- [EXEC_DIRECTORY_CACHE] = "cache",
- [EXEC_DIRECTORY_LOGS] = "logs",
+ [EXEC_DIRECTORY_RUNTIME] = "runtime",
+ [EXEC_DIRECTORY_STATE] = "state",
+ [EXEC_DIRECTORY_CACHE] = "cache",
+ [EXEC_DIRECTORY_LOGS] = "logs",
[EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};
@@ -2736,7 +2813,7 @@ DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
[EXEC_KEYRING_INHERIT] = "inherit",
[EXEC_KEYRING_PRIVATE] = "private",
- [EXEC_KEYRING_SHARED] = "shared",
+ [EXEC_KEYRING_SHARED] = "shared",
};
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
diff --git a/src/core/execute.h b/src/core/execute.h
index 5a6927a..107ae25 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -91,6 +91,7 @@ typedef enum ExecKeyringMode {
struct ExecStatus {
dual_timestamp start_timestamp;
dual_timestamp exit_timestamp;
+ dual_timestamp handoff_timestamp;
pid_t pid;
int code; /* as in siginfo_t::si_code */
int status; /* as in siginfo_t::si_status */
@@ -199,7 +200,6 @@ struct ExecContext {
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
- bool mount_apivfs_set:1;
/* This is not exposed to the user but available internally. We need it to make sure that whenever we
* spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects
@@ -312,6 +312,7 @@ struct ExecContext {
ProcSubset proc_subset; /* subset= */
int private_mounts;
+ int mount_apivfs;
int memory_ksm;
bool private_tmp;
bool private_network;
@@ -326,7 +327,6 @@ struct ExecContext {
ProtectSystem protect_system;
ProtectHome protect_home;
bool protect_hostname;
- bool mount_apivfs;
bool dynamic_user;
bool remove_ipc;
@@ -390,22 +390,23 @@ static inline bool exec_context_with_rootfs(const ExecContext *c) {
}
typedef enum ExecFlags {
- EXEC_APPLY_SANDBOXING = 1 << 0,
- EXEC_APPLY_CHROOT = 1 << 1,
- EXEC_APPLY_TTY_STDIN = 1 << 2,
- EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
- EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
- EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
- EXEC_CGROUP_DELEGATE = 1 << 6,
- EXEC_IS_CONTROL = 1 << 7,
- EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
- EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+ EXEC_APPLY_SANDBOXING = 1 << 0,
+ EXEC_APPLY_CHROOT = 1 << 1,
+ EXEC_APPLY_TTY_STDIN = 1 << 2,
+ EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
+ EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
+ EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
+ EXEC_CGROUP_DELEGATE = 1 << 6,
+ EXEC_IS_CONTROL = 1 << 7,
+ EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
+ EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+ EXEC_SETUP_CREDENTIALS_FRESH = 1 << 10, /* Set up a new credential store (disable reuse) */
/* The following are not used by execute.c, but by consumers internally */
- EXEC_PASS_FDS = 1 << 10,
- EXEC_SETENV_RESULT = 1 << 11,
- EXEC_SET_WATCHDOG = 1 << 12,
- EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
+ EXEC_PASS_FDS = 1 << 11,
+ EXEC_SETENV_RESULT = 1 << 12,
+ EXEC_SET_WATCHDOG = 1 << 13,
+ EXEC_SETENV_MONITOR_RESULT = 1 << 14, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
} ExecFlags;
/* Parameters for a specific invocation of a command. This structure is put together right before a command is
@@ -442,7 +443,7 @@ struct ExecParameters {
int stdout_fd;
int stderr_fd;
- /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+ /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. */
int exec_fd;
char *notify_socket;
@@ -453,7 +454,9 @@ struct ExecParameters {
char **files_env;
int user_lookup_fd;
- int bpf_outer_map_fd;
+ int handoff_timestamp_fd;
+
+ int bpf_restrict_fs_map_fd;
/* Used for logging in the executor functions */
char *unit_id;
@@ -461,34 +464,40 @@ struct ExecParameters {
char invocation_id_string[SD_ID128_STRING_MAX];
};
-#define EXEC_PARAMETERS_INIT(_flags) \
- (ExecParameters) { \
- .flags = (_flags), \
- .stdin_fd = -EBADF, \
- .stdout_fd = -EBADF, \
- .stderr_fd = -EBADF, \
- .exec_fd = -EBADF, \
- .bpf_outer_map_fd = -EBADF, \
- .user_lookup_fd = -EBADF, \
- };
+#define EXEC_PARAMETERS_INIT(_flags) \
+ (ExecParameters) { \
+ .flags = (_flags), \
+ .stdin_fd = -EBADF, \
+ .stdout_fd = -EBADF, \
+ .stderr_fd = -EBADF, \
+ .exec_fd = -EBADF, \
+ .bpf_restrict_fs_map_fd = -EBADF, \
+ .user_lookup_fd = -EBADF, \
+ .handoff_timestamp_fd = -EBADF, \
+ }
#include "unit.h"
#include "dynamic-user.h"
-int exec_spawn(Unit *unit,
- ExecCommand *command,
- const ExecContext *context,
- ExecParameters *exec_params,
- ExecRuntime *runtime,
- const CGroupContext *cgroup_context,
- pid_t *ret);
+int exec_spawn(
+ Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *exec_params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ PidRef *ret);
void exec_command_done(ExecCommand *c);
void exec_command_done_array(ExecCommand *c, size_t n);
+ExecCommand* exec_command_free(ExecCommand *c);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecCommand*, exec_command_free);
ExecCommand* exec_command_free_list(ExecCommand *c);
void exec_command_free_array(ExecCommand **c, size_t n);
void exec_command_reset_status_array(ExecCommand *c, size_t n);
void exec_command_reset_status_list_array(ExecCommand **c, size_t n);
+
+void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix);
void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix);
void exec_command_append_list(ExecCommand **l, ExecCommand *e);
int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_;
@@ -527,14 +536,16 @@ int exec_context_get_nice(const ExecContext *c);
int exec_context_get_cpu_sched_policy(const ExecContext *c);
int exec_context_get_cpu_sched_priority(const ExecContext *c);
uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c);
+bool exec_context_get_set_login_environment(const ExecContext *c);
char** exec_context_get_syscall_filter(const ExecContext *c);
char** exec_context_get_syscall_archs(const ExecContext *c);
char** exec_context_get_syscall_log(const ExecContext *c);
char** exec_context_get_address_families(const ExecContext *c);
char** exec_context_get_restrict_filesystems(const ExecContext *c);
-void exec_status_start(ExecStatus *s, pid_t pid);
+void exec_status_start(ExecStatus *s, pid_t pid, const dual_timestamp *ts);
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
+void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts);
void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
void exec_status_reset(ExecStatus *s);
@@ -613,23 +624,23 @@ bool exec_needs_ipc_namespace(const ExecContext *context);
#define LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep) \
((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=%s" : "INVOCATION_ID=%s")
-#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \
- ({ \
- const ExecContext *_c = (ec); \
- const ExecParameters *_p = (ep); \
- const int _l = (level); \
- bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \
- !(_c->log_level_max < 0 || \
- _c->log_level_max >= LOG_PRI(_l))); \
- LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
- _c->n_log_extra_fields); \
- !_do_log ? -ERRNO_VALUE(error) : \
- log_object_internal(_l, error, PROJECT_FILE, \
- __LINE__, __func__, \
- LOG_EXEC_ID_FIELD(_p), \
- _p->unit_id, \
- LOG_EXEC_INVOCATION_ID_FIELD(_p), \
- _p->invocation_id_string, ##__VA_ARGS__); \
+#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \
+ ({ \
+ const ExecContext *_c = (ec); \
+ const ExecParameters *_p = (ep); \
+ const int _l = (level); \
+ bool _do_log = _c->log_level_max < 0 || \
+ _c->log_level_max >= LOG_PRI(_l); \
+ LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
+ _c->n_log_extra_fields); \
+ !_do_log ? -ERRNO_VALUE(error) : \
+ log_object_internal(_l, error, \
+ PROJECT_FILE, __LINE__, __func__, \
+ LOG_EXEC_ID_FIELD(_p), \
+ _p->unit_id, \
+ LOG_EXEC_INVOCATION_ID_FIELD(_p), \
+ _p->invocation_id_string, \
+ ##__VA_ARGS__); \
})
#define log_exec_full_errno(ec, ep, level, error, ...) \
@@ -653,48 +664,34 @@ bool exec_needs_ipc_namespace(const ExecContext *context);
#define log_exec_warning_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_WARNING, error, __VA_ARGS__)
#define log_exec_error_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_ERR, error, __VA_ARGS__)
-#define log_exec_struct_errno(ec, ep, level, error, ...) \
- ({ \
- const ExecContext *_c = (ec); \
- const ExecParameters *_p = (ep); \
- const int _l = (level); \
- bool _do_log = !(_c->log_level_max < 0 || \
- _c->log_level_max >= LOG_PRI(_l)); \
- LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
- _c->n_log_extra_fields); \
- _do_log ? \
- log_struct_errno(_l, error, __VA_ARGS__, LOG_EXEC_ID_FIELD_FORMAT(_p), _p->unit_id) : \
- -ERRNO_VALUE(error); \
- })
-
-#define log_exec_struct(ec, ep, level, ...) log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__)
-
-#define log_exec_struct_iovec_errno(ec, ep, level, error, iovec, n_iovec) \
- ({ \
- const ExecContext *_c = (ec); \
- const ExecParameters *_p = (ep); \
- const int _l = (level); \
- bool _do_log = !(_c->log_level_max < 0 || \
- _c->log_level_max >= LOG_PRI(_l)); \
- LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
- _c->n_log_extra_fields); \
- _do_log ? \
- log_struct_iovec_errno(_l, error, iovec, n_iovec) : \
- -ERRNO_VALUE(error); \
- })
-
-#define log_exec_struct_iovec(ec, ep, level, iovec, n_iovec) log_exec_struct_iovec_errno(ec, ep, level, 0, iovec, n_iovec)
-
/* Like LOG_MESSAGE(), but with the unit name prefixed. */
#define LOG_EXEC_MESSAGE(ep, fmt, ...) LOG_MESSAGE("%s: " fmt, (ep)->unit_id, ##__VA_ARGS__)
#define LOG_EXEC_ID(ep) LOG_EXEC_ID_FIELD_FORMAT(ep), (ep)->unit_id
#define LOG_EXEC_INVOCATION_ID(ep) LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep), (ep)->invocation_id_string
-#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \
- const ExecContext *c = (ec); \
- const ExecParameters *p = (ep); \
+#define log_exec_struct_errno(ec, ep, level, error, ...) \
+ ({ \
+ const ExecContext *_c = (ec); \
+ const ExecParameters *_p = (ep); \
+ const int _l = (level); \
+ bool _do_log = _c->log_level_max < 0 || \
+ _c->log_level_max >= LOG_PRI(_l); \
+ LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
+ _c->n_log_extra_fields); \
+ !_do_log ? -ERRNO_VALUE(error) : \
+ log_struct_errno(_l, error, \
+ LOG_EXEC_ID(_p), \
+ LOG_EXEC_INVOCATION_ID(_p), \
+ __VA_ARGS__); \
+ })
+
+#define log_exec_struct(ec, ep, level, ...) log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__)
+
+#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \
+ const ExecContext *c = (ec); \
+ const ExecParameters *p = (ep); \
LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_ID_FIELD(p), p->unit_id); \
- LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \
+ LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \
LOG_CONTEXT_PUSH_IOV(c->log_extra_fields, c->n_log_extra_fields)
#define LOG_CONTEXT_PUSH_EXEC(ec, ep) \
diff --git a/src/core/executor.c b/src/core/executor.c
index b2716ef..bd0c742 100644
--- a/src/core/executor.c
+++ b/src/core/executor.c
@@ -245,12 +245,13 @@ static int run(int argc, char *argv[]) {
log_exec_struct_errno(&context, &params, LOG_ERR, r,
"MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
- LOG_EXEC_INVOCATION_ID(&params),
LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m",
status, command.path),
"EXECUTABLE=%s", command.path);
} else
- assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */
+ /* r == 0: 'skip' is chosen in the confirm spawn prompt
+ * r > 0: expected/ignored failure, do not log at error level */
+ assert((r == 0) == (exit_status == EXIT_SUCCESS));
return exit_status;
}
diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c
index 6069efd..5b2dc95 100644
--- a/src/core/fuzz-execute-serialize.c
+++ b/src/core/fuzz-execute-serialize.c
@@ -56,7 +56,7 @@ static void exec_fuzz_one(FILE *f, FDSet *fdset) {
params.stderr_fd = -EBADF;
params.exec_fd = -EBADF;
params.user_lookup_fd = -EBADF;
- params.bpf_outer_map_fd = -EBADF;
+ params.bpf_restrict_fs_map_fd = -EBADF;
if (!params.fds)
params.n_socket_fds = params.n_storage_fds = 0;
for (size_t i = 0; params.fds && i < params.n_socket_fds + params.n_storage_fds; i++)
diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c
index 00d6ad6..b16211e 100644
--- a/src/core/generator-setup.c
+++ b/src/core/generator-setup.c
@@ -8,7 +8,7 @@
#include "rm-rf.h"
int lookup_paths_mkdir_generator(LookupPaths *p) {
- int r, q;
+ int r;
assert(p);
@@ -16,14 +16,8 @@ int lookup_paths_mkdir_generator(LookupPaths *p) {
return -EINVAL;
r = mkdir_p_label(p->generator, 0755);
-
- q = mkdir_p_label(p->generator_early, 0755);
- if (q < 0 && r >= 0)
- r = q;
-
- q = mkdir_p_label(p->generator_late, 0755);
- if (q < 0 && r >= 0)
- r = q;
+ RET_GATHER(r, mkdir_p_label(p->generator_early, 0755));
+ RET_GATHER(r, mkdir_p_label(p->generator_late, 0755));
return r;
}
diff --git a/src/core/import-creds.c b/src/core/import-creds.c
index 48f3160..f27ffed 100644
--- a/src/core/import-creds.c
+++ b/src/core/import-creds.c
@@ -80,7 +80,7 @@ static int acquire_credential_directory(ImportCredentialContext *c, const char *
if (c->target_dir_fd >= 0)
return c->target_dir_fd;
- r = path_is_mount_point(path, NULL, 0);
+ r = path_is_mount_point(path);
if (r < 0) {
if (r != -ENOENT)
return log_error_errno(r, "Failed to determine if %s is a mount point: %m", path);
@@ -314,7 +314,7 @@ static int proc_cmdline_callback(const char *key, const char *value, void *data)
colon++;
if (base64) {
- r = unbase64mem(colon, SIZE_MAX, &binary, &l);
+ r = unbase64mem(colon, &binary, &l);
if (r < 0) {
log_warning_errno(r, "Failed to decode binary credential '%s' data, ignoring: %m", n);
return 0;
@@ -519,13 +519,13 @@ static int parse_smbios_strings(ImportCredentialContext *c, const char *data, si
return log_oom();
if (!credential_name_valid(cn)) {
- log_warning("SMBIOS credential name '%s' is not valid, ignoring: %m", cn);
+ log_warning("SMBIOS credential name '%s' is not valid, ignoring.", cn);
continue;
}
/* Optionally base64 decode the data, if requested, to allow binary credentials */
if (unbase64) {
- r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen);
+ r = unbase64mem_full(eq + 1, nul - (eq + 1), /* secure = */ false, &buf, &buflen);
if (r < 0) {
log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", cn);
continue;
@@ -753,7 +753,7 @@ static int merge_credentials_trusted(const char *creds_dir) {
return 0;
/* Do not try to merge initrd credentials into foreign credentials directories */
- if (!path_equal_ptr(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) {
+ if (!path_equal(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) {
log_debug("Not importing initrd credentials, as foreign $CREDENTIALS_DIRECTORY has been set.");
return 0;
}
@@ -815,7 +815,6 @@ static int setenv_notify_socket(void) {
static int report_credentials_per_func(const char *title, int (*get_directory_func)(const char **ret)) {
_cleanup_free_ DirectoryEntries *de = NULL;
- _cleanup_close_ int dir_fd = -EBADF;
_cleanup_free_ char *ll = NULL;
const char *d = NULL;
int r, c = 0;
@@ -831,11 +830,7 @@ static int report_credentials_per_func(const char *title, int (*get_directory_fu
return log_warning_errno(r, "Failed to determine %s directory: %m", title);
}
- dir_fd = open(d, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
- if (dir_fd < 0)
- return log_warning_errno(errno, "Failed to open credentials directory %s: %m", d);
-
- r = readdir_all(dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ r = readdir_all_at(AT_FDCWD, d, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
if (r < 0)
return log_warning_errno(r, "Failed to enumerate credentials directory %s: %m", d);
diff --git a/src/core/job.c b/src/core/job.c
index e78c2a7..2f19468 100644
--- a/src/core/job.c
+++ b/src/core/job.c
@@ -133,6 +133,7 @@ Job* job_free(Job *j) {
static void job_set_state(Job *j, JobState state) {
assert(j);
+ assert(j->manager);
assert(state >= 0);
assert(state < _JOB_STATE_MAX);
@@ -145,15 +146,15 @@ static void job_set_state(Job *j, JobState state) {
return;
if (j->state == JOB_RUNNING)
- j->unit->manager->n_running_jobs++;
+ j->manager->n_running_jobs++;
else {
assert(j->state == JOB_WAITING);
- assert(j->unit->manager->n_running_jobs > 0);
+ assert(j->manager->n_running_jobs > 0);
- j->unit->manager->n_running_jobs--;
+ j->manager->n_running_jobs--;
- if (j->unit->manager->n_running_jobs <= 0)
- j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source);
+ if (j->manager->n_running_jobs <= 0)
+ j->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->manager->jobs_in_progress_event_source);
}
}
@@ -281,6 +282,8 @@ int job_install_deserialized(Job *j) {
Job **pj;
int r;
+ assert(j);
+ assert(j->manager);
assert(!j->installed);
if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION)
@@ -307,7 +310,7 @@ int job_install_deserialized(Job *j) {
j->installed = true;
if (j->state == JOB_RUNNING)
- j->unit->manager->n_running_jobs++;
+ j->manager->n_running_jobs++;
log_unit_debug(j->unit,
"Reinstalled deserialized job %s/%s as %u",
@@ -633,16 +636,19 @@ static const char* job_done_message_format(Unit *u, JobType t, JobResult result)
[JOB_UNSUPPORTED] = "Starting of %s unsupported.",
[JOB_COLLECTED] = "Unnecessary job was removed for %s.",
[JOB_ONCE] = "Unit %s has been started before and cannot be started again.",
+ [JOB_FROZEN] = "Cannot start frozen unit %s.",
};
static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = {
[JOB_DONE] = "Stopped %s.",
[JOB_FAILED] = "Stopped %s with error.",
[JOB_TIMEOUT] = "Timed out stopping %s.",
+ [JOB_FROZEN] = "Cannot stop frozen unit %s.",
};
static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = {
[JOB_DONE] = "Reloaded %s.",
[JOB_FAILED] = "Reload failed for %s.",
[JOB_TIMEOUT] = "Timed out reloading %s.",
+ [JOB_FROZEN] = "Cannot reload frozen unit %s.",
};
/* When verify-active detects the unit is inactive, report it.
* Most likely a DEPEND warning from a requisiting unit will
@@ -704,6 +710,7 @@ static const struct {
[JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" },
[JOB_COLLECTED] = { LOG_INFO, },
[JOB_ONCE] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " ONCE " },
+ [JOB_FROZEN] = { LOG_ERR, ANSI_HIGHLIGHT_RED, "FROZEN" },
};
static const char* job_done_mid(JobType type, JobResult result) {
@@ -954,6 +961,8 @@ int job_run_and_invalidate(Job *j) {
r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
else if (r == -ESTALE)
r = job_finish_and_invalidate(j, JOB_ONCE, true, false);
+ else if (r == -EDEADLK)
+ r = job_finish_and_invalidate(j, JOB_FROZEN, true, false);
else if (r < 0)
r = job_finish_and_invalidate(j, JOB_FAILED, true, false);
}
@@ -1011,7 +1020,7 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
goto finish;
}
- if (IN_SET(result, JOB_FAILED, JOB_INVALID))
+ if (IN_SET(result, JOB_FAILED, JOB_INVALID, JOB_FROZEN))
j->manager->n_failed_jobs++;
job_uninstall(j);
@@ -1369,6 +1378,7 @@ int job_coldplug(Job *j) {
void job_shutdown_magic(Job *j) {
assert(j);
+ assert(j->manager);
/* The shutdown target gets some special treatment here: we
* tell the kernel to begin with flushing its disk caches, to
@@ -1381,16 +1391,19 @@ void job_shutdown_magic(Job *j) {
if (j->type != JOB_START)
return;
- if (!MANAGER_IS_SYSTEM(j->unit->manager))
+ if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET))
return;
- if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET))
+ /* This is the very beginning of the shutdown phase, so take the timestamp here */
+ dual_timestamp_now(j->manager->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START);
+
+ if (!MANAGER_IS_SYSTEM(j->manager))
return;
/* In case messages on console has been disabled on boot */
- j->unit->manager->no_console_output = false;
+ j->manager->no_console_output = false;
- manager_invalidate_startup_units(j->unit->manager);
+ manager_invalidate_startup_units(j->manager);
if (detect_container() > 0)
return;
@@ -1430,6 +1443,7 @@ bool job_may_gc(Job *j) {
Unit *other;
assert(j);
+ assert(j->manager);
/* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their
* own and just track external state. For now the only unit type that qualifies for this are .device units.
@@ -1450,7 +1464,7 @@ bool job_may_gc(Job *j) {
* referenced by one, and reset this whenever we notice that no private bus connections are around. This means
* the GC is a bit too conservative when it comes to jobs created by private bus connections. */
if (j->ref_by_private_bus) {
- if (set_isempty(j->unit->manager->private_buses))
+ if (set_isempty(j->manager->private_buses))
j->ref_by_private_bus = false;
else
return false;
@@ -1473,6 +1487,7 @@ bool job_may_gc(Job *j) {
void job_add_to_gc_queue(Job *j) {
assert(j);
+ assert(j->manager);
if (j->in_gc_queue)
return;
@@ -1480,7 +1495,7 @@ void job_add_to_gc_queue(Job *j) {
if (!job_may_gc(j))
return;
- LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j);
+ LIST_PREPEND(gc_queue, j->manager->gc_job_queue, j);
j->in_gc_queue = true;
}
@@ -1645,6 +1660,7 @@ static const char* const job_result_table[_JOB_RESULT_MAX] = {
[JOB_UNSUPPORTED] = "unsupported",
[JOB_COLLECTED] = "collected",
[JOB_ONCE] = "once",
+ [JOB_FROZEN] = "frozen",
};
DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult);
diff --git a/src/core/job.h b/src/core/job.h
index 891d87a..8318b52 100644
--- a/src/core/job.h
+++ b/src/core/job.h
@@ -96,6 +96,7 @@ enum JobResult {
JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */
JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */
JOB_ONCE, /* Unit was started before, and hence can't be started again */
+ JOB_FROZEN, /* Unit is currently frozen, so we can't safely operate on it */
_JOB_RESULT_MAX,
_JOB_RESULT_INVALID = -EINVAL,
};
diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c
index b8e3f7a..c39b136 100644
--- a/src/core/kmod-setup.c
+++ b/src/core/kmod-setup.c
@@ -9,28 +9,13 @@
#include "fileio.h"
#include "kmod-setup.h"
#include "macro.h"
+#include "module-util.h"
#include "recurse-dir.h"
#include "string-util.h"
#include "strv.h"
#include "virt.h"
#if HAVE_KMOD
-#include "module-util.h"
-
-static void systemd_kmod_log(
- void *data,
- int priority,
- const char *file, int line,
- const char *fn,
- const char *format,
- va_list args) {
-
- /* library logging is enabled at debug only */
- DISABLE_WARNING_FORMAT_NONLITERAL;
- log_internalv(LOG_DEBUG, 0, file, line, fn, format, args);
- REENABLE_WARNING;
-}
-
static int match_modalias_recurse_dir_cb(
RecurseDirEvent event,
const char *path,
@@ -113,12 +98,11 @@ static bool in_qemu(void) {
int kmod_setup(void) {
#if HAVE_KMOD
-
static const struct {
const char *module;
const char *path;
- bool warn_if_unavailable:1;
- bool warn_if_module:1;
+ bool warn_if_unavailable;
+ bool warn_if_module;
bool (*condition_fn)(void);
} kmod_table[] = {
/* This one we need to load explicitly, since auto-loading on use doesn't work
@@ -166,34 +150,32 @@ int kmod_setup(void) {
{ "tpm", "/sys/class/tpmrm", false, false, efi_has_tpm2 },
#endif
};
- _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL;
- unsigned i;
+
+ int r;
if (have_effective_cap(CAP_SYS_MODULE) <= 0)
return 0;
- for (i = 0; i < ELEMENTSOF(kmod_table); i++) {
- if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0)
+ _cleanup_(sym_kmod_unrefp) struct kmod_ctx *ctx = NULL;
+ FOREACH_ELEMENT(kmod, kmod_table) {
+ if (kmod->path && access(kmod->path, F_OK) >= 0)
continue;
- if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn())
+ if (kmod->condition_fn && !kmod->condition_fn())
continue;
- if (kmod_table[i].warn_if_module)
+ if (kmod->warn_if_module)
log_debug("Your kernel apparently lacks built-in %s support. Might be "
"a good idea to compile it in. We'll now try to work around "
- "this by loading the module...", kmod_table[i].module);
+ "this by loading the module...", kmod->module);
if (!ctx) {
- ctx = kmod_new(NULL, NULL);
- if (!ctx)
- return log_oom();
-
- kmod_set_log_fn(ctx, systemd_kmod_log, NULL);
- kmod_load_resources(ctx);
+ r = module_setup_context(&ctx);
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize kmod context: %m");
}
- (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable);
+ (void) module_load_and_warn(ctx, kmod->module, kmod->warn_if_unavailable);
}
#endif
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index 45f9ab0..df219d8 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -136,7 +136,7 @@
{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag)
-{{type}}.MountAPIVFS, config_parse_exec_mount_apivfs, 0, offsetof({{type}}, exec_context)
+{{type}}.MountAPIVFS, config_parse_tristate, 0, offsetof({{type}}, exec_context.mount_apivfs)
{{type}}.Personality, config_parse_personality, 0, offsetof({{type}}, exec_context.personality)
{{type}}.RuntimeDirectoryPreserve, config_parse_exec_preserve_mode, 0, offsetof({{type}}, exec_context.runtime_directory_preserve_mode)
{{type}}.RuntimeDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode)
@@ -220,6 +220,7 @@
{{type}}.StartupMemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
{{type}}.MemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
{{type}}.StartupMemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryZSwapWriteback, config_parse_bool, 0, offsetof({{type}}, cgroup_context.memory_zswap_writeback)
{{type}}.MemoryLimit, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
{{type}}.DeviceAllow, config_parse_device_allow, 0, offsetof({{type}}, cgroup_context)
{{type}}.DevicePolicy, config_parse_device_policy, 0, offsetof({{type}}, cgroup_context.device_policy)
@@ -309,7 +310,8 @@ Unit.PartOf, config_parse_unit_deps,
Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0
Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0
Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0
-Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0
+Unit.RequiresMountsFor, config_parse_unit_mounts_for, 0, 0
+Unit.WantsMountsFor, config_parse_unit_mounts_for, 0, 0
Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded)
Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start)
Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
@@ -325,7 +327,7 @@ Unit.IgnoreOnSnapshot, config_parse_warn_compat,
Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0
Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0
Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action)
-Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg)
+Unit.JobTimeoutRebootArgument, config_parse_reboot_parameter, 0, offsetof(Unit, job_timeout_reboot_arg)
Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
{# The following is a legacy alias name for compatibility #}
Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
@@ -335,7 +337,7 @@ Unit.FailureAction, config_parse_emergency_action,
Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action)
Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status)
Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status)
-Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg)
+Unit.RebootArgument, config_parse_reboot_parameter, 0, offsetof(Unit, reboot_arg)
Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions)
Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions)
Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions)
@@ -498,6 +500,7 @@ Socket.FreeBind, config_parse_bool,
Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent)
Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast)
Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred)
+Socket.PassFileDescriptorsToExec, config_parse_bool, 0, offsetof(Socket, pass_fds_to_exec)
Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec)
Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo)
Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping)
@@ -530,7 +533,7 @@ Socket.SELinuxContextFromNet, config_parse_warn_compat,
{{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }}
{{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }}
{{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }}
-Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what)
+Mount.What, config_parse_mount_node, 0, offsetof(Mount, parameters_fragment.what)
Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where)
Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options)
Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype)
@@ -547,7 +550,7 @@ Automount.Where, config_parse_unit_path_printf,
Automount.ExtraOptions, config_parse_unit_string_printf, 0, offsetof(Automount, extra_options)
Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode)
Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec)
-Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what)
+Swap.What, config_parse_mount_node, 0, offsetof(Swap, parameters_fragment.what)
Swap.Priority, config_parse_swap_priority, 0, 0
Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options)
Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 0baf08e..5ae6888 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -16,8 +16,8 @@
#include "all-units.h"
#include "alloc-util.h"
#include "bpf-firewall.h"
-#include "bpf-lsm.h"
#include "bpf-program.h"
+#include "bpf-restrict-fs.h"
#include "bpf-socket-bind.h"
#include "bus-error.h"
#include "bus-internal.h"
@@ -38,6 +38,7 @@
#include "fileio.h"
#include "firewall-util.h"
#include "fs-util.h"
+#include "fstab-util.h"
#include "hexdecoct.h"
#include "iovec-util.h"
#include "ioprio-util.h"
@@ -56,6 +57,7 @@
#include "pcre2-util.h"
#include "percent-util.h"
#include "process-util.h"
+#include "reboot-util.h"
#include "seccomp-util.h"
#include "securebits-util.h"
#include "selinux-util.h"
@@ -248,7 +250,7 @@ int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, cons
/* Fragment paths should also be equal as a custom fragment for a specific template instance
* wouldn't necessarily lead to infinite recursion. */
- if (!path_equal_ptr(u->fragment_path, fragment_path))
+ if (!path_equal(u->fragment_path, fragment_path))
return false;
if (!contains_instance_specifier_superset(format))
@@ -361,6 +363,40 @@ int config_parse_unit_string_printf(
return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
}
+int config_parse_reboot_parameter(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!reboot_parameter_is_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid reboot parameter '%s', ignoring.", k);
+ return 0;
+ }
+
+ return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
int config_parse_unit_strv_printf(
const char *unit,
const char *filename,
@@ -433,8 +469,9 @@ int config_parse_colon_separated_paths(
const char *rvalue,
void *data,
void *userdata) {
+
char ***sv = ASSERT_PTR(data);
- const Unit *u = userdata;
+ const Unit *u = ASSERT_PTR(userdata);
int r;
assert(filename);
@@ -574,17 +611,13 @@ int config_parse_socket_listen(
void *data,
void *userdata) {
+ Socket *s = ASSERT_PTR(SOCKET(data));
_cleanup_free_ SocketPort *p = NULL;
- SocketPort *tail;
- Socket *s;
int r;
assert(filename);
assert(lvalue);
assert(rvalue);
- assert(data);
-
- s = SOCKET(data);
if (isempty(rvalue)) {
/* An empty assignment removes all ports */
@@ -592,10 +625,15 @@ int config_parse_socket_listen(
return 0;
}
- p = new0(SocketPort, 1);
+ p = new(SocketPort, 1);
if (!p)
return log_oom();
+ *p = (SocketPort) {
+ .socket = s,
+ .fd = -EBADF,
+ };
+
if (ltype != SOCKET_SOCKET) {
_cleanup_free_ char *k = NULL;
@@ -605,7 +643,11 @@ int config_parse_socket_listen(
return 0;
}
- r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ PathSimplifyWarnFlags flags = PATH_CHECK_ABSOLUTE;
+ if (ltype != SOCKET_SPECIAL)
+ flags |= PATH_CHECK_NON_API_VFS;
+
+ r = path_simplify_and_warn(k, flags, unit, filename, line, lvalue);
if (r < 0)
return 0;
@@ -619,7 +661,7 @@ int config_parse_socket_listen(
p->type = ltype;
} else if (streq(lvalue, "ListenNetlink")) {
- _cleanup_free_ char *k = NULL;
+ _cleanup_free_ char *k = NULL;
r = unit_path_printf(UNIT(s), rvalue, &k);
if (r < 0) {
@@ -644,7 +686,7 @@ int config_parse_socket_listen(
return 0;
}
- if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */
+ if (path_is_absolute(k)) { /* Only for AF_UNIX file system sockets… */
r = patch_var_run(unit, filename, line, lvalue, &k);
if (r < 0)
return r;
@@ -674,16 +716,7 @@ int config_parse_socket_listen(
p->type = SOCKET_SOCKET;
}
- p->fd = -EBADF;
- p->auxiliary_fds = NULL;
- p->n_auxiliary_fds = 0;
- p->socket = s;
-
- tail = LIST_FIND_TAIL(port, s->ports);
- LIST_INSERT_AFTER(port, s->ports, tail, p);
-
- p = NULL;
-
+ LIST_APPEND(port, s->ports, TAKE_PTR(p));
return 0;
}
@@ -858,9 +891,7 @@ int config_parse_exec(
void *userdata) {
ExecCommand **e = ASSERT_PTR(data);
- const Unit *u = userdata;
- const char *p;
- bool semicolon;
+ const Unit *u = ASSERT_PTR(userdata);
int r;
assert(filename);
@@ -875,15 +906,11 @@ int config_parse_exec(
return 0;
}
- p = rvalue;
+ const char *p = rvalue;
+ bool semicolon;
+
do {
_cleanup_free_ char *path = NULL, *firstword = NULL;
- ExecCommandFlags flags = 0;
- bool ignore = false, separate_argv0 = false;
- _cleanup_free_ ExecCommand *nce = NULL;
- _cleanup_strv_free_ char **n = NULL;
- size_t nlen = 0;
- const char *f;
semicolon = false;
@@ -897,25 +924,30 @@ int config_parse_exec(
continue;
}
- f = firstword;
- for (;;) {
- /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't
- * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of
- * argv[0]; if it's prefixed with :, we will not do environment variable substitution;
- * if it's prefixed with +, it will be run with full privileges and no sandboxing; if
- * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if
- * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient
- * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most
- * other sandboxing, with some special exceptions for changing UID.
+ const char *f = firstword;
+ bool ignore, separate_argv0 = false;
+ ExecCommandFlags flags = 0;
+
+ for (;; f++) {
+ /* We accept an absolute path as first argument. Valid prefixes and their effect:
+ *
+ * "-": Ignore if the path doesn't exist
+ * "@": Allow overriding argv[0] (supplied as a separate argument)
+ * ":": Disable environment variable substitution
+ * "+": Run with full privileges and no sandboxing
+ * "!": Apply sandboxing except for user/group credentials
+ * "!!": Apply user/group credentials if the kernel supports ambient capabilities -
+ * if it doesn't we don't apply the credentials themselves, but do apply
+ * most other sandboxing, with some special exceptions for changing UID.
*
- * The idea is that '!!' may be used to write services that can take benefit of systemd's
- * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to
- * privilege dropping within the daemon if the kernel does not offer that. */
+ * The idea is that '!!' may be used to write services that can take benefit of
+ * systemd's UID/GID dropping if the kernel supports ambient creds, but provide
+ * an automatic fallback to privilege dropping within the daemon if the kernel
+ * does not offer that. */
- if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+ if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE))
flags |= EXEC_COMMAND_IGNORE_FAILURE;
- ignore = true;
- } else if (*f == '@' && !separate_argv0)
+ else if (*f == '@' && !separate_argv0)
separate_argv0 = true;
else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND))
flags |= EXEC_COMMAND_NO_ENV_EXPAND;
@@ -928,9 +960,10 @@ int config_parse_exec(
flags |= EXEC_COMMAND_AMBIENT_MAGIC;
} else
break;
- f++;
}
+ ignore = FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE);
+
r = unit_path_printf(u, f, &path);
if (r < 0) {
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
@@ -940,19 +973,18 @@ int config_parse_exec(
}
if (isempty(path)) {
- /* First word is either "-" or "@" with no command. */
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
- "Empty path in command line%s: '%s'",
+ "Empty path in command line%s: %s",
ignore ? ", ignoring" : "", rvalue);
return ignore ? 0 : -ENOEXEC;
}
if (!string_is_safe(path)) {
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
- "Executable name contains special characters%s: %s",
+ "Executable path contains special characters%s: %s",
ignore ? ", ignoring" : "", path);
return ignore ? 0 : -ENOEXEC;
}
- if (endswith(path, "/")) {
+ if (path_implies_directory(path)) {
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
"Executable path specifies a directory%s: %s",
ignore ? ", ignoring" : "", path);
@@ -966,92 +998,71 @@ int config_parse_exec(
return ignore ? 0 : -ENOEXEC;
}
- if (!separate_argv0) {
- char *w = NULL;
-
- if (!GREEDY_REALLOC0(n, nlen + 2))
- return log_oom();
+ _cleanup_strv_free_ char **args = NULL;
- w = strdup(path);
- if (!w)
+ if (!separate_argv0)
+ if (strv_extend(&args, path) < 0)
return log_oom();
- n[nlen++] = w;
- n[nlen] = NULL;
- }
-
- path_simplify(path);
while (!isempty(p)) {
_cleanup_free_ char *word = NULL, *resolved = NULL;
- /* Check explicitly for an unquoted semicolon as
- * command separator token. */
+ /* Check explicitly for an unquoted semicolon as command separator token. */
if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) {
p++;
- p += strspn(p, WHITESPACE);
+ p = skip_leading_chars(p, /* bad = */ NULL);
semicolon = true;
break;
}
/* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc.
- * extract_first_word() would return the same for all of those. */
+ * extract_first_word() would return the same for all of those. */
if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) {
- char *w;
-
p += 2;
- p += strspn(p, WHITESPACE);
+ p = skip_leading_chars(p, /* bad = */ NULL);
- if (!GREEDY_REALLOC0(n, nlen + 2))
+ if (strv_extend(&args, ";") < 0)
return log_oom();
- w = strdup(";");
- if (!w)
- return log_oom();
- n[nlen++] = w;
- n[nlen] = NULL;
continue;
}
r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
- if (r == 0)
- break;
if (r < 0)
return ignore ? 0 : -ENOEXEC;
+ if (r == 0)
+ break;
r = unit_full_printf(u, word, &resolved);
if (r < 0) {
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
- "Failed to resolve unit specifiers in %s%s: %m",
+ "Failed to resolve unit specifiers in '%s'%s: %m",
word, ignore ? ", ignoring" : "");
return ignore ? 0 : -ENOEXEC;
}
- if (!GREEDY_REALLOC(n, nlen + 2))
+ if (strv_consume(&args, TAKE_PTR(resolved)) < 0)
return log_oom();
-
- n[nlen++] = TAKE_PTR(resolved);
- n[nlen] = NULL;
}
- if (!n || !n[0]) {
+ if (strv_isempty(args)) {
log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
"Empty executable name or zeroeth argument%s: %s",
ignore ? ", ignoring" : "", rvalue);
return ignore ? 0 : -ENOEXEC;
}
- nce = new0(ExecCommand, 1);
- if (!nce)
+ ExecCommand *nec = new(ExecCommand, 1);
+ if (!nec)
return log_oom();
- nce->argv = TAKE_PTR(n);
- nce->path = TAKE_PTR(path);
- nce->flags = flags;
-
- exec_command_append_list(e, nce);
+ *nec = (ExecCommand) {
+ .path = path_simplify(TAKE_PTR(path)),
+ .argv = TAKE_PTR(args),
+ .flags = flags,
+ };
- /* Do not _cleanup_free_ these. */
- nce = NULL;
+ exec_command_append_list(e, nec);
rvalue = p;
} while (semicolon);
@@ -1254,7 +1265,7 @@ int config_parse_exec_input_data(
return 0;
}
- r = unbase64mem(rvalue, SIZE_MAX, &p, &sz);
+ r = unbase64mem(rvalue, &p, &sz);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r,
"Failed to decode base64 data, ignoring: %s", rvalue);
@@ -1520,43 +1531,6 @@ int config_parse_exec_cpu_sched_policy(const char *unit,
return 0;
}
-int config_parse_exec_mount_apivfs(const char *unit,
- const char *filename,
- unsigned line,
- const char *section,
- unsigned section_line,
- const char *lvalue,
- int ltype,
- const char *rvalue,
- void *data,
- void *userdata) {
-
- ExecContext *c = ASSERT_PTR(data);
- int k;
-
- assert(filename);
- assert(lvalue);
- assert(rvalue);
-
- if (isempty(rvalue)) {
- c->mount_apivfs_set = false;
- c->mount_apivfs = false;
- return 0;
- }
-
- k = parse_boolean(rvalue);
- if (k < 0) {
- log_syntax(unit, LOG_WARNING, filename, line, k,
- "Failed to parse boolean value, ignoring: %s",
- rvalue);
- return 0;
- }
-
- c->mount_apivfs_set = true;
- c->mount_apivfs = k;
- return 0;
-}
-
int config_parse_numa_mask(const char *unit,
const char *filename,
unsigned line,
@@ -1748,7 +1722,7 @@ int config_parse_exec_root_hash(
}
/* We have a roothash to decode, eg: RootHash=012345789abcdef */
- r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size);
+ r = unhexmem(rvalue, &roothash_decoded, &roothash_decoded_size);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue);
return 0;
@@ -1816,7 +1790,7 @@ int config_parse_exec_root_hash_sig(
}
/* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */
- r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size);
+ r = unbase64mem(value, &roothash_sig_decoded, &roothash_sig_decoded_size);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue);
return 0;
@@ -2634,6 +2608,7 @@ int config_parse_working_directory(
assert(rvalue);
if (isempty(rvalue)) {
+ c->working_directory_missing_ok = false;
c->working_directory_home = false;
c->working_directory = mfree(c->working_directory);
return 0;
@@ -2659,7 +2634,7 @@ int config_parse_working_directory(
return missing_ok ? 0 : -ENOEXEC;
}
- r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue);
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS|(missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue);
if (r < 0)
return missing_ok ? 0 : -ENOEXEC;
@@ -2697,7 +2672,7 @@ int config_parse_unit_env_file(const char *unit,
return 0;
}
- r = unit_full_printf_full(u, rvalue, PATH_MAX, &n);
+ r = unit_path_printf(u, rvalue, &n);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
return 0;
@@ -3152,7 +3127,7 @@ int config_parse_unit_condition_string(
return 0;
}
-int config_parse_unit_requires_mounts_for(
+int config_parse_unit_mounts_for(
const char *unit,
const char *filename,
unsigned line,
@@ -3171,6 +3146,7 @@ int config_parse_unit_requires_mounts_for(
assert(lvalue);
assert(rvalue);
assert(data);
+ assert(STR_IN_SET(lvalue, "RequiresMountsFor", "WantsMountsFor"));
for (const char *p = rvalue;;) {
_cleanup_free_ char *word = NULL, *resolved = NULL;
@@ -3196,9 +3172,9 @@ int config_parse_unit_requires_mounts_for(
if (r < 0)
continue;
- r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE, unit_mount_dependency_type_from_string(lvalue));
if (r < 0) {
- log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved);
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add requested mount '%s', ignoring: %m", resolved);
continue;
}
}
@@ -3695,7 +3671,7 @@ int config_parse_restrict_filesystems(
break;
}
- r = lsm_bpf_parse_filesystem(
+ r = bpf_restrict_fs_parse_filesystem(
word,
&c->restrict_filesystems,
FILESYSTEM_PARSE_LOG|
@@ -4693,7 +4669,7 @@ int config_parse_exec_directories(
_cleanup_free_ char *src = NULL, *dest = NULL;
const char *q = tuple;
- r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL);
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest);
if (r == -ENOMEM)
return log_oom();
if (r <= 0) {
@@ -4908,11 +4884,8 @@ int config_parse_load_credential(
void *data,
void *userdata) {
- _cleanup_free_ char *word = NULL, *k = NULL, *q = NULL;
ExecContext *context = ASSERT_PTR(data);
- bool encrypted = ltype;
- Unit *u = userdata;
- const char *p;
+ const Unit *u = ASSERT_PTR(userdata);
int r;
assert(filename);
@@ -4925,7 +4898,10 @@ int config_parse_load_credential(
return 0;
}
- p = rvalue;
+ _cleanup_free_ char *word = NULL, *id = NULL, *path = NULL;
+ const char *p = rvalue;
+ bool encrypted = ltype;
+
r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
if (r == -ENOMEM)
return log_oom();
@@ -4934,35 +4910,35 @@ int config_parse_load_credential(
return 0;
}
- r = unit_cred_printf(u, word, &k);
+ r = unit_cred_printf(u, word, &id);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
return 0;
}
- if (!credential_name_valid(k)) {
- log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+ if (!credential_name_valid(id)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", id);
return 0;
}
if (isempty(p)) {
/* If only one field is specified take it as shortcut for inheriting a credential named
* the same way from our parent */
- q = strdup(k);
- if (!q)
+ path = strdup(id);
+ if (!path)
return log_oom();
} else {
- r = unit_path_printf(u, p, &q);
+ r = unit_path_printf(u, p, &path);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p);
return 0;
}
- if (path_is_absolute(q) ? !path_is_normalized(q) : !credential_name_valid(q)) {
- log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", q);
+ if (path_is_absolute(path) ? !path_is_normalized(path) : !credential_name_valid(path)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", path);
return 0;
}
}
- r = hashmap_put_credential(&context->load_credentials, k, q, encrypted);
+ r = hashmap_put_credential(&context->load_credentials, id, path, encrypted);
if (r < 0)
return log_error_errno(r, "Failed to store load credential '%s': %m", rvalue);
@@ -5236,7 +5212,7 @@ int config_parse_bind_paths(
void *userdata) {
ExecContext *c = ASSERT_PTR(data);
- const Unit *u = userdata;
+ const Unit *u = ASSERT_PTR(userdata);
int r;
assert(filename);
@@ -5267,7 +5243,7 @@ int config_parse_bind_paths(
if (r == 0)
break;
- r = unit_full_printf_full(u, source, PATH_MAX, &sresolved);
+ r = unit_path_printf(u, source, &sresolved);
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r,
"Failed to resolve unit specifiers in \"%s\", ignoring: %m", source);
@@ -5396,7 +5372,7 @@ int config_parse_mount_images(
return 0;
q = tuple;
- r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL);
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second);
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
@@ -5420,7 +5396,7 @@ int config_parse_mount_images(
continue;
}
- r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue);
if (r < 0)
continue;
@@ -5436,7 +5412,7 @@ int config_parse_mount_images(
continue;
}
- r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue);
if (r < 0)
continue;
@@ -5445,7 +5421,7 @@ int config_parse_mount_images(
MountOptions *o = NULL;
PartitionDesignator partition_designator;
- r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options);
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
@@ -5578,7 +5554,7 @@ int config_parse_extension_images(
continue;
}
- r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue);
if (r < 0)
continue;
@@ -5587,7 +5563,7 @@ int config_parse_extension_images(
MountOptions *o = NULL;
PartitionDesignator partition_designator;
- r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options);
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
@@ -5799,7 +5775,7 @@ int config_parse_pid_file(
return log_oom();
/* Check that the result is a sensible path */
- r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue);
if (r < 0)
return r;
@@ -6095,7 +6071,7 @@ int config_parse_restrict_network_interfaces(
break;
}
- if (!ifname_valid(word)) {
+ if (!ifname_valid_full(word, IFNAME_VALID_ALTERNATIVE)) {
log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", word);
continue;
}
@@ -6112,6 +6088,47 @@ int config_parse_restrict_network_interfaces(
return 0;
}
+/* Config parser for settings naming a mount/swap source node. Expands unit specifiers in rvalue, hands
+ * the result to fstab_node_to_udev_node(), and stores the outcome through config_parse_string().
+ * Specifier failures and overlong results are logged and ignored (return 0); allocation failure is
+ * reported through log_oom(). */
+int config_parse_mount_node(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *expanded = NULL, *node = NULL;
+        const Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf(u, rvalue, &expanded);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        node = fstab_node_to_udev_node(expanded);
+        if (!node)
+                return log_oom();
+
+        /* The source is handed to mount/swapon verbatim and need not be something we understand, hence
+         * no path_is_valid() check here. Only a basic sanity limit is enforced: anything that reaches
+         * PATH_MAX is most likely a mistake. */
+        if (strlen(node) < PATH_MAX)
+                return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, node, data, userdata);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0, "Resolved mount path '%s' too long, ignoring.", node);
+        return 0;
+}
+
static int merge_by_names(Unit *u, Set *names, const char *id) {
char *k;
int r;
@@ -6316,8 +6333,7 @@ void unit_dump_config_items(FILE *f) {
{ config_parse_nsec, "NANOSECONDS" },
{ config_parse_namespace_path_strv, "PATH [...]" },
{ config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" },
- { config_parse_unit_requires_mounts_for,
- "PATH [...]" },
+ { config_parse_unit_mounts_for, "PATH [...]" },
{ config_parse_exec_mount_propagation_flag,
"MOUNTFLAG" },
{ config_parse_unit_string_printf, "STRING" },
@@ -6365,6 +6381,7 @@ void unit_dump_config_items(FILE *f) {
{ config_parse_job_mode_isolate, "BOOLEAN" },
{ config_parse_personality, "PERSONALITY" },
{ config_parse_log_filter_patterns, "REGEX" },
+ { config_parse_mount_node, "NODE" },
};
const char *prev = NULL;
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index 6919805..005b915 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -23,6 +23,7 @@ void unit_dump_config_items(FILE *f);
CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps);
CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps);
CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_reboot_parameter);
CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf);
CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf);
CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths);
@@ -71,7 +72,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string);
CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_notify_access);
CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action);
-CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_mounts_for);
CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter);
CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs);
CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno);
@@ -159,6 +160,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns);
CONFIG_PARSER_PROTOTYPE(config_parse_open_file);
CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch);
CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set);
+CONFIG_PARSER_PROTOTYPE(config_parse_mount_node);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
diff --git a/src/core/main.c b/src/core/main.c
index 1ed968d..4b8a315 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -21,7 +21,7 @@
#include "architecture.h"
#include "argv-util.h"
#if HAVE_LIBBPF
-#include "bpf-lsm.h"
+#include "bpf-restrict-fs.h"
#endif
#include "build.h"
#include "bus-error.h"
@@ -68,6 +68,7 @@
#include "manager-serialize.h"
#include "mkdir-label.h"
#include "mount-setup.h"
+#include "mount-util.h"
#include "os-util.h"
#include "pager.h"
#include "parse-argument.h"
@@ -87,6 +88,7 @@
#include "special.h"
#include "stat-util.h"
#include "stdio-util.h"
+#include "string-table.h"
#include "strv.h"
#include "switch-root.h"
#include "sysctl-util.h"
@@ -121,7 +123,7 @@ static RuntimeScope arg_runtime_scope;
bool arg_dump_core;
int arg_crash_chvt;
bool arg_crash_shell;
-bool arg_crash_reboot;
+CrashAction arg_crash_action;
static char *arg_confirm_spawn;
static ShowStatus arg_show_status;
static StatusUnitFormat arg_status_unit_format;
@@ -140,6 +142,7 @@ static char **arg_default_environment;
static char **arg_manager_environment;
static uint64_t arg_capability_bounding_set;
static bool arg_no_new_privs;
+static int arg_protect_system;
static nsec_t arg_timer_slack_nsec;
static Set* arg_syscall_archs;
static FILE* arg_serialization;
@@ -159,6 +162,16 @@ static char **saved_env = NULL;
static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
const struct rlimit *saved_rlimit_memlock);
+/* String names of the CrashAction enum values, indexed by the enum value itself. */
+static const char* const crash_action_table[_CRASH_ACTION_MAX] = {
+ [CRASH_FREEZE] = "freeze",
+ [CRASH_REBOOT] = "reboot",
+ [CRASH_POWEROFF] = "poweroff",
+};
+
+/* Generates the lookup helpers for the table above; crash_action_from_string() is the one used by the
+ * kernel command line and argv parsers in this file (presumably a to-string variant is generated as
+ * well; confirm against the macro definition). */
+DEFINE_STRING_TABLE_LOOKUP(crash_action, CrashAction);
+
+/* Defines config_parse_crash_action(), which parse_config_file() wires to CrashAction=.
+ * NOTE(review): CRASH_FREEZE is presumably the value applied on an empty assignment; confirm against
+ * DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT. */
+static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_crash_action, crash_action, CrashAction, CRASH_FREEZE, "Invalid crash action");
+
+
static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
_cleanup_free_ char *base = NULL;
_cleanup_strv_free_ char **files = NULL, **dirs = NULL;
@@ -206,13 +219,17 @@ static int console_setup(void) {
r = proc_cmdline_tty_size("/dev/console", &rows, &cols);
if (r < 0)
- log_warning_errno(r, "Failed to get terminal size, ignoring: %m");
+ log_warning_errno(r, "Failed to get /dev/console size, ignoring: %m");
else {
r = terminal_set_size_fd(tty_fd, NULL, rows, cols);
if (r < 0)
- log_warning_errno(r, "Failed to set terminal size, ignoring: %m");
+ log_warning_errno(r, "Failed to set /dev/console size, ignoring: %m");
}
+ r = terminal_reset_ansi_seq(tty_fd);
+ if (r < 0)
+ log_warning_errno(r, "Failed to reset /dev/console using ANSI sequences, ignoring: %m");
+
return 0;
}
@@ -273,7 +290,18 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat
if (r < 0)
log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
else
- arg_crash_reboot = r;
+ arg_crash_action = r ? CRASH_REBOOT : CRASH_FREEZE;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.crash_action")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = crash_action_from_string(value);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse crash action switch %s, ignoring: %m", value);
+ else
+ arg_crash_action = r;
} else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
char *s;
@@ -462,7 +490,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat
if (proc_cmdline_value_missing(key, value))
return 0;
- r = unbase64mem(value, SIZE_MAX, &p, &sz);
+ r = unbase64mem(value, &p, &sz);
if (r < 0)
log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
@@ -610,6 +638,73 @@ static int config_parse_oom_score_adjust(
return 0;
}
+/* Parser for the manager-level ProtectSystem= setting. Stores a tri-state int through data: -1 for an
+ * empty value or "auto", otherwise the result of parse_boolean(). Unparsable values are logged and
+ * ignored, leaving the stored value untouched. Always returns 0. */
+static int config_parse_protect_system_pid1(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *setting = ASSERT_PTR(data);
+        int parsed;
+
+        /* Modelled after the per-service ProtectSystem= setting, but more restricted on the one hand
+         * and more automatic on the other: only yes/no are supported for now (no "strict" or "full"),
+         * and the protection is enabled automatically in the initrd unless configured otherwise.
+         *
+         * Matching the per-service setting more closely may come later, but is not trivial because of
+         * ordering constraints: besides /usr/ not much is mounted yet at the point this logic is
+         * applied. */
+
+        if (isempty(rvalue) || streq(rvalue, "auto"))
+                parsed = -1;
+        else {
+                parsed = parse_boolean(rvalue);
+                if (parsed < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, parsed, "Failed to parse ProtectSystem= argument '%s', ignoring: %m", rvalue);
+                        return 0;
+                }
+        }
+
+        *setting = parsed;
+        return 0;
+}
+
+
+/* Parser for the legacy boolean CrashReboot= setting. It writes to the same CrashAction variable that
+ * CrashAction= uses: true maps to CRASH_REBOOT, false to CRASH_FREEZE.
+ *
+ * An empty assignment resets the value to CRASH_FREEZE, matching what reset_arguments() installs and
+ * the default handed to the CrashAction= parser definition, so that "CrashReboot=" never silently turns
+ * on reboot-on-crash. Unparsable values are logged and ignored. Always returns 0. */
+static int config_parse_crash_reboot(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        CrashAction *v = ASSERT_PTR(data);
+        int r;
+
+        if (isempty(rvalue)) {
+                /* Empty assignment: go back to the built-in default rather than enabling reboot. */
+                *v = CRASH_FREEZE;
+                return 0;
+        }
+
+        r = parse_boolean(rvalue);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashReboot= argument '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        *v = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
+        return 0;
+}
+
+
static int parse_config_file(void) {
const ConfigTableItem items[] = {
{ "Manager", "LogLevel", config_parse_level2, 0, NULL },
@@ -621,7 +716,8 @@ static int parse_config_file(void) {
{ "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
{ "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
{ "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
- { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
+ { "Manager", "CrashReboot", config_parse_crash_reboot, 0, &arg_crash_action },
+ { "Manager", "CrashAction", config_parse_crash_action, 0, &arg_crash_action },
{ "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
{ "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
{ "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
@@ -637,6 +733,7 @@ static int parse_config_file(void) {
{ "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
{ "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
{ "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
+ { "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system },
#if HAVE_SECCOMP
{ "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
#else
@@ -696,11 +793,12 @@ static int parse_config_file(void) {
};
if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
- (void) config_parse_config_file("system.conf",
- "Manager\0",
- config_item_table_lookup, items,
- CONFIG_PARSE_WARN,
- NULL);
+ (void) config_parse_standard_file_with_dropins(
+ "systemd/system.conf",
+ "Manager\0",
+ config_item_table_lookup, items,
+ CONFIG_PARSE_WARN,
+ /* userdata= */ NULL);
else {
_cleanup_strv_free_ char **files = NULL, **dirs = NULL;
int r;
@@ -769,8 +867,8 @@ static void set_manager_settings(Manager *m) {
m->cad_burst_action = arg_cad_burst_action;
/* Note that we don't do structured initialization here, otherwise it will reset the rate limit
* counter on every daemon-reload. */
- m->reload_ratelimit.interval = arg_reload_limit_interval_sec;
- m->reload_ratelimit.burst = arg_reload_limit_burst;
+ m->reload_reexec_ratelimit.interval = arg_reload_limit_interval_sec;
+ m->reload_reexec_ratelimit.burst = arg_reload_limit_burst;
manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
@@ -935,9 +1033,17 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_CRASH_REBOOT:
- r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
+ r = parse_boolean_argument("--crash-reboot", optarg, NULL);
if (r < 0)
return r;
+ arg_crash_action = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
+ break;
+
+ case ARG_CRASH_ACTION:
+ r = crash_action_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse crash action \"%s\": %m", optarg);
+ arg_crash_action = r;
break;
case ARG_CONFIRM_SPAWN:
@@ -1053,7 +1159,7 @@ static int help(void) {
" --unit=UNIT Set default unit\n"
" --dump-core[=BOOL] Dump core on crash\n"
" --crash-vt=NR Change to specified VT on crash\n"
- " --crash-reboot[=BOOL] Reboot on crash\n"
+ " --crash-action=ACTION Specify what to do on crash\n"
" --crash-shell[=BOOL] Run shell on crash\n"
" --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
" --show-status[=BOOL] Show status updates on the console during boot\n"
@@ -1265,7 +1371,7 @@ static void test_usr(void) {
log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
"Some things will probably break (sometimes even silently) in mysterious ways. "
- "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
+ "Consult https://systemd.io/SEPARATE_USR_IS_BROKEN for more information.");
}
static int enforce_syscall_archs(Set *archs) {
@@ -1277,7 +1383,7 @@ static int enforce_syscall_archs(Set *archs) {
r = seccomp_restrict_archs(arg_syscall_archs);
if (r < 0)
- return log_error_errno(r, "Failed to enforce system call architecture restrication: %m");
+ return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
#endif
return 0;
}
@@ -1435,7 +1541,7 @@ static int fixup_environment(void) {
return -errno;
/* The kernels sets HOME=/ for init. Let's undo this. */
- if (path_equal_ptr(getenv("HOME"), "/"))
+ if (path_equal(getenv("HOME"), "/"))
assert_se(unsetenv("HOME") == 0);
return 0;
@@ -1467,32 +1573,37 @@ static int become_shutdown(int objective, int retval) {
[MANAGER_KEXEC] = "kexec",
};
- char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)],
- timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
+ char timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];
_cleanup_strv_free_ char **env_block = NULL;
+ _cleanup_free_ char *max_log_levels = NULL;
usec_t watchdog_timer = 0;
int r;
assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
assert(table[objective]);
- xsprintf(log_level, "--log-level=%d", log_get_max_level());
xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);
- const char* command_line[10] = {
+ const char* command_line[11] = {
SYSTEMD_SHUTDOWN_BINARY_PATH,
table[objective],
- log_level,
timeout,
/* Note that the last position is a terminator and must contain NULL. */
};
- size_t pos = 4;
+ size_t pos = 3;
assert(command_line[pos-1]);
assert(!command_line[pos]);
+ (void) log_max_levels_to_string(log_get_max_level(), &max_log_levels);
+
+ if (max_log_levels) {
+ command_line[pos++] = "--log-level";
+ command_line[pos++] = max_log_levels;
+ }
+
switch (log_get_target()) {
case LOG_TARGET_KMSG:
@@ -1538,7 +1649,7 @@ static int become_shutdown(int objective, int retval) {
(void) watchdog_setup_pretimeout(0);
(void) watchdog_setup_pretimeout_governor(NULL);
r = watchdog_setup(watchdog_timer);
- watchdog_close(r < 0);
+ watchdog_close(/* disarm= */ r < 0);
/* The environment block: */
@@ -1684,6 +1795,35 @@ static void initialize_core_pattern(bool skip_setup) {
arg_early_core_pattern);
}
+/* Applies the manager-level ProtectSystem= setting by turning /usr into a mount point and remounting
+ * it read-only. Does nothing when setup is skipped, when not running as PID 1, when the setting is
+ * off, or when the setting is "auto" (negative) outside of an initrd. Failures are logged and
+ * otherwise ignored. */
+static void apply_protect_system(bool skip_setup) {
+        int r;
+
+        if (skip_setup)
+                return;
+
+        if (getpid_cached() != 1)
+                return;
+
+        if (arg_protect_system == 0)
+                return;
+
+        if (arg_protect_system < 0 && !in_initrd()) {
+                log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
+                return;
+        }
+
+        r = make_mount_point("/usr");
+        if (r < 0) {
+                log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
+                return;
+        }
+
+        /* Read-only bind remount of /usr; the callee is asked to log at LOG_WARNING itself. */
+        r = mount_nofollow_verbose(
+                        LOG_WARNING,
+                        /* what= */ NULL,
+                        "/usr",
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_REMOUNT|MS_RDONLY,
+                        /* options= */ NULL);
+        if (r >= 0)
+                log_info("Successfully made /usr/ read-only.");
+}
+
+
static void update_cpu_affinity(bool skip_setup) {
_cleanup_free_ char *mask = NULL;
@@ -1966,6 +2106,16 @@ static int invoke_main_loop(
"MESSAGE_ID=" SD_MESSAGE_CORE_MAINLOOP_FAILED_STR);
}
+ /* Ensure shutdown timestamp is taken even when bypassing the job engine */
+ if (IN_SET(objective,
+ MANAGER_SOFT_REBOOT,
+ MANAGER_REBOOT,
+ MANAGER_KEXEC,
+ MANAGER_HALT,
+ MANAGER_POWEROFF) &&
+ !dual_timestamp_is_set(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START))
+ dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START);
+
switch (objective) {
case MANAGER_RELOAD: {
@@ -2133,9 +2283,9 @@ static void log_execution_mode(bool *ret_first_boot) {
/* Let's check whether we are in first boot. First, check if an override was
* specified on the kernel command line. If yes, we honour that. */
- r = proc_cmdline_get_bool("systemd.condition-first-boot", /* flags = */ 0, &first_boot);
+ r = proc_cmdline_get_bool("systemd.condition_first_boot", /* flags = */ 0, &first_boot);
if (r < 0)
- log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel command line argument, ignoring: %m");
+ log_debug_errno(r, "Failed to parse systemd.condition_first_boot= kernel command line argument, ignoring: %m");
if (r > 0)
log_full(first_boot ? LOG_INFO : LOG_DEBUG,
@@ -2221,12 +2371,6 @@ static int initialize_runtime(
install_crash_handler();
if (!skip_setup) {
- r = mount_cgroup_controllers();
- if (r < 0) {
- *ret_error_message = "Failed to mount cgroup hierarchies";
- return r;
- }
-
/* Pull credentials from various sources into a common credential directory (we do
* this here, before setting up the machine ID, so that we can use credential info
* for setting up the machine ID) */
@@ -2493,7 +2637,7 @@ static void setenv_manager_environment(void) {
r = putenv_dup(*p, true);
if (r < 0)
- log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p);
+ log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
}
}
@@ -2507,7 +2651,7 @@ static void reset_arguments(void) {
arg_dump_core = true;
arg_crash_chvt = -1;
arg_crash_shell = false;
- arg_crash_reboot = false;
+ arg_crash_action = CRASH_FREEZE;
arg_confirm_spawn = mfree(arg_confirm_spawn);
arg_show_status = _SHOW_STATUS_INVALID;
arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
@@ -2531,6 +2675,7 @@ static void reset_arguments(void) {
arg_capability_bounding_set = CAP_MASK_UNSET;
arg_no_new_privs = false;
+ arg_protect_system = -1;
arg_timer_slack_nsec = NSEC_INFINITY;
arg_syscall_archs = set_free(arg_syscall_archs);
@@ -2952,6 +3097,24 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (!skip_setup) {
+ /* Before we actually start deleting cgroup v1 code, make it harder to boot
+ * in cgroupv1 mode first. See also #30852. */
+
+ r = mount_cgroup_legacy_controllers(loaded_policy);
+ if (r < 0) {
+ if (r == -ERFKILL)
+ error_message = "Refusing to run under cgroup v1, SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE=1 not specified on kernel command line";
+ else
+ error_message = "Failed to mount cgroup v1 hierarchy";
+ goto finish;
+ }
+ if (r > 0) {
+ log_full(LOG_CRIT, "Legacy cgroup v1 support selected. This is no longer supported. Will proceed anyway after 30s.");
+ (void) usleep_safe(30 * USEC_PER_SEC);
+ }
+ }
+
/* The efivarfs is now mounted, let's lock down the system token. */
lock_down_efi_variables();
@@ -3038,9 +3201,12 @@ int main(int argc, char *argv[]) {
cmdline_take_random_seed();
}
- /* A core pattern might have been specified via the cmdline. */
+ /* A core pattern might have been specified via the cmdline. */
initialize_core_pattern(skip_setup);
+ /* Make /usr/ read-only */
+ apply_protect_system(skip_setup);
+
/* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
log_close();
@@ -3196,7 +3362,8 @@ finish:
#endif
if (r < 0)
- (void) sd_notifyf(0, "ERRNO=%i", -r);
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "ERRNO=%i", -r);
/* Try to invoke the shutdown binary unless we already failed.
* If we failed above, we want to freeze after finishing cleanup. */
@@ -3209,7 +3376,8 @@ finish:
/* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
* a mechanism to pick up systemd's exit status in the VM. */
- (void) sd_notifyf(0, "EXIT_STATUS=%i", retval);
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "EXIT_STATUS=%i", retval);
watchdog_free_device();
arg_watchdog_device = mfree(arg_watchdog_device);
diff --git a/src/core/main.h b/src/core/main.h
index b12a1cc..1949a08 100644
--- a/src/core/main.h
+++ b/src/core/main.h
@@ -1,9 +1,21 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
+#include <errno.h>
#include <stdbool.h>
+typedef enum CrashAction {
+ CRASH_FREEZE,
+ CRASH_REBOOT,
+ CRASH_POWEROFF,
+ _CRASH_ACTION_MAX,
+ _CRASH_ACTION_INVALID = -EINVAL,
+} CrashAction;
+
+const char* crash_action_to_string(CrashAction action);
+CrashAction crash_action_from_string(const char *action);
+
extern bool arg_dump_core;
extern int arg_crash_chvt;
extern bool arg_crash_shell;
-extern bool arg_crash_reboot;
+extern CrashAction arg_crash_action;
diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c
index 6c32d78..a12d50c 100644
--- a/src/core/manager-dump.c
+++ b/src/core/manager-dump.c
@@ -64,7 +64,7 @@ static void manager_dump_header(Manager *m, FILE *f, const char *prefix) {
* stable between versions. We take the liberty to restructure it entirely between versions and
* add/remove fields at will. */
- fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", strempty(prefix));
+ fprintf(f, "%sManager: systemd " PROJECT_VERSION_FULL " (" GIT_VERSION ")\n", strempty(prefix));
fprintf(f, "%sFeatures: %s\n", strempty(prefix), systemd_features);
for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c
index 1ac2636..b4af82b 100644
--- a/src/core/manager-serialize.c
+++ b/src/core/manager-serialize.c
@@ -23,11 +23,12 @@ int manager_open_serialization(Manager *m, FILE **ret_f) {
return open_serialization_file("systemd-state", ret_f);
}
-static bool manager_timestamp_shall_serialize(ManagerTimestamp t) {
- if (!in_initrd())
+static bool manager_timestamp_shall_serialize(ManagerObjective o, ManagerTimestamp t) {
+ if (!in_initrd() && o != MANAGER_SOFT_REBOOT)
return true;
- /* The following timestamps only apply to the host system, hence only serialize them there */
+ /* The following timestamps only apply to the host system (or first boot in case of soft-reboot),
+ * hence only serialize them there. */
return !IN_SET(t,
MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH,
MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH,
@@ -108,10 +109,13 @@ int manager_serialize(
(void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]);
(void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden);
+ (void) serialize_item(f, "previous-objective", manager_objective_to_string(m->objective));
+ (void) serialize_item_format(f, "soft-reboots-count", "%u", m->soft_reboots_count);
+
for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
_cleanup_free_ char *joined = NULL;
- if (!manager_timestamp_shall_serialize(q))
+ if (!manager_timestamp_shall_serialize(m->objective, q))
continue;
joined = strjoin(manager_timestamp_to_string(q), "-timestamp");
@@ -139,21 +143,19 @@ int manager_serialize(
}
if (m->user_lookup_fds[0] >= 0) {
- int copy0, copy1;
-
- copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]);
- if (copy0 < 0)
- return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m");
-
- copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]);
- if (copy1 < 0)
- return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m");
+ r = serialize_fd_many(f, fds, "user-lookup", m->user_lookup_fds, 2);
+ if (r < 0)
+ return r;
+ }
- (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1);
+ if (m->handoff_timestamp_fds[0] >= 0) {
+ r = serialize_fd_many(f, fds, "handoff-timestamp-fds", m->handoff_timestamp_fds, 2);
+ if (r < 0)
+ return r;
}
(void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit);
- (void) serialize_ratelimit(f, "reload-ratelimit", &m->reload_ratelimit);
+ (void) serialize_ratelimit(f, "reload-reexec-ratelimit", &m->reload_reexec_ratelimit);
bus_track_serialize(m->subscribed, f, "subscribed");
@@ -443,10 +445,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
if (r < 0)
return r;
- } else if (startswith(l, "env=")) {
- r = deserialize_environment(l + 4, &m->client_environment);
+ } else if ((val = startswith(l, "env="))) {
+ r = deserialize_environment(val, &m->client_environment);
if (r < 0)
- log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l);
+ log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", val);
} else if ((val = startswith(l, "notify-fd="))) {
int fd;
@@ -454,8 +456,7 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
fd = deserialize_fd(fds, val);
if (fd >= 0) {
m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
- safe_close(m->notify_fd);
- m->notify_fd = fd;
+ close_and_replace(m->notify_fd, fd);
}
} else if ((val = startswith(l, "notify-socket="))) {
@@ -469,21 +470,26 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
fd = deserialize_fd(fds, val);
if (fd >= 0) {
m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
- safe_close(m->cgroups_agent_fd);
- m->cgroups_agent_fd = fd;
+ close_and_replace(m->cgroups_agent_fd, fd);
}
} else if ((val = startswith(l, "user-lookup="))) {
- int fd0, fd1;
-
- if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1))
- log_notice("Failed to parse user lookup fd, ignoring: %s", val);
- else {
- m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
- safe_close_pair(m->user_lookup_fds);
- m->user_lookup_fds[0] = fdset_remove(fds, fd0);
- m->user_lookup_fds[1] = fdset_remove(fds, fd1);
- }
+
+ m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+ safe_close_pair(m->user_lookup_fds);
+
+ r = deserialize_fd_many(fds, val, 2, m->user_lookup_fds);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse user-lookup fds: \"%s\", ignoring: %m", val);
+
+ } else if ((val = startswith(l, "handoff-timestamp-fds="))) {
+
+ m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source);
+ safe_close_pair(m->handoff_timestamp_fds);
+
+ r = deserialize_fd_many(fds, val, 2, m->handoff_timestamp_fds);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse handoff-timestamp fds: \"%s\", ignoring: %m", val);
} else if ((val = startswith(l, "dynamic-user=")))
dynamic_user_deserialize_one(m, val, fds, NULL);
@@ -495,8 +501,9 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
(void) exec_shared_runtime_deserialize_one(m, val, fds);
else if ((val = startswith(l, "subscribed="))) {
- if (strv_extend(&m->deserialized_subscribed, val) < 0)
- return -ENOMEM;
+ r = strv_extend(&m->deserialized_subscribed, val);
+ if (r < 0)
+ return r;
} else if ((val = startswith(l, "varlink-server-socket-address="))) {
if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) {
r = manager_varlink_init(m);
@@ -516,9 +523,25 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
(void) varlink_server_deserialize_one(m->varlink_server, val, fds);
} else if ((val = startswith(l, "dump-ratelimit=")))
deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val);
- else if ((val = startswith(l, "reload-ratelimit=")))
- deserialize_ratelimit(&m->reload_ratelimit, "reload-ratelimit", val);
- else {
+ else if ((val = startswith(l, "reload-reexec-ratelimit=")))
+ deserialize_ratelimit(&m->reload_reexec_ratelimit, "reload-reexec-ratelimit", val);
+ else if ((val = startswith(l, "soft-reboots-count="))) {
+ unsigned n;
+
+ if (safe_atou(val, &n) < 0)
+ log_notice("Failed to parse soft reboots counter '%s', ignoring.", val);
+ else
+ m->soft_reboots_count = n;
+ } else if ((val = startswith(l, "previous-objective="))) {
+ ManagerObjective objective;
+
+ objective = manager_objective_from_string(val);
+ if (objective < 0)
+ log_notice("Failed to parse previous objective '%s', ignoring.", val);
+ else
+ m->previous_objective = objective;
+
+ } else {
ManagerTimestamp q;
for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
diff --git a/src/core/manager.c b/src/core/manager.c
index 88eebfc..90e72b0 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -9,7 +9,6 @@
#include <sys/mount.h>
#include <sys/reboot.h>
#include <sys/timerfd.h>
-#include <sys/utsname.h>
#include <sys/wait.h>
#include <unistd.h>
@@ -25,6 +24,7 @@
#include "alloc-util.h"
#include "audit-fd.h"
#include "boot-timestamps.h"
+#include "build-path.h"
#include "bus-common-errors.h"
#include "bus-error.h"
#include "bus-kernel.h"
@@ -36,6 +36,7 @@
#include "constants.h"
#include "core-varlink.h"
#include "creds-util.h"
+#include "daemon-util.h"
#include "dbus-job.h"
#include "dbus-manager.h"
#include "dbus-unit.h"
@@ -55,6 +56,7 @@
#include "inotify-util.h"
#include "install.h"
#include "io-util.h"
+#include "iovec-util.h"
#include "label-util.h"
#include "load-fragment.h"
#include "locale-setup.h"
@@ -88,6 +90,7 @@
#include "strxcpyx.h"
#include "sysctl-util.h"
#include "syslog-util.h"
+#include "taint.h"
#include "terminal-util.h"
#include "time-util.h"
#include "transaction.h"
@@ -122,6 +125,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t
static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
@@ -263,12 +267,11 @@ static void manager_print_jobs_in_progress(Manager *m) {
strempty(status_text));
}
- sd_notifyf(false,
- "STATUS=%sUser job %s/%s running (%s / %s)...",
- job_of_n,
- ident,
- job_type_to_string(j->type),
- time, limit);
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "STATUS=%sUser job %s/%s running (%s / %s)...",
+ job_of_n,
+ ident, job_type_to_string(j->type),
+ time, limit);
m->status_ready = false;
}
@@ -397,7 +400,7 @@ static int manager_setup_time_change(Manager *m) {
return log_error_errno(r, "Failed to create time change event source: %m");
/* Schedule this slightly earlier than the .timer event sources */
- r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1);
+ r = sd_event_source_set_priority(m->time_change_event_source, EVENT_PRIORITY_TIME_CHANGE);
if (r < 0)
return log_error_errno(r, "Failed to set priority of time change event sources: %m");
@@ -464,7 +467,7 @@ static int manager_setup_timezone_change(Manager *m) {
return log_error_errno(r, "Failed to create timezone change event source: %m");
/* Schedule this slightly earlier than the .timer event sources */
- r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1);
+ r = sd_event_source_set_priority(new_event, EVENT_PRIORITY_TIME_ZONE);
if (r < 0)
return log_error_errno(r, "Failed to set priority of timezone change event sources: %m");
@@ -482,21 +485,19 @@ static int enable_special_signals(Manager *m) {
if (MANAGER_IS_TEST_RUN(m))
return 0;
- /* Enable that we get SIGINT on control-alt-del. In containers
- * this will fail with EPERM (older) or EINVAL (newer), so
- * ignore that. */
+ /* Enable that we get SIGINT on control-alt-del. In containers this will fail with EPERM (older) or
+ * EINVAL (newer), so ignore that. */
if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL))
- log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m");
+ log_warning_errno(errno, "Failed to enable ctrl-alt-del handling, ignoring: %m");
fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC);
- if (fd < 0) {
- /* Support systems without virtual console */
- if (fd != -ENOENT)
- log_warning_errno(errno, "Failed to open /dev/tty0: %m");
- } else {
+ if (fd < 0)
+ /* Support systems without virtual console (ENOENT) gracefully */
+ log_full_errno(fd == -ENOENT ? LOG_DEBUG : LOG_WARNING, fd, "Failed to open /dev/tty0, ignoring: %m");
+ else {
/* Enable that we get SIGWINCH on kbrequest */
if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0)
- log_warning_errno(errno, "Failed to enable kbrequest handling: %m");
+ log_warning_errno(errno, "Failed to enable kbrequest handling, ignoring: %m");
}
return 0;
@@ -592,10 +593,21 @@ static int manager_setup_signals(Manager *m) {
* notify processing can still figure out to which process/service a message belongs, before we reap the
* process. Also, process this before handling cgroup notifications, so that we always collect child exit
* status information before detecting that there's no process in a cgroup. */
- r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6);
+ r = sd_event_source_set_priority(m->signal_event_source, EVENT_PRIORITY_SIGNALS);
if (r < 0)
return r;
+ /* Report to supervisor that we now process the above signals. We report this as level "2", to
+ * indicate that we support more than sysvinit's signals (of course, sysvinit never sent this
+ * message, but conceptually it makes sense to consider level "1" to be equivalent to sysvinit's
+ * signal handling). Also, by setting this to "2" people looking for this hopefully won't
+ * misunderstand this as a boolean concept. Signal level 2 shall refer to the signals PID 1
+ * understands at the time of release of systemd v256, i.e. including basic SIGRTMIN+18 handling for
+ * memory pressure and stuff. When more signals are hooked up (or more SIGRTMIN+18 multiplex
+ * operations added), this level should be increased. */
+ (void) sd_notify(/* unset_environment= */ false,
+ "X_SYSTEMD_SIGNALS_LEVEL=2");
+
if (MANAGER_IS_SYSTEM(m))
return enable_special_signals(m);
@@ -641,16 +653,13 @@ static char** sanitize_environment(char **l) {
"TRIGGER_TIMER_REALTIME_USEC",
"TRIGGER_UNIT",
"WATCHDOG_PID",
- "WATCHDOG_USEC",
- NULL);
+ "WATCHDOG_USEC");
/* Let's order the environment alphabetically, just to make it pretty */
return strv_sort(l);
}
int manager_default_environment(Manager *m) {
- int r;
-
assert(m);
m->transient_environment = strv_free(m->transient_environment);
@@ -661,21 +670,39 @@ int manager_default_environment(Manager *m) {
*
* The initial passed environment is untouched to keep /proc/self/environ valid; it is used
* for tagging the init process inside containers. */
- m->transient_environment = strv_new("PATH=" DEFAULT_PATH);
- if (!m->transient_environment)
+ char *path = strjoin("PATH=", default_PATH());
+ if (!path)
+ return log_oom();
+
+ if (strv_consume(&m->transient_environment, path) < 0)
return log_oom();
/* Import locale variables LC_*= from configuration */
(void) locale_setup(&m->transient_environment);
} else {
- /* The user manager passes its own environment along to its children, except for $PATH. */
+ /* The user manager passes its own environment along to its children, except for $PATH and
+ * session envs. */
+
m->transient_environment = strv_copy(environ);
if (!m->transient_environment)
return log_oom();
- r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH);
- if (r < 0)
+ char *path = strjoin("PATH=", default_user_PATH());
+ if (!path)
+ return log_oom();
+
+ if (strv_env_replace_consume(&m->transient_environment, path) < 0)
return log_oom();
+
+ /* Envvars set for our 'manager' class session are private and should not be propagated
+ * to children. Also it's likely that the graphical session will set these on its own. */
+ strv_env_unset_many(m->transient_environment,
+ "XDG_SESSION_ID",
+ "XDG_SESSION_CLASS",
+ "XDG_SESSION_TYPE",
+ "XDG_SESSION_DESKTOP",
+ "XDG_SEAT",
+ "XDG_VTNR");
}
sanitize_environment(m->transient_environment);
@@ -689,18 +716,18 @@ static int manager_setup_prefix(Manager *m) {
};
static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL },
- [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL },
- [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL },
- [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL },
+ [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL },
+ [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL },
+ [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL },
+ [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL },
[EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL },
};
static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = {
- [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL },
- [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL },
- [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL },
- [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" },
+ [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL },
+ [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL },
+ [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL },
+ [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" },
[EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL },
};
@@ -736,7 +763,7 @@ static int manager_setup_run_queue(Manager *m) {
if (r < 0)
return r;
- r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE);
+ r = sd_event_source_set_priority(m->run_queue_event_source, EVENT_PRIORITY_RUN_QUEUE);
if (r < 0)
return r;
@@ -759,7 +786,7 @@ static int manager_setup_sigchld_event_source(Manager *m) {
if (r < 0)
return r;
- r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7);
+ r = sd_event_source_set_priority(m->sigchld_event_source, EVENT_PRIORITY_SIGCHLD);
if (r < 0)
return r;
@@ -861,6 +888,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
*m = (Manager) {
.runtime_scope = runtime_scope,
.objective = _MANAGER_OBJECTIVE_INVALID,
+ .previous_objective = _MANAGER_OBJECTIVE_INVALID,
.status_unit_format = STATUS_UNIT_FORMAT_DEFAULT,
@@ -878,6 +906,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
.cgroups_agent_fd = -EBADF,
.signal_fd = -EBADF,
.user_lookup_fds = EBADF_PAIR,
+ .handoff_timestamp_fds = EBADF_PAIR,
.private_listen_fd = -EBADF,
.dev_autofs_fd = -EBADF,
.cgroup_inotify_fd = -EBADF,
@@ -992,8 +1021,8 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
return r;
#if HAVE_LIBBPF
- if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) {
- r = lsm_bpf_setup(m);
+ if (MANAGER_IS_SYSTEM(m) && bpf_restrict_fs_supported(/* initialize = */ true)) {
+ r = bpf_restrict_fs_setup(m);
if (r < 0)
log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m");
}
@@ -1013,42 +1042,19 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
if (r < 0 && r != -EEXIST)
return r;
+ }
- m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH);
- if (m->executor_fd < 0)
- return log_emergency_errno(errno,
- "Failed to open executor binary '%s': %m",
- SYSTEMD_EXECUTOR_BINARY_PATH);
- } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) {
- _cleanup_free_ char *self_exe = NULL, *executor_path = NULL;
- _cleanup_close_ int self_dir_fd = -EBADF;
- int level = LOG_DEBUG;
-
- /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the
- * build directory. Fallback to working directory and then the installation path. */
- r = readlink_and_make_absolute("/proc/self/exe", &self_exe);
- if (r < 0)
- return r;
-
- self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_PATH|O_DIRECTORY, 0);
- if (self_dir_fd < 0)
- return self_dir_fd;
-
- m->executor_fd = RET_NERRNO(openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH));
- if (m->executor_fd == -ENOENT)
- m->executor_fd = RET_NERRNO(openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH));
- if (m->executor_fd == -ENOENT) {
- m->executor_fd = RET_NERRNO(open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH));
- level = LOG_WARNING; /* Tests should normally use local builds */
- }
+ if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) {
+ m->executor_fd = pin_callout_binary(SYSTEMD_EXECUTOR_BINARY_PATH);
if (m->executor_fd < 0)
- return m->executor_fd;
+ return log_debug_errno(m->executor_fd, "Failed to pin executor binary: %m");
+ _cleanup_free_ char *executor_path = NULL;
r = fd_get_path(m->executor_fd, &executor_path);
if (r < 0)
return r;
- log_full(level, "Using systemd-executor binary from '%s'.", executor_path);
+ log_debug("Using systemd-executor binary from '%s'.", executor_path);
}
/* Note that we do not set up the notify fd here. We do that after deserialization,
@@ -1113,7 +1119,7 @@ static int manager_setup_notify(Manager *m) {
/* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which
* service an exit message belongs. */
- r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8);
+ r = sd_event_source_set_priority(m->notify_event_source, EVENT_PRIORITY_NOTIFY);
if (r < 0)
return log_error_errno(r, "Failed to set priority of notify event source: %m");
@@ -1187,7 +1193,7 @@ static int manager_setup_cgroups_agent(Manager *m) {
/* Process cgroups notifications early. Note that when the agent notification is received
* we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than
* that. Also see handling of cgroup inotify for the unified cgroup stuff. */
- r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ r = sd_event_source_set_priority(m->cgroups_agent_event_source, EVENT_PRIORITY_CGROUP_AGENT);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
@@ -1236,13 +1242,13 @@ static int manager_setup_user_lookup_fd(Manager *m) {
if (!m->user_lookup_event_source) {
r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m);
if (r < 0)
- return log_error_errno(errno, "Failed to allocate user lookup event source: %m");
+ return log_error_errno(r, "Failed to allocate user lookup event source: %m");
/* Process even earlier than the notify event source, so that we always know first about valid UID/GID
* resolutions */
- r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11);
+ r = sd_event_source_set_priority(m->user_lookup_event_source, EVENT_PRIORITY_USER_LOOKUP);
if (r < 0)
- return log_error_errno(errno, "Failed to set priority of user lookup event source: %m");
+ return log_error_errno(r, "Failed to set priority of user lookup event source: %m");
(void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup");
}
@@ -1250,6 +1256,49 @@ static int manager_setup_user_lookup_fd(Manager *m) {
return 0;
}
+static int manager_setup_handoff_timestamp_fd(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Set up the socket pair used for passing timestamps back when the executor processes we fork
+ * off invoke execve(), i.e. when we hand off control to our payload processes. */
+
+ if (m->handoff_timestamp_fds[0] < 0) {
+ m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source);
+ safe_close_pair(m->handoff_timestamp_fds);
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->handoff_timestamp_fds) < 0)
+ return log_error_errno(errno, "Failed to allocate handoff timestamp socket: %m");
+
+ /* Make sure children never have to block */
+ (void) fd_increase_rxbuf(m->handoff_timestamp_fds[0], NOTIFY_RCVBUF_SIZE);
+
+ r = setsockopt_int(m->handoff_timestamp_fds[0], SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ return log_error_errno(r, "SO_PASSCRED failed: %m");
+
+ /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */
+ r = fd_nonblock(m->handoff_timestamp_fds[0], true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make handoff timestamp socket O_NONBLOCK: %m");
+ }
+
+ if (!m->handoff_timestamp_event_source) {
+ r = sd_event_add_io(m->event, &m->handoff_timestamp_event_source, m->handoff_timestamp_fds[0], EPOLLIN, manager_dispatch_handoff_timestamp_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate handoff timestamp event source: %m");
+
+ r = sd_event_source_set_priority(m->handoff_timestamp_event_source, EVENT_PRIORITY_HANDOFF_TIMESTAMP);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of handoff timestamp event source: %m");
+
+ (void) sd_event_source_set_description(m->handoff_timestamp_event_source, "handoff-timestamp");
+ }
+
+ return 0;
+}
+
static unsigned manager_dispatch_cleanup_queue(Manager *m) {
Unit *u;
unsigned n = 0;
@@ -1664,12 +1713,14 @@ Manager* manager_free(Manager *m) {
sd_event_source_unref(m->jobs_in_progress_event_source);
sd_event_source_unref(m->run_queue_event_source);
sd_event_source_unref(m->user_lookup_event_source);
+ sd_event_source_unref(m->handoff_timestamp_event_source);
sd_event_source_unref(m->memory_pressure_event_source);
safe_close(m->signal_fd);
safe_close(m->notify_fd);
safe_close(m->cgroups_agent_fd);
safe_close_pair(m->user_lookup_fds);
+ safe_close_pair(m->handoff_timestamp_fds);
manager_close_ask_password(m);
@@ -1679,7 +1730,7 @@ Manager* manager_free(Manager *m) {
free(m->notify_socket);
- lookup_paths_free(&m->lookup_paths);
+ lookup_paths_done(&m->lookup_paths);
strv_free(m->transient_environment);
strv_free(m->client_environment);
@@ -1691,8 +1742,10 @@ Manager* manager_free(Manager *m) {
unit_defaults_done(&m->defaults);
- assert(hashmap_isempty(m->units_requiring_mounts_for));
- hashmap_free(m->units_requiring_mounts_for);
+ FOREACH_ARRAY(map, m->units_needing_mounts_for, _UNIT_MOUNT_DEPENDENCY_TYPE_MAX) {
+ assert(hashmap_isempty(*map));
+ hashmap_free(*map);
+ }
hashmap_free(m->uid_refs);
hashmap_free(m->gid_refs);
@@ -1708,7 +1761,7 @@ Manager* manager_free(Manager *m) {
m->fw_ctx = fw_ctx_free(m->fw_ctx);
#if BPF_FRAMEWORK
- lsm_bpf_destroy(m->restrict_fs);
+ bpf_restrict_fs_destroy(m->restrict_fs);
#endif
safe_close(m->executor_fd);
@@ -1802,7 +1855,7 @@ static void manager_distribute_fds(Manager *m, FDSet *fds) {
HASHMAP_FOREACH(u, m->units) {
- if (fdset_size(fds) <= 0)
+ if (fdset_isempty(fds))
break;
if (!UNIT_VTABLE(u)->distribute_fds)
@@ -1973,6 +2026,20 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
return log_error_errno(r, "Deserialization failed: %m");
}
+ if (m->previous_objective >= 0) {
+ if (IN_SET(m->previous_objective, MANAGER_REEXECUTE, MANAGER_SOFT_REBOOT, MANAGER_SWITCH_ROOT))
+ log_debug("Launching as effect of a '%s' operation.",
+ manager_objective_to_string(m->previous_objective));
+ else
+ log_warning("Got unexpected previous objective '%s', ignoring.",
+ manager_objective_to_string(m->previous_objective));
+ }
+
+ /* If we are in a new soft-reboot iteration bump the counter now before starting units, so
+ * that they can reliably read it. We get the previous objective from serialized state. */
+ if (m->previous_objective == MANAGER_SOFT_REBOOT)
+ m->soft_reboots_count++;
+
/* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass
* some file descriptors to us pre-initialized. This enables socket-based activation of entire
* containers. */
@@ -1994,6 +2061,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
/* This shouldn't fail, except if things are really broken. */
return r;
+ r = manager_setup_handoff_timestamp_fd(m);
+ if (r < 0)
+ /* This shouldn't fail, except if things are really broken. */
+ return r;
+
/* Connect to the bus if we are good for it */
manager_setup_bus(m);
@@ -2203,8 +2275,8 @@ static int manager_dispatch_target_deps_queue(Manager *m) {
if (n_targets < 0)
return n_targets;
- for (int i = 0; i < n_targets; i++) {
- r = unit_add_default_target_dependency(u, targets[i]);
+ FOREACH_ARRAY(i, targets, n_targets) {
+ r = unit_add_default_target_dependency(u, *i);
if (r < 0)
return r;
}
@@ -2303,7 +2375,7 @@ int manager_load_unit_prepare(
Unit *unit = manager_get_unit(m, name);
if (unit) {
- /* The time-based cache allows to start new units without daemon-reload,
+ /* The time-based cache allows new units to be started without daemon-reload,
* but if they are already referenced (because of dependencies or ordering)
* then we have to force a load of the fragment. As an optimization, check
* first if anything in the usual paths was modified since the last time
@@ -2403,7 +2475,7 @@ void manager_clear_jobs(Manager *m) {
job_finish_and_invalidate(j, JOB_CANCELED, false, false);
}
-void manager_unwatch_pidref(Manager *m, PidRef *pid) {
+void manager_unwatch_pidref(Manager *m, const PidRef *pid) {
assert(m);
for (;;) {
@@ -2586,22 +2658,70 @@ static void manager_invoke_notify_message(
UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);
else if (DEBUG_LOGGING) {
- _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL;
+ _cleanup_free_ char *joined = strv_join(tags, ", ");
+ char buf[CELLESCAPE_DEFAULT_LENGTH];
+
+ log_unit_debug(u, "Got notification message from unexpected unit type, ignoring: %s",
+ joined ? cellescape(buf, sizeof(buf), joined) : "(null)");
+ }
+}
+
+static int manager_get_units_for_pidref(Manager *m, const PidRef *pidref, Unit ***ret_units) {
+ /* Determine array of every unit that is interested in the specified process */
+
+ assert(m);
+ assert(pidref_is_set(pidref));
- buf = strv_join(tags, ", ");
- if (buf)
- x = ellipsize(buf, 20, 90);
- if (x)
- y = cescape(x);
+ Unit *u1, *u2, **array;
+ u1 = manager_get_unit_by_pidref_cgroup(m, pidref);
+ u2 = hashmap_get(m->watch_pids, pidref);
+ array = hashmap_get(m->watch_pids_more, pidref);
+
+ size_t n = 0;
+ if (u1)
+ n++;
+ if (u2)
+ n++;
+ if (array)
+ for (size_t j = 0; array[j]; j++)
+ n++;
+
+ assert(n <= INT_MAX); /* Make sure we can reasonably return the counter as "int" */
+
+ if (ret_units) {
+ _cleanup_free_ Unit **units = NULL;
+
+ if (n > 0) {
+ units = new(Unit*, n + 1);
+ if (!units)
+ return -ENOMEM;
+
+ /* We return a dense array, and put the "main" unit first, i.e. unit in whose cgroup
+ * the process currently is. Note that we do not bother with filtering duplicates
+ * here. */
+
+ size_t i = 0;
+ if (u1)
+ units[i++] = u1;
+ if (u2)
+ units[i++] = u2;
+ if (array)
+ for (size_t j = 0; array[j]; j++)
+ units[i++] = array[j];
+ assert(i == n);
+
+ units[i] = NULL; /* end array in an extra NULL */
+ }
- log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y));
+ *ret_units = TAKE_PTR(units);
}
+
+ return (int) n;
}
static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
-
- _cleanup_fdset_free_ FDSet *fds = NULL;
Manager *m = ASSERT_PTR(userdata);
+ _cleanup_fdset_free_ FDSet *fds = NULL;
char buf[NOTIFY_BUFFER_MAX+1];
struct iovec iovec = {
.iov_base = buf,
@@ -2618,12 +2738,9 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
struct cmsghdr *cmsg;
struct ucred *ucred = NULL;
- _cleanup_free_ Unit **array_copy = NULL;
_cleanup_strv_free_ char **tags = NULL;
- Unit *u1, *u2, **array;
int r, *fd_array = NULL;
size_t n_fds = 0;
- bool found = false;
ssize_t n;
assert(m->notify_fd == fd);
@@ -2711,39 +2828,22 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
PidRef pidref = PIDREF_MAKE_FROM_PID(ucred->pid);
/* Notify every unit that might be interested, which might be multiple. */
- u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
- u2 = hashmap_get(m->watch_pids, &pidref);
- array = hashmap_get(m->watch_pids_more, &pidref);
- if (array) {
- size_t k = 0;
+ _cleanup_free_ Unit **array = NULL;
- while (array[k])
- k++;
-
- array_copy = newdup(Unit*, array, k+1);
- if (!array_copy)
- log_oom();
- }
- /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle
- * duplicate units make sure we only invoke each unit's handler once. */
- if (u1) {
- manager_invoke_notify_message(m, u1, ucred, tags, fds);
- found = true;
- }
- if (u2) {
- manager_invoke_notify_message(m, u2, ucred, tags, fds);
- found = true;
+ int n_array = manager_get_units_for_pidref(m, &pidref, &array);
+ if (n_array < 0) {
+ log_warning_errno(n_array, "Failed to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid);
+ return 0;
}
- if (array_copy)
- for (size_t i = 0; array_copy[i]; i++) {
- manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds);
- found = true;
- }
-
- if (!found)
- log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
+ if (n_array == 0)
+ log_debug("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
+ else
+ /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle
+ * duplicate units – making sure we only invoke each unit's handler once. */
+ FOREACH_ARRAY(u, array, n_array)
+ manager_invoke_notify_message(m, *u, ucred, tags, fds);
- if (fdset_size(fds) > 0)
+ if (!fdset_isempty(fds))
log_warning("Got extra auxiliary fds with notification message, closing them.");
return 0;
@@ -2792,10 +2892,7 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
goto turn_off;
if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
- _cleanup_free_ Unit **array_copy = NULL;
_cleanup_free_ char *name = NULL;
- Unit *u1, *u2, **array;
-
(void) pid_get_comm(si.si_pid, &name);
log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
@@ -2813,41 +2910,27 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
* pidfd here any more even if we wanted (since the process just exited). */
PidRef pidref = PIDREF_MAKE_FROM_PID(si.si_pid);
- /* And now figure out the unit this belongs to, it might be multiple... */
- u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
- u2 = hashmap_get(m->watch_pids, &pidref);
- array = hashmap_get(m->watch_pids_more, &pidref);
- if (array) {
- size_t n = 0;
-
- /* Count how many entries the array has */
- while (array[n])
- n++;
-
- /* Make a copy of the array so that we don't trip up on the array changing beneath us */
- array_copy = newdup(Unit*, array, n+1);
- if (!array_copy)
- log_oom();
- }
-
- /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
- * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
- * each iteration. */
- if (u1) {
- /* We check for oom condition, in case we got SIGCHLD before the oom notification.
- * We only do this for the cgroup the PID belonged to. */
- (void) unit_check_oom(u1);
+ /* And now figure out the units this belongs to, there might be multiple... */
+ _cleanup_free_ Unit **array = NULL;
+ int n_array = manager_get_units_for_pidref(m, &pidref, &array);
+ if (n_array < 0)
+ log_warning_errno(n_array, "Failed to get units for process " PID_FMT ", ignoring: %m", si.si_pid);
+ else if (n_array == 0)
+ log_debug("Got SIGCHLD for process " PID_FMT " we weren't interested in, ignoring.", si.si_pid);
+ else {
+ /* We check for an OOM condition, in case we got SIGCHLD before the OOM notification.
+ * We only do this for the cgroup the PID belonged to, which is the first entry of the array (when the cgroup lookup found a unit). */
+ (void) unit_check_oom(array[0]);
/* We check if systemd-oomd performed a kill so that we log and notify appropriately */
- (void) unit_check_oomd_kill(u1);
+ (void) unit_check_oomd_kill(array[0]);
- manager_invoke_sigchld_event(m, u1, &si);
+ /* Finally, execute them all. Note that the array might contain duplicates, but that's fine,
+ * manager_invoke_sigchld_event() will ensure we only invoke the handlers once for each
+ * iteration. */
+ FOREACH_ARRAY(u, array, n_array)
+ manager_invoke_sigchld_event(m, *u, &si);
}
- if (u2)
- manager_invoke_sigchld_event(m, u2, &si);
- if (array_copy)
- for (size_t i = 0; array_copy[i]; i++)
- manager_invoke_sigchld_event(m, array_copy[i], &si);
}
/* And now, we actually reap the zombie. */
@@ -2878,8 +2961,8 @@ static void manager_start_special(Manager *m, const char *name, JobMode mode) {
log_info("Activating special unit %s...", s);
- sd_notifyf(false,
- "STATUS=Activating special unit %s...", s);
+ (void) sd_notifyf(/* unset_environment= */ false,
+ "STATUS=Activating special unit %s...", s);
m->status_ready = false;
}
@@ -2986,7 +3069,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t
r = manager_get_dump_string(m, /* patterns= */ NULL, &dump);
if (r < 0) {
- log_warning_errno(errno, "Failed to acquire manager dump: %m");
+ log_warning_errno(r, "Failed to acquire manager dump: %m");
break;
}
@@ -3008,9 +3091,9 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t
const char *target;
JobMode mode;
} target_table[] = {
- [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE },
- [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE },
- [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE },
+ [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE },
+ [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE },
+ [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE },
[3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY },
[4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY },
[5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY },
@@ -3077,7 +3160,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t
r = manager_get_dump_jobs_string(m, /* patterns= */ NULL, " ", &dump_jobs);
if (r < 0) {
- log_warning_errno(errno, "Failed to acquire manager jobs dump: %m");
+ log_warning_errno(r, "Failed to acquire manager jobs dump: %m");
break;
}
@@ -3371,16 +3454,18 @@ void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {
const char *msg;
int audit_fd, r;
+ assert(m);
+ assert(u);
+
if (!MANAGER_IS_SYSTEM(m))
return;
- audit_fd = get_audit_fd();
- if (audit_fd < 0)
+ /* Don't generate audit events if the service was already started and we're just deserializing */
+ if (MANAGER_IS_RELOADING(m))
return;
- /* Don't generate audit events if the service was already
- * started and we're just deserializing */
- if (MANAGER_IS_RELOADING(m))
+ audit_fd = get_audit_fd();
+ if (audit_fd < 0)
return;
r = unit_name_to_prefix_and_instance(u->id, &p);
@@ -3399,21 +3484,22 @@ void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {
log_warning_errno(errno, "Failed to send audit message, ignoring: %m");
}
#endif
-
}
void manager_send_unit_plymouth(Manager *m, Unit *u) {
_cleanup_free_ char *message = NULL;
int c, r;
- /* Don't generate plymouth events if the service was already
- * started and we're just deserializing */
- if (MANAGER_IS_RELOADING(m))
- return;
+ assert(m);
+ assert(u);
if (!MANAGER_IS_SYSTEM(m))
return;
+ /* Don't generate plymouth events if the service was already started and we're just deserializing */
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
if (detect_container() > 0)
return;
@@ -3431,6 +3517,27 @@ void manager_send_unit_plymouth(Manager *m, Unit *u) {
"Failed to communicate with plymouth: %m");
}
+void manager_send_unit_supervisor(Manager *m, Unit *u, bool active) {
+ assert(m);
+ assert(u);
+
+ /* Notify a "supervisor" process about our progress, i.e. a container manager, hypervisor, or
+ * surrounding service manager. */
+
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
+ if (!UNIT_VTABLE(u)->notify_supervisor)
+ return;
+
+ if (in_initrd()) /* Only send these once we left the initrd */
+ return;
+
+ (void) sd_notifyf(/* unset_environment= */ false,
+ active ? "X_SYSTEMD_UNIT_ACTIVE=%s" : "X_SYSTEMD_UNIT_INACTIVE=%s",
+ u->id);
+}
+
usec_t manager_get_watchdog(Manager *m, WatchdogType t) {
assert(m);
@@ -3566,7 +3673,7 @@ int manager_reload(Manager *m) {
manager_clear_jobs_and_units(m);
lookup_paths_flush_generator(&m->lookup_paths);
- lookup_paths_free(&m->lookup_paths);
+ lookup_paths_done(&m->lookup_paths);
exec_shared_runtime_vacuum(m);
dynamic_user_vacuum(m, false);
m->uid_refs = hashmap_free(m->uid_refs);
@@ -3601,6 +3708,7 @@ int manager_reload(Manager *m) {
(void) manager_setup_notify(m);
(void) manager_setup_cgroups_agent(m);
(void) manager_setup_user_lookup_fd(m);
+ (void) manager_setup_handoff_timestamp_fd(m);
/* Third, fire things up! */
manager_coldplug(m);
@@ -3645,8 +3753,6 @@ bool manager_unit_inactive_or_pending(Manager *m, const char *name) {
}
static void log_taint_string(Manager *m) {
- _cleanup_free_ char *taint = NULL;
-
assert(m);
if (MANAGER_IS_USER(m) || m->taint_logged)
@@ -3654,7 +3760,7 @@ static void log_taint_string(Manager *m) {
m->taint_logged = true; /* only check for taint once */
- taint = manager_taint_string(m);
+ _cleanup_free_ char *taint = taint_string();
if (isempty(taint))
return;
@@ -3670,7 +3776,19 @@ static void manager_notify_finished(Manager *m) {
if (MANAGER_IS_TEST_RUN(m))
return;
- if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) {
+ if (MANAGER_IS_SYSTEM(m) && m->soft_reboots_count > 0) {
+ /* The soft-reboot case, where we only report data for the last reboot */
+ firmware_usec = loader_usec = initrd_usec = kernel_usec = 0;
+ total_usec = userspace_usec = usec_sub_unsigned(m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic,
+ m->timestamps[MANAGER_TIMESTAMP_SHUTDOWN_START].monotonic);
+
+ log_struct(LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
+ "USERSPACE_USEC="USEC_FMT, userspace_usec,
+ LOG_MESSAGE("Soft-reboot finished in %s, counter is now at %u.",
+ FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC),
+ m->soft_reboots_count));
+ } else if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) {
char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")]
= {};
char *p = buf;
@@ -3740,7 +3858,7 @@ static void manager_notify_finished(Manager *m) {
log_taint_string(m);
}
-static void user_manager_send_ready(Manager *m) {
+static void manager_send_ready_user_scope(Manager *m) {
int r;
assert(m);
@@ -3749,7 +3867,7 @@ static void user_manager_send_ready(Manager *m) {
if (!MANAGER_IS_USER(m) || m->ready_sent)
return;
- r = sd_notify(false,
+ r = sd_notify(/* unset_environment= */ false,
"READY=1\n"
"STATUS=Reached " SPECIAL_BASIC_TARGET ".");
if (r < 0)
@@ -3759,14 +3877,19 @@ static void user_manager_send_ready(Manager *m) {
m->status_ready = false;
}
-static void manager_send_ready(Manager *m) {
+static void manager_send_ready_system_scope(Manager *m) {
int r;
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ /* Skip the notification if nothing changed. */
if (m->ready_sent && m->status_ready)
- /* Skip the notification if nothing changed. */
return;
- r = sd_notify(false,
+ r = sd_notify(/* unset_environment= */ false,
"READY=1\n"
"STATUS=Ready.");
if (r < 0)
@@ -3790,7 +3913,7 @@ static void manager_check_basic_target(Manager *m) {
return;
/* For user managers, send out READY=1 as soon as we reach basic.target */
- user_manager_send_ready(m);
+ manager_send_ready_user_scope(m);
/* Log the taint string as soon as we reach basic.target */
log_taint_string(m);
@@ -3808,7 +3931,7 @@ void manager_check_finished(Manager *m) {
manager_check_basic_target(m);
- if (hashmap_size(m->jobs) > 0) {
+ if (!hashmap_isempty(m->jobs)) {
if (m->jobs_in_progress_event_source)
/* Ignore any failure, this is only for feedback */
(void) sd_event_source_set_time(m->jobs_in_progress_event_source,
@@ -3821,7 +3944,7 @@ void manager_check_finished(Manager *m) {
if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10)
m->jobs = hashmap_free(m->jobs);
- manager_send_ready(m);
+ manager_send_ready_system_scope(m);
/* Notify Type=idle units that we are done now */
manager_close_idle_pipe(m);
@@ -3851,9 +3974,7 @@ void manager_send_reloading(Manager *m) {
assert(m);
/* Let whoever invoked us know that we are now reloading */
- (void) sd_notifyf(/* unset= */ false,
- "RELOADING=1\n"
- "MONOTONIC_USEC=" USEC_FMT "\n", now(CLOCK_MONOTONIC));
+ (void) notify_reloading_full(/* status = */ NULL);
/* And ensure that we'll send READY=1 again as soon as we are ready again */
m->ready_sent = false;
@@ -3878,8 +3999,8 @@ static int manager_run_environment_generators(Manager *m) {
_cleanup_strv_free_ char **paths = NULL;
void* args[] = {
[STDOUT_GENERATE] = &tmp,
- [STDOUT_COLLECT] = &tmp,
- [STDOUT_CONSUME] = &m->transient_environment,
+ [STDOUT_COLLECT] = &tmp,
+ [STDOUT_CONSUME] = &m->transient_environment,
};
int r;
@@ -4040,7 +4161,7 @@ static int manager_run_generators(Manager *m) {
/* On some systems /tmp/ doesn't exist, and on some other systems we cannot create it at all. Avoid
* trying to mount a private tmpfs on it as there's no one size fits all. */
- if (is_dir("/tmp", /* follow= */ false) > 0)
+ if (is_dir("/tmp", /* follow= */ false) > 0 && !MANAGER_IS_TEST_RUN(m))
flags |= FORK_PRIVATE_TMP;
r = safe_fork("(sd-gens)", flags, NULL);
@@ -4373,7 +4494,7 @@ void manager_override_show_status(Manager *m, ShowStatus mode, const char *reaso
set_show_status_marker(show_status_on(mode));
}
-const char *manager_get_confirm_spawn(Manager *m) {
+const char* manager_get_confirm_spawn(Manager *m) {
static int last_errno = 0;
struct stat st;
int r;
@@ -4478,14 +4599,15 @@ void manager_status_printf(Manager *m, StatusType type, const char *status, cons
va_end(ap);
}
-Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) {
+Set* manager_get_units_needing_mounts_for(Manager *m, const char *path, UnitMountDependencyType t) {
assert(m);
assert(path);
+ assert(t >= 0 && t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX);
if (path_equal(path, "/"))
path = "";
- return hashmap_get(m->units_requiring_mounts_for, path);
+ return hashmap_get(m->units_needing_mounts_for[t], path);
}
int manager_update_failed_units(Manager *m, Unit *u, bool failed) {
@@ -4542,7 +4664,7 @@ ManagerState manager_state(Manager *m) {
}
/* Are there any failed units? If so, we are in degraded mode */
- if (set_size(m->failed_units) > 0)
+ if (!set_isempty(m->failed_units))
return MANAGER_DEGRADED;
return MANAGER_RUNNING;
@@ -4701,20 +4823,19 @@ static void manager_vacuum(Manager *m) {
exec_shared_runtime_vacuum(m);
}
-int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
struct buffer {
uid_t uid;
gid_t gid;
char unit_name[UNIT_NAME_MAX+1];
} _packed_ buffer;
- Manager *m = userdata;
+ Manager *m = ASSERT_PTR(userdata);
ssize_t l;
size_t n;
Unit *u;
- assert_se(source);
- assert_se(m);
+ assert(source);
/* Invoked whenever a child process succeeded resolving its user/group to use and sent us the
* resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that
@@ -4763,76 +4884,71 @@ int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t re
return 0;
}
-static int short_uid_range(const char *path) {
- _cleanup_(uid_range_freep) UidRange *p = NULL;
- int r;
-
- assert(path);
-
- /* Taint systemd if we the UID range assigned to this environment doesn't at least cover 0…65534,
- * i.e. from root to nobody. */
-
- r = uid_range_load_userns(&p, path);
- if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
- return false;
- if (r < 0)
- return log_debug_errno(r, "Failed to load %s: %m", path);
-
- return !uid_range_covers(p, 0, 65535);
-}
-
-char* manager_taint_string(const Manager *m) {
- /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at
- * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the
- * configuration phase. */
-
- assert(m);
-
- const char* stage[12] = {};
- size_t n = 0;
-
- _cleanup_free_ char *usrbin = NULL;
- if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin"))
- stage[n++] = "unmerged-usr";
+static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ usec_t ts[2] = {};
+ CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control;
+ struct msghdr msghdr = {
+ .msg_iov = &IOVEC_MAKE(ts, sizeof(ts)),
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+ ssize_t n;
- if (access("/proc/cgroups", F_OK) < 0)
- stage[n++] = "cgroups-missing";
+ assert(source);
- if (cg_all_unified() == 0)
- stage[n++] = "cgroupsv1";
+ n = recvmsg_safe(m->handoff_timestamp_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
+ if (ERRNO_IS_NEG_TRANSIENT(n))
+ return 0; /* Spurious wakeup, try again */
+ if (n == -EXFULL) {
+ log_warning("Got message with truncated control, ignoring.");
+ return 0;
+ }
+ if (n < 0)
+ return log_error_errno(n, "Failed to receive handoff timestamp message: %m");
- if (clock_is_localtime(NULL) > 0)
- stage[n++] = "local-hwclock";
+ if (msghdr.msg_flags & MSG_TRUNC) {
+ log_warning("Got truncated handoff timestamp message, ignoring.");
+ return 0;
+ }
+ if (n != sizeof(ts)) {
+ log_warning("Got handoff timestamp message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(ts));
+ return 0;
+ }
- if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0)
- stage[n++] = "support-ended";
+ struct ucred *ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
+ if (!ucred || !pid_is_valid(ucred->pid)) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
- _cleanup_free_ char *destination = NULL;
- if (readlink_malloc("/var/run", &destination) < 0 ||
- !PATH_IN_SET(destination, "../run", "/run"))
- stage[n++] = "var-run-bad";
+ log_debug("Got handoff timestamp event for PID " PID_FMT ".", ucred->pid);
- _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL;
- if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 &&
- !streq(overflowuid, "65534"))
- stage[n++] = "overflowuid-not-65534";
- if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 &&
- !streq(overflowgid, "65534"))
- stage[n++] = "overflowgid-not-65534";
+ _cleanup_free_ Unit **units = NULL;
+ int n_units = manager_get_units_for_pidref(m, &PIDREF_MAKE_FROM_PID(ucred->pid), &units);
+ if (n_units < 0) {
+ log_warning_errno(n_units, "Unable to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid);
+ return 0;
+ }
+ if (n_units == 0) {
+ log_debug("Got handoff timestamp for process " PID_FMT " we are not interested in, ignoring.", ucred->pid);
+ return 0;
+ }
- struct utsname uts;
- assert_se(uname(&uts) >= 0);
- if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
- stage[n++] = "old-kernel";
+ dual_timestamp dt = {
+ .realtime = ts[0],
+ .monotonic = ts[1],
+ };
- if (short_uid_range("/proc/self/uid_map") > 0)
- stage[n++] = "short-uid-range";
- if (short_uid_range("/proc/self/gid_map") > 0)
- stage[n++] = "short-gid-range";
+ FOREACH_ARRAY(u, units, n_units) {
+ if (!UNIT_VTABLE(*u)->notify_handoff_timestamp)
+ continue;
- assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */
+ UNIT_VTABLE(*u)->notify_handoff_timestamp(*u, ucred, &dt);
+ }
- return strv_join((char**) stage, ":");
+ return 0;
}
void manager_ref_console(Manager *m) {
@@ -4988,14 +5104,13 @@ LogTarget manager_get_executor_log_target(Manager *m) {
assert(m);
/* If journald is not available tell sd-executor to go to kmsg, as it might be starting journald */
+ if (!MANAGER_IS_TEST_RUN(m) && !manager_journal_is_running(m))
+ return LOG_TARGET_KMSG;
- if (manager_journal_is_running(m))
- return log_get_target();
-
- return LOG_TARGET_KMSG;
+ return log_get_target();
}
-static const char *const manager_state_table[_MANAGER_STATE_MAX] = {
+static const char* const manager_state_table[_MANAGER_STATE_MAX] = {
[MANAGER_INITIALIZING] = "initializing",
[MANAGER_STARTING] = "starting",
[MANAGER_RUNNING] = "running",
@@ -5006,7 +5121,22 @@ static const char *const manager_state_table[_MANAGER_STATE_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState);
-static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
+static const char* const manager_objective_table[_MANAGER_OBJECTIVE_MAX] = {
+ [MANAGER_OK] = "ok",
+ [MANAGER_EXIT] = "exit",
+ [MANAGER_RELOAD] = "reload",
+ [MANAGER_REEXECUTE] = "reexecute",
+ [MANAGER_REBOOT] = "reboot",
+ [MANAGER_SOFT_REBOOT] = "soft-reboot",
+ [MANAGER_POWEROFF] = "poweroff",
+ [MANAGER_HALT] = "halt",
+ [MANAGER_KEXEC] = "kexec",
+ [MANAGER_SWITCH_ROOT] = "switch-root",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_objective, ManagerObjective);
+
+static const char* const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
[MANAGER_TIMESTAMP_FIRMWARE] = "firmware",
[MANAGER_TIMESTAMP_LOADER] = "loader",
[MANAGER_TIMESTAMP_KERNEL] = "kernel",
@@ -5026,6 +5156,7 @@ static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish",
[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start",
[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish",
+ [MANAGER_TIMESTAMP_SHUTDOWN_START] = "shutdown-start",
};
DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp);
diff --git a/src/core/manager.h b/src/core/manager.h
index d96eb7b..0641b27 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -120,6 +120,9 @@ typedef enum ManagerTimestamp {
MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH,
MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START,
MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH,
+
+ MANAGER_TIMESTAMP_SHUTDOWN_START,
+
_MANAGER_TIMESTAMP_MAX,
_MANAGER_TIMESTAMP_INVALID = -EINVAL,
} ManagerTimestamp;
@@ -137,6 +140,7 @@ typedef enum WatchdogType {
#include "path-lookup.h"
#include "show-status.h"
#include "unit-name.h"
+#include "unit.h"
typedef enum ManagerTestRunFlags {
MANAGER_TEST_NORMAL = 0, /* run normally */
@@ -282,6 +286,9 @@ struct Manager {
int user_lookup_fds[2];
sd_event_source *user_lookup_event_source;
+ int handoff_timestamp_fds[2];
+ sd_event_source *handoff_timestamp_event_source;
+
RuntimeScope runtime_scope;
LookupPaths lookup_paths;
@@ -375,6 +382,8 @@ struct Manager {
bool etc_localtime_accessible;
ManagerObjective objective;
+ /* Objective as it was before serialization, mostly to detect soft-reboots */
+ ManagerObjective previous_objective;
/* Flags */
bool dispatching_load_queue;
@@ -438,10 +447,9 @@ struct Manager {
/* This is true before and after switching root. */
bool switching_root;
- /* This maps all possible path prefixes to the units needing
- * them. It's a hashmap with a path string as key and a Set as
- * value where Unit objects are contained. */
- Hashmap *units_requiring_mounts_for;
+ /* These map all possible path prefixes to the units needing them. They are hashmaps with a path
+ * string as key, and a Set as value where Unit objects are contained. */
+ Hashmap *units_needing_mounts_for[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX];
/* Used for processing polkit authorization responses */
Hashmap *polkit_registry;
@@ -488,8 +496,8 @@ struct Manager {
/* Reference to RestrictFileSystems= BPF program */
struct restrict_fs_bpf *restrict_fs;
- /* Allow users to configure a rate limit for Reload() operations */
- RateLimit reload_ratelimit;
+ /* Allow users to configure a rate limit for Reload()/Reexecute() operations */
+ RateLimit reload_reexec_ratelimit;
/* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
RateLimit dump_ratelimit;
@@ -501,6 +509,8 @@ struct Manager {
/* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have
* serialization/deserialization compatibility issues during upgrades. */
int executor_fd;
+
+ unsigned soft_reboots_count;
};
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
@@ -550,7 +560,7 @@ int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error
void manager_clear_jobs(Manager *m);
-void manager_unwatch_pidref(Manager *m, PidRef *pid);
+void manager_unwatch_pidref(Manager *m, const PidRef *pid);
unsigned manager_dispatch_load_queue(Manager *m);
@@ -575,6 +585,7 @@ void manager_reset_failed(Manager *m);
void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success);
void manager_send_unit_plymouth(Manager *m, Unit *u);
+void manager_send_unit_supervisor(Manager *m, Unit *u, bool active);
bool manager_unit_inactive_or_pending(Manager *m, const char *name);
@@ -596,7 +607,7 @@ double manager_get_progress(Manager *m);
void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5);
-Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path);
+Set* manager_get_units_needing_mounts_for(Manager *m, const char *path, UnitMountDependencyType t);
ManagerState manager_state(Manager *m);
@@ -608,8 +619,6 @@ int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc);
void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now);
int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc);
-char* manager_taint_string(const Manager *m);
-
void manager_ref_console(Manager *m);
void manager_unref_console(Manager *m);
@@ -619,13 +628,16 @@ void manager_restore_original_log_level(Manager *m);
void manager_override_log_target(Manager *m, LogTarget target);
void manager_restore_original_log_target(Manager *m);
-const char *manager_state_to_string(ManagerState m) _const_;
+const char* manager_get_confirm_spawn(Manager *m);
+void manager_disable_confirm_spawn(void);
+
+const char* manager_state_to_string(ManagerState m) _const_;
ManagerState manager_state_from_string(const char *s) _pure_;
-const char *manager_get_confirm_spawn(Manager *m);
-void manager_disable_confirm_spawn(void);
+const char* manager_objective_to_string(ManagerObjective m) _const_;
+ManagerObjective manager_objective_from_string(const char *s) _pure_;
-const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
+const char* manager_timestamp_to_string(ManagerTimestamp m) _const_;
ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
@@ -644,3 +656,26 @@ OOMPolicy oom_policy_from_string(const char *s) _pure_;
void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope);
void unit_defaults_done(UnitDefaults *defaults);
+
+enum {
+ /* most important … */
+ EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11,
+ EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
+ EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
+ EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */
+ EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */
+ EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8,
+ EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7,
+ EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
+ EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,
+ EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4,
+ EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3,
+ EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2,
+ EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1,
+ EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1,
+ EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL,
+ EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE,
+ EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1,
+ EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2,
+ /* … to least important */
+};
diff --git a/src/core/meson.build b/src/core/meson.build
index 7701d3d..7a2012a 100644
--- a/src/core/meson.build
+++ b/src/core/meson.build
@@ -7,7 +7,8 @@ libcore_sources = files(
'bpf-devices.c',
'bpf-firewall.c',
'bpf-foreign.c',
- 'bpf-lsm.c',
+ 'bpf-restrict-fs.c',
+ 'bpf-restrict-ifaces.c',
'bpf-socket-bind.c',
'cgroup.c',
'core-varlink.c',
@@ -51,7 +52,6 @@ libcore_sources = files(
'mount.c',
'namespace.c',
'path.c',
- 'restrict-ifaces.c',
'scope.c',
'selinux-access.c',
'selinux-setup.c',
@@ -61,6 +61,7 @@ libcore_sources = files(
'smack-setup.c',
'socket.c',
'swap.c',
+ 'taint.c',
'target.c',
'timer.c',
'transaction.c',
@@ -125,7 +126,7 @@ libcore = shared_library(
libaudit,
libblkid,
libdl,
- libkmod,
+ libkmod_cflags,
libm,
libmount,
libpam,
diff --git a/src/core/mount.c b/src/core/mount.c
index 3c4971c..ebafcaf 100644
--- a/src/core/mount.c
+++ b/src/core/mount.c
@@ -39,18 +39,18 @@
#define RETRY_UMOUNT_MAX 32
static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
- [MOUNT_DEAD] = UNIT_INACTIVE,
- [MOUNT_MOUNTING] = UNIT_ACTIVATING,
- [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING,
- [MOUNT_MOUNTED] = UNIT_ACTIVE,
- [MOUNT_REMOUNTING] = UNIT_RELOADING,
- [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING,
+ [MOUNT_DEAD] = UNIT_INACTIVE,
+ [MOUNT_MOUNTING] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTED] = UNIT_ACTIVE,
+ [MOUNT_REMOUNTING] = UNIT_RELOADING,
+ [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING,
[MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING,
[MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING,
[MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING,
[MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING,
- [MOUNT_FAILED] = UNIT_FAILED,
- [MOUNT_CLEANING] = UNIT_MAINTENANCE,
+ [MOUNT_FAILED] = UNIT_FAILED,
+ [MOUNT_CLEANING] = UNIT_MAINTENANCE,
};
static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
@@ -171,24 +171,9 @@ static bool mount_propagate_stop(Mount *m) {
* otherwise let's not bother. */
}
-static bool mount_needs_quota(const MountParameters *p) {
- assert(p);
-
- if (p->fstype && !fstype_needs_quota(p->fstype))
- return false;
-
- if (mount_is_bind(p))
- return false;
-
- return fstab_test_option(p->options,
- "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0");
-}
-
static void mount_init(Unit *u) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
- assert(m);
- assert(u);
assert(u->load_state == UNIT_STUB);
m->timeout_usec = u->manager->defaults.timeout_start_usec;
@@ -218,12 +203,7 @@ static int mount_arm_timer(Mount *m, bool relative, usec_t usec) {
static void mount_unwatch_control_pid(Mount *m) {
assert(m);
-
- if (!pidref_is_set(&m->control_pid))
- return;
-
- unit_unwatch_pidref(UNIT(m), &m->control_pid);
- pidref_done(&m->control_pid);
+ unit_unwatch_pidref_done(UNIT(m), &m->control_pid);
}
static void mount_parameters_done(MountParameters *p) {
@@ -235,9 +215,7 @@ static void mount_parameters_done(MountParameters *p) {
}
static void mount_done(Unit *u) {
- Mount *m = MOUNT(u);
-
- assert(m);
+ Mount *m = ASSERT_PTR(MOUNT(u));
m->where = mfree(m->where);
@@ -245,6 +223,7 @@ static void mount_done(Unit *u) {
mount_parameters_done(&m->parameters_fragment);
m->exec_runtime = exec_runtime_free(m->exec_runtime);
+
exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
m->control_command = NULL;
@@ -262,6 +241,8 @@ static int update_parameters_proc_self_mountinfo(
MountParameters *p;
int r, q, w;
+ assert(m);
+
p = &m->parameters_proc_self_mountinfo;
r = free_and_strdup(&p->what, what);
@@ -281,8 +262,6 @@ static int update_parameters_proc_self_mountinfo(
static int mount_add_mount_dependencies(Mount *m) {
MountParameters *pm;
- Unit *other;
- Set *s;
int r;
assert(m);
@@ -296,7 +275,7 @@ static int mount_add_mount_dependencies(Mount *m) {
if (r < 0)
return r;
- r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT);
+ r = unit_add_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
@@ -308,30 +287,43 @@ static int mount_add_mount_dependencies(Mount *m) {
path_is_absolute(pm->what) &&
(mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) {
- r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
/* Adds in dependencies to other units that use this path or paths further down in the hierarchy */
- s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where);
- SET_FOREACH(other, s) {
-
- if (other->load_state != UNIT_LOADED)
- continue;
-
- if (other == UNIT(m))
- continue;
-
- r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH);
- if (r < 0)
- return r;
-
- if (UNIT(m)->fragment_path) {
- /* If we have fragment configuration, then make this dependency required */
- r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH);
+ for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) {
+ Unit *other;
+ Set *s = manager_get_units_needing_mounts_for(UNIT(m)->manager, m->where, t);
+
+ SET_FOREACH(other, s) {
+ if (other->load_state != UNIT_LOADED)
+ continue;
+
+ if (other == UNIT(m))
+ continue;
+
+ r = unit_add_dependency(
+ other,
+ UNIT_AFTER,
+ UNIT(m),
+ /* add_reference= */ true,
+ UNIT_DEPENDENCY_PATH);
if (r < 0)
return r;
+
+ if (UNIT(m)->fragment_path) {
+ /* If we have fragment configuration, then make this dependency required/wanted */
+ r = unit_add_dependency(
+ other,
+ unit_mount_dependency_type_to_dependency_type(t),
+ UNIT(m),
+ /* add_reference= */ true,
+ UNIT_DEPENDENCY_PATH);
+ if (r < 0)
+ return r;
+ }
}
}
@@ -413,39 +405,9 @@ static int mount_add_device_dependencies(Mount *m) {
return 0;
}
-static int mount_add_quota_dependencies(Mount *m) {
- MountParameters *p;
- int r;
-
- assert(m);
-
- if (!MANAGER_IS_SYSTEM(UNIT(m)->manager))
- return 0;
-
- p = get_mount_parameters_fragment(m);
- if (!p)
- return 0;
-
- if (!mount_needs_quota(p))
- return 0;
-
- r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE,
- /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
- if (r < 0)
- return r;
-
- r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE,
- /* add_reference= */true, UNIT_DEPENDENCY_FILE);
- if (r < 0)
- return r;
-
- return 0;
-}
-
static bool mount_is_extrinsic(Unit *u) {
+ Mount *m = ASSERT_PTR(MOUNT(u));
MountParameters *p;
- Mount *m = MOUNT(u);
- assert(m);
/* Returns true for all units that are "magic" and should be excluded from the usual
* start-up and shutdown dependencies. We call them "extrinsic" here, as they are generally
@@ -501,10 +463,7 @@ static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p,
after = SPECIAL_LOCAL_FS_PRE_TARGET;
before = SPECIAL_INITRD_USR_FS_TARGET;
- } else if (mount_is_credentials(m))
- after = before = NULL;
-
- else if (mount_is_network(p)) {
+ } else if (mount_is_network(p)) {
after = SPECIAL_REMOTE_FS_PRE_TARGET;
before = SPECIAL_REMOTE_FS_TARGET;
@@ -645,6 +604,9 @@ static int mount_add_non_exec_dependencies(Mount *m) {
if (!m->where)
return 0;
+ if (mount_is_credentials(m))
+ UNIT(m)->default_dependencies = false;
+
/* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies
* resulting from the ExecContext and such. */
@@ -656,10 +618,6 @@ static int mount_add_non_exec_dependencies(Mount *m) {
if (r < 0)
return r;
- r = mount_add_quota_dependencies(m);
- if (r < 0)
- return r;
-
r = mount_add_default_dependencies(m);
if (r < 0)
return r;
@@ -668,11 +626,9 @@ static int mount_add_non_exec_dependencies(Mount *m) {
}
static int mount_add_extras(Mount *m) {
- Unit *u = UNIT(m);
+ Unit *u = UNIT(ASSERT_PTR(m));
int r;
- assert(m);
-
/* Note: this call might be called after we already have been loaded once (and even when it has already been
* activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready
* to run with an already set up unit. */
@@ -717,7 +673,7 @@ static int mount_add_extras(Mount *m) {
}
static void mount_load_root_mount(Unit *u) {
- assert(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
if (!unit_has_name(u, SPECIAL_ROOT_MOUNT))
return;
@@ -726,37 +682,35 @@ static void mount_load_root_mount(Unit *u) {
u->default_dependencies = false;
/* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */
- MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL;
- MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL;
+ m->exec_context.std_output = EXEC_OUTPUT_NULL;
+ m->exec_context.std_input = EXEC_INPUT_NULL;
if (!u->description)
u->description = strdup("Root Mount");
}
static int mount_load(Unit *u) {
- Mount *m = MOUNT(u);
- int r, q = 0;
+ Mount *m = ASSERT_PTR(MOUNT(u));
+ int r;
- assert(m);
- assert(u);
assert(u->load_state == UNIT_STUB);
mount_load_root_mount(u);
- bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual;
- r = unit_load_fragment_and_dropin(u, !fragment_optional);
+ bool from_kernel = m->from_proc_self_mountinfo || u->perpetual;
+
+ r = unit_load_fragment_and_dropin(u, /* fragment_required = */ !from_kernel);
/* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the
* kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus
* we need to update the state in our unit to track it. After all, consider that we don't allow changing the
* 'slice' field for a unit once it is active. */
- if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual)
- q = mount_add_extras(m);
+ if (u->load_state == UNIT_LOADED || from_kernel)
+ RET_GATHER(r, mount_add_extras(m));
if (r < 0)
return r;
- if (q < 0)
- return q;
+
if (u->load_state != UNIT_LOADED)
return 0;
@@ -765,6 +719,7 @@ static int mount_load(Unit *u) {
static void mount_set_state(Mount *m, MountState state) {
MountState old_state;
+
assert(m);
if (m->state != state)
@@ -787,10 +742,9 @@ static void mount_set_state(Mount *m, MountState state) {
}
static int mount_coldplug(Unit *u) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
int r;
- assert(m);
assert(m->state == MOUNT_DEAD);
if (m->deserialized_state == m->state)
@@ -809,17 +763,17 @@ static int mount_coldplug(Unit *u) {
return r;
}
- if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED))
+ if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED)) {
(void) unit_setup_exec_runtime(u);
+ (void) unit_setup_cgroup_runtime(u);
+ }
mount_set_state(m, m->deserialized_state);
return 0;
}
static void mount_catchup(Unit *u) {
- Mount *m = MOUNT(ASSERT_PTR(u));
-
- assert(m);
+ Mount *m = ASSERT_PTR(MOUNT(u));
/* Adjust the deserialized state. See comments in mount_process_proc_self_mountinfo(). */
if (m->from_proc_self_mountinfo)
@@ -854,12 +808,15 @@ static void mount_catchup(Unit *u) {
}
static void mount_dump(Unit *u, FILE *f, const char *prefix) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
MountParameters *p;
+ const char *prefix2;
- assert(m);
assert(f);
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
p = get_mount_parameters(m);
fprintf(f,
@@ -904,14 +861,22 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) {
exec_context_dump(&m->exec_context, f, prefix);
kill_context_dump(&m->kill_context, f, prefix);
cgroup_context_dump(UNIT(m), f, prefix);
+
+ for (MountExecCommand c = 0; c < _MOUNT_EXEC_COMMAND_MAX; c++) {
+ if (!m->exec_command[c].argv)
+ continue;
+
+ fprintf(f, "%s%s %s:\n",
+ prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), mount_exec_command_to_string(c));
+
+ exec_command_dump(m->exec_command + c, f, prefix2);
+ }
}
static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) {
-
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- pid_t pid;
int r;
assert(m);
@@ -936,11 +901,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) {
&exec_params,
m->exec_runtime,
&m->cgroup_context,
- &pid);
- if (r < 0)
- return r;
-
- r = pidref_set_pid(&pidref, pid);
+ &pidref);
if (r < 0)
return r;
@@ -1025,13 +986,7 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
if (m->result == MOUNT_SUCCESS)
m->result = f;
- r = unit_kill_context(
- UNIT(m),
- &m->kill_context,
- state_to_kill_operation(state),
- /* main_pid= */ NULL,
- &m->control_pid,
- /* main_pid_alien= */ false);
+ r = unit_kill_context(UNIT(m), state_to_kill_operation(state));
if (r < 0) {
log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m");
goto fail;
@@ -1166,9 +1121,9 @@ static int mount_set_mount_command(Mount *m, ExecCommand *c, const MountParamete
}
static void mount_enter_mounting(Mount *m) {
- int r;
MountParameters *p;
bool source_is_dir = true;
+ int r;
assert(m);
@@ -1192,6 +1147,34 @@ static void mount_enter_mounting(Mount *m) {
if (r < 0 && r != -EEXIST)
log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where);
+ /* If we are asked to create an OverlayFS, create the upper/work directories if they are missing */
+ if (p && streq_ptr(p->fstype, "overlay")) {
+ _cleanup_strv_free_ char **dirs = NULL;
+
+ r = fstab_filter_options(
+ p->options,
+ "upperdir\0workdir\0",
+ /* ret_namefound= */ NULL,
+ /* ret_value= */ NULL,
+ &dirs,
+ /* ret_filtered= */ NULL);
+ if (r < 0)
+ log_unit_warning_errno(
+ UNIT(m),
+ r,
+ "Failed to determine upper directory for OverlayFS, ignoring: %m");
+ else
+ STRV_FOREACH(d, dirs) {
+ r = mkdir_p_label(*d, m->directory_mode);
+ if (r < 0 && r != -EEXIST)
+ log_unit_warning_errno(
+ UNIT(m),
+ r,
+ "Failed to create overlay directory '%s', ignoring: %m",
+ *d);
+ }
+ }
+
if (source_is_dir)
unit_warn_if_dir_nonempty(UNIT(m), m->where);
unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start);
@@ -1249,8 +1232,8 @@ static void mount_set_reload_result(Mount *m, MountResult result) {
}
static void mount_enter_remounting(Mount *m) {
- int r;
MountParameters *p;
+ int r;
assert(m);
@@ -1312,15 +1295,15 @@ static void mount_cycle_clear(Mount *m) {
m->result = MOUNT_SUCCESS;
m->reload_result = MOUNT_SUCCESS;
exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
- UNIT(m)->reset_accounting = true;
+
+ if (m->cgroup_runtime)
+ m->cgroup_runtime->reset_accounting = true;
}
static int mount_start(Unit *u) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
int r;
- assert(m);
-
/* We cannot fulfill this request right now, try again later
* please! */
if (IN_SET(m->state,
@@ -1347,9 +1330,7 @@ static int mount_start(Unit *u) {
}
static int mount_stop(Unit *u) {
- Mount *m = MOUNT(u);
-
- assert(m);
+ Mount *m = ASSERT_PTR(MOUNT(u));
/* When we directly call umount() for a path, then the state of the corresponding mount unit may be
* outdated. Let's re-read mountinfo now and update the state. */
@@ -1401,9 +1382,8 @@ static int mount_stop(Unit *u) {
}
static int mount_reload(Unit *u) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
- assert(m);
assert(m->state == MOUNT_MOUNTED);
mount_enter_remounting(m);
@@ -1412,9 +1392,8 @@ static int mount_reload(Unit *u) {
}
static int mount_serialize(Unit *u, FILE *f, FDSet *fds) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
- assert(m);
assert(f);
assert(fds);
@@ -1431,11 +1410,9 @@ static int mount_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
int r;
- assert(m);
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -1495,21 +1472,19 @@ static int mount_deserialize_item(Unit *u, const char *key, const char *value, F
}
static UnitActiveState mount_active_state(Unit *u) {
- assert(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
- return state_translation_table[MOUNT(u)->state];
+ return state_translation_table[m->state];
}
static const char *mount_sub_state_to_string(Unit *u) {
- assert(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
- return mount_state_to_string(MOUNT(u)->state);
+ return mount_state_to_string(m->state);
}
static bool mount_may_gc(Unit *u) {
- Mount *m = MOUNT(u);
-
- assert(m);
+ Mount *m = ASSERT_PTR(MOUNT(u));
if (m->from_proc_self_mountinfo)
return false;
@@ -1518,10 +1493,9 @@ static bool mount_may_gc(Unit *u) {
}
static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
MountResult f;
- assert(m);
assert(pid >= 0);
if (pid != m->control_pid.pid)
@@ -1653,9 +1627,8 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
- Mount *m = MOUNT(userdata);
+ Mount *m = ASSERT_PTR(MOUNT(userdata));
- assert(m);
assert(m->timer_event_source == source);
switch (m->state) {
@@ -1738,6 +1711,7 @@ static int mount_setup_new_unit(
Unit **ret) {
_cleanup_(unit_freep) Unit *u = NULL;
+ Mount *mnt;
int r;
assert(m);
@@ -1749,24 +1723,26 @@ static int mount_setup_new_unit(
if (r < 0)
return r;
+ mnt = ASSERT_PTR(MOUNT(u));
+
r = free_and_strdup(&u->source_path, "/proc/self/mountinfo");
if (r < 0)
return r;
- r = free_and_strdup(&MOUNT(u)->where, where);
+ r = free_and_strdup(&mnt->where, where);
if (r < 0)
return r;
- r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
+ r = update_parameters_proc_self_mountinfo(mnt, what, options, fstype);
if (r < 0)
return r;
/* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the
* time we load the unit file for it (and thus add in extra deps right after) we know what source to
* attribute the deps to. */
- MOUNT(u)->from_proc_self_mountinfo = true;
+ mnt->from_proc_self_mountinfo = true;
- r = mount_add_non_exec_dependencies(MOUNT(u));
+ r = mount_add_non_exec_dependencies(mnt);
if (r < 0)
return r;
@@ -1787,14 +1763,16 @@ static int mount_setup_existing_unit(
const char *fstype,
MountProcFlags *ret_flags) {
+ Mount *m = ASSERT_PTR(MOUNT(u));
int r;
assert(u);
+ assert(where);
assert(ret_flags);
- if (!MOUNT(u)->where) {
- MOUNT(u)->where = strdup(where);
- if (!MOUNT(u)->where)
+ if (!m->where) {
+ m->where = strdup(where);
+ if (!m->where)
return -ENOMEM;
}
@@ -1802,10 +1780,9 @@ static int mount_setup_existing_unit(
* for the current unit. Note that the flags field is reset on each iteration of reading
* /proc/self/mountinfo, hence we know for sure anything already set here is from the current
* iteration and thus worthy of taking into account. */
- MountProcFlags flags =
- MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED;
+ MountProcFlags flags = m->proc_flags | MOUNT_PROC_IS_MOUNTED;
- r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
+ r = update_parameters_proc_self_mountinfo(m, what, options, fstype);
if (r < 0)
return r;
if (r > 0)
@@ -1818,12 +1795,12 @@ static int mount_setup_existing_unit(
* from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is
* reached when we wait for the mount to appear we hence can assume that if we are in it, we are
* actually seeing it established for the first time. */
- if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING)
+ if (!m->from_proc_self_mountinfo || m->state == MOUNT_MOUNTING)
flags |= MOUNT_PROC_JUST_MOUNTED;
- MOUNT(u)->from_proc_self_mountinfo = true;
+ m->from_proc_self_mountinfo = true;
- if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+ if (UNIT_IS_LOAD_ERROR(u->load_state)) {
/* The unit was previously not found or otherwise not loaded. Now that the unit shows up in
* /proc/self/mountinfo we should reconsider this, hence set it to UNIT_LOADED. */
u->load_state = UNIT_LOADED;
@@ -1835,7 +1812,7 @@ static int mount_setup_existing_unit(
if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) {
/* If things changed, then make sure that all deps are regenerated. Let's
* first remove all automatic deps, and then add in the new ones. */
- r = mount_add_non_exec_dependencies(MOUNT(u));
+ r = mount_add_non_exec_dependencies(m);
if (r < 0)
return r;
}
@@ -1950,14 +1927,27 @@ static void mount_shutdown(Manager *m) {
m->mount_monitor = NULL;
}
+static void mount_handoff_timestamp(
+ Unit *u,
+ const struct ucred *ucred,
+ const dual_timestamp *ts) {
+
+ Mount *m = ASSERT_PTR(MOUNT(u));
+
+ assert(ucred);
+ assert(ts);
+
+ if (m->control_pid.pid == ucred->pid && m->control_command) {
+ exec_status_handoff(&m->control_command->exec_status, ucred, ts);
+ unit_add_to_dbus_queue(u);
+ }
+}
+
static int mount_get_timeout(Unit *u, usec_t *timeout) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
usec_t t;
int r;
- assert(m);
- assert(u);
-
if (!m->timer_event_source)
return 0;
@@ -2063,7 +2053,7 @@ static void mount_enumerate(Manager *m) {
goto fail;
}
- r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+ r = sd_event_source_set_priority(m->mount_event_source, EVENT_PRIORITY_MOUNT_TABLE);
if (r < 0) {
log_error_errno(r, "Failed to adjust mount watch priority: %m");
goto fail;
@@ -2330,19 +2320,15 @@ fail:
}
static int mount_can_clean(Unit *u, ExecCleanMask *ret) {
- Mount *m = MOUNT(u);
-
- assert(m);
+ Mount *m = ASSERT_PTR(MOUNT(u));
return exec_context_get_clean_mask(&m->exec_context, ret);
}
static int mount_can_start(Unit *u) {
- Mount *m = MOUNT(u);
+ Mount *m = ASSERT_PTR(MOUNT(u));
int r;
- assert(m);
-
r = unit_test_start_limit(u);
if (r < 0) {
mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT, /* flush_result = */ false);
@@ -2440,6 +2426,7 @@ const UnitVTable mount_vtable = {
.cgroup_context_offset = offsetof(Mount, cgroup_context),
.kill_context_offset = offsetof(Mount, kill_context),
.exec_runtime_offset = offsetof(Mount, exec_runtime),
+ .cgroup_runtime_offset = offsetof(Mount, cgroup_runtime),
.sections =
"Unit\0"
@@ -2482,6 +2469,8 @@ const UnitVTable mount_vtable = {
.reset_failed = mount_reset_failed,
+ .notify_handoff_timestamp = mount_handoff_timestamp,
+
.control_pid = mount_control_pid,
.bus_set_property = bus_mount_set_property,
diff --git a/src/core/mount.h b/src/core/mount.h
index 6712c16..a029dc8 100644
--- a/src/core/mount.h
+++ b/src/core/mount.h
@@ -79,6 +79,7 @@ struct Mount {
CGroupContext cgroup_context;
ExecRuntime *exec_runtime;
+ CGroupRuntime *cgroup_runtime;
MountState state, deserialized_state;
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 88681aa..6c0dc94 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -47,6 +47,7 @@
#include "tmpfile-util.h"
#include "umask-util.h"
#include "user-util.h"
+#include "vpick.h"
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
@@ -500,9 +501,24 @@ static int append_extensions(
/* First, prepare a mount for each image, but these won't be visible to the unit, instead
* they will be mounted in our propagate directory, and used as a source for the overlay. */
for (size_t i = 0; i < n; i++) {
+ _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
_cleanup_free_ char *mount_point = NULL;
const MountImage *m = mount_images + i;
+ r = path_pick(/* toplevel_path= */ NULL,
+ /* toplevel_fd= */ AT_FDCWD,
+ m->source,
+ &pick_filter_image_raw,
+ PICK_ARCHITECTURE|PICK_TRIES,
+ &result);
+ if (r < 0)
+ return r;
+ if (!result.path)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ENOENT),
+ "No matching entry in .v/ directory %s found.",
+ m->source);
+
if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0)
return -ENOMEM;
@@ -524,7 +540,7 @@ static int append_extensions(
.path_malloc = TAKE_PTR(mount_point),
.image_options_const = m->mount_options,
.ignore = m->ignore_enoent,
- .source_const = m->source,
+ .source_malloc = TAKE_PTR(result.path),
.mode = MOUNT_EXTENSION_IMAGE,
.has_prefix = true,
};
@@ -534,7 +550,8 @@ static int append_extensions(
* Bind mount them in the same location as the ExtensionImages, so that we
* can check that they are valid trees (extension-release.d). */
STRV_FOREACH(extension_directory, extension_directories) {
- _cleanup_free_ char *mount_point = NULL, *source = NULL;
+ _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
+ _cleanup_free_ char *mount_point = NULL;
const char *e = *extension_directory;
bool ignore_enoent = false;
@@ -551,9 +568,19 @@ static int append_extensions(
if (startswith(e, "+"))
e++;
- source = strdup(e);
- if (!source)
- return -ENOMEM;
+ r = path_pick(/* toplevel_path= */ NULL,
+ /* toplevel_fd= */ AT_FDCWD,
+ e,
+ &pick_filter_image_dir,
+ PICK_ARCHITECTURE|PICK_TRIES,
+ &result);
+ if (r < 0)
+ return r;
+ if (!result.path)
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ENOENT),
+ "No matching entry in .v/ directory %s found.",
+ e);
for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
@@ -571,7 +598,7 @@ static int append_extensions(
*me = (MountEntry) {
.path_malloc = TAKE_PTR(mount_point),
- .source_malloc = TAKE_PTR(source),
+ .source_malloc = TAKE_PTR(result.path),
.mode = MOUNT_EXTENSION_DIRECTORY,
.ignore = ignore_enoent,
.has_prefix = true,
@@ -626,8 +653,7 @@ static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs,
return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
ro = flags & MS_RDONLY;
- if (ro)
- flags ^= MS_RDONLY;
+ flags &= ~MS_RDONLY;
MountEntry *me = mount_list_extend(ml);
if (!me)
@@ -876,42 +902,41 @@ static void drop_outside_root(MountList *ml, const char *root_directory) {
ml->n_mounts = t - ml->mounts;
}
-static int clone_device_node(
- const char *d,
- const char *temporary_mount,
- bool *make_devnode) {
-
+static int clone_device_node(const char *node, const char *temporary_mount, bool *make_devnode) {
_cleanup_free_ char *sl = NULL;
- const char *dn, *bn, *t;
+ const char *dn, *bn;
struct stat st;
int r;
- if (stat(d, &st) < 0) {
+ assert(node);
+ assert(path_is_absolute(node));
+ assert(temporary_mount);
+ assert(make_devnode);
+
+ if (stat(node, &st) < 0) {
if (errno == ENOENT) {
- log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
+ log_debug_errno(errno, "Device node '%s' to clone does not exist.", node);
return -ENXIO;
}
- return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
+ return log_debug_errno(errno, "Failed to stat() device node '%s' to clone: %m", node);
}
- if (!S_ISBLK(st.st_mode) &&
- !S_ISCHR(st.st_mode))
- return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
- "Device node '%s' to clone is not a device node, ignoring.",
- d);
+ r = stat_verify_device_node(&st);
+ if (r < 0)
+ return log_debug_errno(r, "Cannot clone device node '%s': %m", node);
- dn = strjoina(temporary_mount, d);
+ dn = strjoina(temporary_mount, node);
/* First, try to create device node properly */
if (*make_devnode) {
- mac_selinux_create_file_prepare(d, st.st_mode);
+ mac_selinux_create_file_prepare(node, st.st_mode);
r = mknod(dn, st.st_mode, st.st_rdev);
mac_selinux_create_file_clear();
if (r >= 0)
goto add_symlink;
if (errno != EPERM)
- return log_debug_errno(errno, "mknod failed for %s: %m", d);
+ return log_debug_errno(errno, "Failed to mknod '%s': %m", node);
/* This didn't work, let's not try this again for the next iterations. */
*make_devnode = false;
@@ -921,17 +946,17 @@ static int clone_device_node(
* Do not prepare device-node SELinux label (see issue 13762) */
r = mknod(dn, S_IFREG, 0);
if (r < 0 && errno != EEXIST)
- return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
+ return log_debug_errno(errno, "Failed to mknod dummy device node for '%s': %m", node);
/* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
* properties. Specifically, the device nodes we bind-mount should either be owned by root:root or
* root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
- r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
+ r = mount_nofollow_verbose(LOG_DEBUG, node, dn, NULL, MS_BIND, NULL);
if (r < 0)
return r;
add_symlink:
- bn = path_startswith(d, "/dev/");
+ bn = path_startswith(node, "/dev/");
if (!bn)
return 0;
@@ -944,14 +969,27 @@ add_symlink:
(void) mkdir_parents(sl, 0755);
- t = strjoina("../", bn);
+ const char *t = strjoina("../", bn);
if (symlink(t, sl) < 0)
log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
return 0;
}
-static char *settle_runtime_dir(RuntimeScope scope) {
+static int bind_mount_device_dir(const char *temporary_mount, const char *dir) {
+ const char *t;
+
+ assert(temporary_mount);
+ assert(dir);
+ assert(path_is_absolute(dir));
+
+ t = strjoina(temporary_mount, dir);
+
+ (void) mkdir(t, 0755);
+ return mount_nofollow_verbose(LOG_DEBUG, dir, t, NULL, MS_BIND, NULL);
+}
+
+static char* settle_runtime_dir(RuntimeScope scope) {
char *runtime_dir;
if (scope != RUNTIME_SCOPE_USER)
@@ -992,8 +1030,8 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
"/dev/urandom\0"
"/dev/tty\0";
- _cleanup_free_ char *temporary_mount = NULL;
- const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
+ _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
+ _cleanup_(umount_and_rmdir_and_freep) char *dev = NULL;
bool can_mknod = true;
int r;
@@ -1003,67 +1041,56 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
if (r < 0)
return r;
- dev = strjoina(temporary_mount, "/dev");
+ dev = path_join(temporary_mount, "dev");
+ if (!dev)
+ return -ENOMEM;
+
(void) mkdir(dev, 0755);
r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
if (r < 0)
- goto fail;
+ return r;
r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
- if (r < 0) {
- log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
- goto fail;
- }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to fix label of '%s' as /dev/: %m", dev);
- devpts = strjoina(temporary_mount, "/dev/pts");
- (void) mkdir(devpts, 0755);
- r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
+ r = bind_mount_device_dir(temporary_mount, "/dev/pts");
if (r < 0)
- goto fail;
+ return r;
/* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
* When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
* Thus, in that case make a clone.
* In nspawn and other containers it will be a symlink, in that case make it a symlink. */
r = is_symlink("/dev/ptmx");
- if (r < 0) {
- log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
- goto fail;
- } else if (r > 0) {
- devptmx = strjoina(temporary_mount, "/dev/ptmx");
- if (symlink("pts/ptmx", devptmx) < 0) {
- r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
- goto fail;
- }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
+ if (r > 0) {
+ const char *devptmx = strjoina(temporary_mount, "/dev/ptmx");
+ if (symlink("pts/ptmx", devptmx) < 0)
+ return log_debug_errno(errno, "Failed to create symlink '%s' to pts/ptmx: %m", devptmx);
} else {
r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
if (r < 0)
- goto fail;
+ return r;
}
- devshm = strjoina(temporary_mount, "/dev/shm");
- (void) mkdir(devshm, 0755);
- r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
+ r = bind_mount_device_dir(temporary_mount, "/dev/shm");
if (r < 0)
- goto fail;
-
- devmqueue = strjoina(temporary_mount, "/dev/mqueue");
- (void) mkdir(devmqueue, 0755);
- (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
+ return r;
- devhugepages = strjoina(temporary_mount, "/dev/hugepages");
- (void) mkdir(devhugepages, 0755);
- (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
+ FOREACH_STRING(d, "/dev/mqueue", "/dev/hugepages")
+ (void) bind_mount_device_dir(temporary_mount, d);
- devlog = strjoina(temporary_mount, "/dev/log");
+ const char *devlog = strjoina(temporary_mount, "/dev/log");
if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
- log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
+ log_debug_errno(errno, "Failed to create symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
NULSTR_FOREACH(d, devnodes) {
r = clone_device_node(d, temporary_mount, &can_mknod);
/* ENXIO means the *source* device node does not exist, skip creation in that case */
if (r < 0 && r != -ENXIO)
- goto fail;
+ return r;
}
r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
@@ -1081,31 +1108,10 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
if (r < 0)
- goto fail;
-
- (void) rmdir(dev);
- (void) rmdir(temporary_mount);
+ return r;
+ dev = rmdir_and_free(dev); /* Mount is successfully moved, do not umount() */
return 1;
-
-fail:
- if (devpts)
- (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
-
- if (devshm)
- (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
-
- if (devhugepages)
- (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
-
- if (devmqueue)
- (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
-
- (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
- (void) rmdir(dev);
- (void) rmdir(temporary_mount);
-
- return r;
}
static int mount_bind_dev(const MountEntry *m) {
@@ -1118,7 +1124,7 @@ static int mount_bind_dev(const MountEntry *m) {
(void) mkdir_p_label(mount_entry_path(m), 0755);
- r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ r = path_is_mount_point(mount_entry_path(m));
if (r < 0)
return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
if (r > 0) /* make this a NOP if /dev is already a mount point */
@@ -1138,7 +1144,7 @@ static int mount_bind_sysfs(const MountEntry *m) {
(void) mkdir_p_label(mount_entry_path(m), 0755);
- r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ r = path_is_mount_point(mount_entry_path(m));
if (r < 0)
return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
if (r > 0) /* make this a NOP if /sys is already a mount point */
@@ -1185,7 +1191,7 @@ static int mount_private_apivfs(
/* When we do not have enough privileges to mount a new instance, fall back to use an
* existing mount. */
- r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0);
+ r = path_is_mount_point(entry_path);
if (r < 0)
return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
if (r > 0)
@@ -1300,7 +1306,7 @@ static int mount_run(const MountEntry *m) {
assert(m);
- r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ r = path_is_mount_point(mount_entry_path(m));
if (r < 0 && r != -ENOENT)
return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
if (r > 0) /* make this a NOP if /run is already a mount point */
@@ -1354,7 +1360,7 @@ static int mount_image(
if (r < 0)
return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
if (isempty(host_os_release_id))
- return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));
}
r = verity_dissect_and_mount(
@@ -1448,6 +1454,8 @@ static int follow_symlink(
_cleanup_free_ char *target = NULL;
int r;
+ assert(m);
+
/* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
* might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
* a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
@@ -1469,7 +1477,7 @@ static int follow_symlink(
mount_entry_consume_prefix(m, TAKE_PTR(target));
- m->n_followed ++;
+ m->n_followed++;
return 0;
}
@@ -1524,7 +1532,7 @@ static int apply_one_mount(
r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
if (r < 0)
return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
- "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
+ "File type not supported for inaccessible mounts. Note that symlinks are not allowed.");
what = inaccessible;
break;
}
@@ -1534,7 +1542,7 @@ static int apply_one_mount(
case MOUNT_READ_WRITE_IMPLICIT:
case MOUNT_EXEC:
case MOUNT_NOEXEC:
- r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
+ r = path_is_mount_point_full(mount_entry_path(m), root_directory, /* flags = */ 0);
if (r == -ENOENT && m->ignore)
return 0;
if (r < 0)
@@ -1575,7 +1583,7 @@ static int apply_one_mount(
if (r < 0)
return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
if (isempty(host_os_release_id))
- return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory));
r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release);
if (r == -ENOENT && m->ignore)
@@ -1588,13 +1596,13 @@ static int apply_one_mount(
host_os_release_id,
host_os_release_version_id,
host_os_release_level,
- /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */
+ /* host_extension_scope = */ NULL, /* Leave empty, we need to accept both system and portable */
extension_release,
class);
- if (r == 0)
- return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
if (r < 0)
return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's.", extension_name);
_fallthrough_;
}
@@ -2049,9 +2057,9 @@ static bool root_read_only(
}
static bool home_read_only(
- char** read_only_paths,
- char** inaccessible_paths,
- char** empty_directories,
+ char * const *read_only_paths,
+ char * const *inaccessible_paths,
+ char * const *empty_directories,
const BindMount *bind_mounts,
size_t n_bind_mounts,
const TemporaryFileSystem *temporary_filesystems,
@@ -2070,13 +2078,13 @@ static bool home_read_only(
prefixed_path_strv_contains(empty_directories, "/home"))
return true;
- for (size_t i = 0; i < n_temporary_filesystems; i++)
- if (path_equal(temporary_filesystems[i].path, "/home"))
+ FOREACH_ARRAY(i, temporary_filesystems, n_temporary_filesystems)
+ if (path_equal(i->path, "/home"))
return true;
/* If /home is overmounted with some dir from the host it's not writable. */
- for (size_t i = 0; i < n_bind_mounts; i++)
- if (path_equal(bind_mounts[i].destination, "/home"))
+ FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
+ if (path_equal(i->destination, "/home"))
return true;
return false;
@@ -2088,6 +2096,7 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
_cleanup_strv_free_ char **hierarchies = NULL;
_cleanup_(mount_list_done) MountList ml = {};
+ _cleanup_close_ int userns_fd = -EBADF;
bool require_prefix = false;
const char *root;
DissectImageFlags dissect_image_flags =
@@ -2099,7 +2108,8 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
DISSECT_IMAGE_USR_NO_ROOT |
DISSECT_IMAGE_GROWFS |
DISSECT_IMAGE_ADD_PARTITION_DEVICES |
- DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES |
+ DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
int r;
assert(p);
@@ -2123,40 +2133,57 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
- r = loop_device_make_by_path(
- p->root_image,
- FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
- /* sector_size= */ UINT32_MAX,
- FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
- LOCK_SH,
- &loop_device);
- if (r < 0)
- return log_debug_errno(r, "Failed to create loop device for root image: %m");
-
- r = dissect_loop_device(
- loop_device,
- p->verity,
- p->root_image_options,
- p->root_image_policy,
- dissect_image_flags,
- &dissected_image);
- if (r < 0)
- return log_debug_errno(r, "Failed to dissect image: %m");
+ if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+ /* In system mode we mount directly */
- r = dissected_image_load_verity_sig_partition(
- dissected_image,
- loop_device->fd,
- p->verity);
- if (r < 0)
- return r;
+ r = loop_device_make_by_path(
+ p->root_image,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
+ /* sector_size= */ UINT32_MAX,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop_device);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+ r = dissect_loop_device(
+ loop_device,
+ p->verity,
+ p->root_image_options,
+ p->root_image_policy,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to dissect image: %m");
- r = dissected_image_decrypt(
- dissected_image,
- NULL,
- p->verity,
- dissect_image_flags);
- if (r < 0)
- return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ r = dissected_image_load_verity_sig_partition(
+ dissected_image,
+ loop_device->fd,
+ p->verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(
+ dissected_image,
+ NULL,
+ p->verity,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ } else {
+ userns_fd = namespace_open_by_type(NAMESPACE_USER);
+ if (userns_fd < 0)
+ return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m");
+
+ r = mountfsd_mount_image(
+ p->root_image,
+ userns_fd,
+ p->root_image_policy,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ return r;
+ }
}
if (p->root_directory)
@@ -2520,16 +2547,18 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
root,
/* uid_shift= */ UID_INVALID,
/* uid_range= */ UID_INVALID,
- /* userns_fd= */ -EBADF,
+ userns_fd,
dissect_image_flags);
if (r < 0)
return log_debug_errno(r, "Failed to mount root image: %m");
/* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
* if it likes. */
- r = loop_device_flock(loop_device, LOCK_UN);
- if (r < 0)
- return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+ if (loop_device) {
+ r = loop_device_flock(loop_device, LOCK_UN);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+ }
r = dissected_image_relinquish(dissected_image);
if (r < 0)
@@ -2538,7 +2567,7 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
} else if (p->root_directory) {
/* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
- r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
+ r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
if (r == 0) {
@@ -2595,9 +2624,9 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) {
void bind_mount_free_many(BindMount *b, size_t n) {
assert(b || n == 0);
- for (size_t i = 0; i < n; i++) {
- free(b[i].source);
- free(b[i].destination);
+ FOREACH_ARRAY(i, b, n) {
+ free(i->source);
+ free(i->destination);
}
free(b);
@@ -2625,7 +2654,7 @@ int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
*b = c;
- c[(*n) ++] = (BindMount) {
+ c[(*n)++] = (BindMount) {
.source = TAKE_PTR(s),
.destination = TAKE_PTR(d),
.read_only = item->read_only,
@@ -2694,7 +2723,7 @@ int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
*m = c;
- c[(*n) ++] = (MountImage) {
+ c[(*n)++] = (MountImage) {
.source = TAKE_PTR(s),
.destination = TAKE_PTR(d),
.mount_options = TAKE_PTR(options),
@@ -2745,7 +2774,7 @@ int temporary_filesystem_add(
*t = c;
- c[(*n) ++] = (TemporaryFileSystem) {
+ c[(*n)++] = (TemporaryFileSystem) {
.path = TAKE_PTR(p),
.options = TAKE_PTR(o),
};
diff --git a/src/core/path.c b/src/core/path.c
index ef00c20..fdb6ca4 100644
--- a/src/core/path.c
+++ b/src/core/path.c
@@ -90,7 +90,7 @@ int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) {
/* If this is a symlink watch both the symlink inode and where it points to. If the inode is
* not a symlink both calls will install the same watch, which is redundant and doesn't
* hurt. */
- for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) {
+ for (int follow_symlink = 0; follow_symlink < 2; follow_symlink++) {
uint32_t f = flags;
SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink);
@@ -249,6 +249,8 @@ static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_no
static void path_spec_mkdir(PathSpec *s, mode_t mode) {
int r;
+ assert(s);
+
if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB))
return;
@@ -260,6 +262,10 @@ static void path_spec_mkdir(PathSpec *s, mode_t mode) {
static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) {
const char *type;
+ assert(s);
+ assert(f);
+ assert(prefix);
+
assert_se(type = path_type_to_string(s->type));
fprintf(f, "%s%s: %s\n", prefix, type, s->path);
}
@@ -272,9 +278,8 @@ void path_spec_done(PathSpec *s) {
}
static void path_init(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
- assert(u);
assert(u->load_state == UNIT_STUB);
p->directory_mode = 0755;
@@ -295,9 +300,7 @@ void path_free_specs(Path *p) {
}
static void path_done(Unit *u) {
- Path *p = PATH(u);
-
- assert(p);
+ Path *p = ASSERT_PTR(PATH(u));
p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source);
path_free_specs(p);
@@ -309,7 +312,7 @@ static int path_add_mount_dependencies(Path *p) {
assert(p);
LIST_FOREACH(spec, s, p->specs) {
- r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
@@ -389,10 +392,9 @@ static int path_add_extras(Path *p) {
}
static int path_load(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
int r;
- assert(u);
assert(u->load_state == UNIT_STUB);
r = unit_load_fragment_and_dropin(u, true);
@@ -410,11 +412,11 @@ static int path_load(Unit *u) {
}
static void path_dump(Unit *u, FILE *f, const char *prefix) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
Unit *trigger;
- assert(p);
assert(f);
+ assert(prefix);
trigger = UNIT_TRIGGER(u);
@@ -461,6 +463,7 @@ static int path_watch(Path *p) {
static void path_set_state(Path *p, PathState state) {
PathState old_state;
+
assert(p);
if (p->state != state)
@@ -481,9 +484,8 @@ static void path_set_state(Path *p, PathState state) {
static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify);
static int path_coldplug(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
- assert(p);
assert(p->state == PATH_DEAD);
if (p->deserialized_state != p->state) {
@@ -625,10 +627,9 @@ static void path_mkdir(Path *p) {
}
static int path_start(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
int r;
- assert(p);
assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED));
r = unit_test_trigger_loaded(u);
@@ -648,9 +649,8 @@ static int path_start(Unit *u) {
}
static int path_stop(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
- assert(p);
assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING));
path_enter_dead(p, PATH_SUCCESS);
@@ -658,9 +658,8 @@ static int path_stop(Unit *u) {
}
static int path_serialize(Unit *u, FILE *f, FDSet *fds) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
- assert(u);
assert(f);
assert(fds);
@@ -688,9 +687,8 @@ static int path_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -755,28 +753,24 @@ static int path_deserialize_item(Unit *u, const char *key, const char *value, FD
}
static UnitActiveState path_active_state(Unit *u) {
- assert(u);
+ Path *p = ASSERT_PTR(PATH(u));
- return state_translation_table[PATH(u)->state];
+ return state_translation_table[p->state];
}
static const char *path_sub_state_to_string(Unit *u) {
- assert(u);
+ Path *p = ASSERT_PTR(PATH(u));
- return path_state_to_string(PATH(u)->state);
+ return path_state_to_string(p->state);
}
static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
- PathSpec *s = userdata, *found = NULL;
- Path *p;
+ PathSpec *s = ASSERT_PTR(userdata), *found = NULL;
+ Path *p = ASSERT_PTR(PATH(s->unit));
int changed;
- assert(s);
- assert(s->unit);
assert(fd >= 0);
- p = PATH(s->unit);
-
if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
return 0;
@@ -827,10 +821,9 @@ static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) {
}
static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
int r;
- assert(u);
assert(other);
/* Invoked whenever the unit we trigger changes state or gains or loses a job */
@@ -897,9 +890,7 @@ static void path_trigger_notify(Unit *u, Unit *other) {
}
static void path_reset_failed(Unit *u) {
- Path *p = PATH(u);
-
- assert(p);
+ Path *p = ASSERT_PTR(PATH(u));
if (p->state == PATH_FAILED)
path_set_state(p, PATH_DEAD);
@@ -908,11 +899,9 @@ static void path_reset_failed(Unit *u) {
}
static int path_can_start(Unit *u) {
- Path *p = PATH(u);
+ Path *p = ASSERT_PTR(PATH(u));
int r;
- assert(p);
-
r = unit_test_start_limit(u);
if (r < 0) {
path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT);
@@ -961,13 +950,11 @@ static int activation_details_path_deserialize(const char *key, const char *valu
}
static int activation_details_path_append_env(ActivationDetails *details, char ***strv) {
- ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
char *s;
int r;
- assert(details);
assert(strv);
- assert(p);
if (isempty(p->trigger_path_filename))
return 0;
@@ -984,21 +971,15 @@ static int activation_details_path_append_env(ActivationDetails *details, char *
}
static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) {
- ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
int r;
- assert(details);
assert(strv);
- assert(p);
if (isempty(p->trigger_path_filename))
return 0;
- r = strv_extend(strv, "trigger_path");
- if (r < 0)
- return r;
-
- r = strv_extend(strv, p->trigger_path_filename);
+ r = strv_extend_many(strv, "trigger_path", p->trigger_path_filename);
if (r < 0)
return r;
diff --git a/src/core/scope.c b/src/core/scope.c
index 2841280..cfa2aeb 100644
--- a/src/core/scope.c
+++ b/src/core/scope.c
@@ -23,21 +23,20 @@
#include "user-util.h"
static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
- [SCOPE_DEAD] = UNIT_INACTIVE,
- [SCOPE_START_CHOWN] = UNIT_ACTIVATING,
- [SCOPE_RUNNING] = UNIT_ACTIVE,
- [SCOPE_ABANDONED] = UNIT_ACTIVE,
+ [SCOPE_DEAD] = UNIT_INACTIVE,
+ [SCOPE_START_CHOWN] = UNIT_ACTIVATING,
+ [SCOPE_RUNNING] = UNIT_ACTIVE,
+ [SCOPE_ABANDONED] = UNIT_ACTIVE,
[SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
- [SCOPE_FAILED] = UNIT_FAILED,
+ [SCOPE_FAILED] = UNIT_FAILED,
};
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
static void scope_init(Unit *u) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
- assert(u);
assert(u->load_state == UNIT_STUB);
s->runtime_max_usec = USEC_INFINITY;
@@ -48,9 +47,7 @@ static void scope_init(Unit *u) {
}
static void scope_done(Unit *u) {
- Scope *s = SCOPE(u);
-
- assert(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
s->controller = mfree(s->controller);
s->controller_track = sd_bus_track_unref(s->controller_track);
@@ -84,6 +81,7 @@ static int scope_arm_timer(Scope *s, bool relative, usec_t usec) {
static void scope_set_state(Scope *s, ScopeState state) {
ScopeState old_state;
+
assert(s);
if (s->state != state)
@@ -101,7 +99,8 @@ static void scope_set_state(Scope *s, ScopeState state) {
}
if (state != old_state)
- log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
+ log_unit_debug(UNIT(s), "Changed %s -> %s",
+ scope_state_to_string(old_state), scope_state_to_string(state));
unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
}
@@ -181,10 +180,9 @@ static int scope_add_extras(Scope *s) {
}
static int scope_load(Unit *u) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
int r;
- assert(s);
assert(u->load_state == UNIT_STUB);
if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
@@ -227,10 +225,9 @@ static usec_t scope_coldplug_timeout(Scope *s) {
}
static int scope_coldplug(Unit *u) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
int r;
- assert(s);
assert(s->state == SCOPE_DEAD);
if (s->deserialized_state == s->state)
@@ -260,10 +257,10 @@ static int scope_coldplug(Unit *u) {
}
static void scope_dump(Unit *u, FILE *f, const char *prefix) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
- assert(s);
assert(f);
+ assert(prefix);
fprintf(f,
"%sScope State: %s\n"
@@ -277,7 +274,7 @@ static void scope_dump(Unit *u, FILE *f, const char *prefix) {
prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
prefix, oom_policy_to_string(s->oom_policy));
- cgroup_context_dump(UNIT(s), f, prefix);
+ cgroup_context_dump(u, f, prefix);
kill_context_dump(&s->kill_context, f, prefix);
}
@@ -317,13 +314,9 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
else {
r = unit_kill_context(
UNIT(s),
- &s->kill_context,
state != SCOPE_STOP_SIGTERM ? KILL_KILL :
s->was_abandoned ? KILL_TERMINATE_AND_LOG :
- KILL_TERMINATE,
- /* main_pid= */ NULL,
- /* control_pid= */ NULL,
- /* main_pid_alien= */ false);
+ KILL_TERMINATE);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
goto fail;
@@ -350,13 +343,15 @@ fail:
}
static int scope_enter_start_chown(Scope *s) {
+ Unit *u = UNIT(ASSERT_PTR(s));
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- Unit *u = UNIT(s);
int r;
- assert(s);
assert(s->user);
+ if (!s->cgroup_runtime)
+ return -EINVAL;
+
r = scope_arm_timer(s, /* relative= */ true, u->manager->defaults.timeout_start_usec);
if (r < 0)
return r;
@@ -389,7 +384,7 @@ static int scope_enter_start_chown(Scope *s) {
}
}
- r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid);
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, s->cgroup_runtime->cgroup_path, uid, gid);
if (r < 0) {
log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m");
_exit(EXIT_CGROUP);
@@ -411,11 +406,9 @@ fail:
}
static int scope_enter_running(Scope *s) {
- Unit *u = UNIT(s);
+ Unit *u = UNIT(ASSERT_PTR(s));
int r;
- assert(s);
-
(void) bus_scope_track_controller(s);
r = unit_acquire_invocation_id(u);
@@ -458,9 +451,7 @@ fail:
}
static int scope_start(Unit *u) {
- Scope *s = SCOPE(u);
-
- assert(s);
+ Scope *s = ASSERT_PTR(SCOPE(u));
if (unit_has_name(u, SPECIAL_INIT_SCOPE))
return -EPERM;
@@ -489,9 +480,7 @@ static int scope_start(Unit *u) {
}
static int scope_stop(Unit *u) {
- Scope *s = SCOPE(u);
-
- assert(s);
+ Scope *s = ASSERT_PTR(SCOPE(u));
if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
return 0;
@@ -503,9 +492,7 @@ static int scope_stop(Unit *u) {
}
static void scope_reset_failed(Unit *u) {
- Scope *s = SCOPE(u);
-
- assert(s);
+ Scope *s = ASSERT_PTR(SCOPE(u));
if (s->state == SCOPE_FAILED)
scope_set_state(s, SCOPE_DEAD);
@@ -514,7 +501,7 @@ static void scope_reset_failed(Unit *u) {
}
static int scope_get_timeout(Unit *u, usec_t *timeout) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
usec_t t;
int r;
@@ -532,10 +519,9 @@ static int scope_get_timeout(Unit *u, usec_t *timeout) {
}
static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
PidRef *pid;
- assert(s);
assert(f);
assert(fds);
@@ -552,10 +538,9 @@ static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
int r;
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -600,8 +585,7 @@ static int scope_deserialize_item(Unit *u, const char *key, const char *value, F
}
static void scope_notify_cgroup_empty_event(Unit *u) {
- Scope *s = SCOPE(u);
- assert(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
log_unit_debug(u, "cgroup is empty");
@@ -610,7 +594,7 @@ static void scope_notify_cgroup_empty_event(Unit *u) {
}
static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
- Scope *s = SCOPE(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
if (managed_oom)
log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
@@ -642,9 +626,7 @@ static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
}
static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
- Scope *s = SCOPE(u);
-
- assert(s);
+ Scope *s = ASSERT_PTR(SCOPE(u));
if (s->state == SCOPE_START_CHOWN) {
if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
@@ -662,9 +644,8 @@ static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
- Scope *s = SCOPE(userdata);
+ Scope *s = ASSERT_PTR(SCOPE(userdata));
- assert(s);
assert(s->timer_event_source == source);
switch (s->state) {
@@ -726,15 +707,15 @@ int scope_abandon(Scope *s) {
}
static UnitActiveState scope_active_state(Unit *u) {
- assert(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
- return state_translation_table[SCOPE(u)->state];
+ return state_translation_table[s->state];
}
static const char *scope_sub_state_to_string(Unit *u) {
- assert(u);
+ Scope *s = ASSERT_PTR(SCOPE(u));
- return scope_state_to_string(SCOPE(u)->state);
+ return scope_state_to_string(s->state);
}
static void scope_enumerate_perpetual(Manager *m) {
@@ -782,6 +763,7 @@ const UnitVTable scope_vtable = {
.object_size = sizeof(Scope),
.cgroup_context_offset = offsetof(Scope, cgroup_context),
.kill_context_offset = offsetof(Scope, kill_context),
+ .cgroup_runtime_offset = offsetof(Scope, cgroup_runtime),
.sections =
"Unit\0"
@@ -806,8 +788,7 @@ const UnitVTable scope_vtable = {
.start = scope_start,
.stop = scope_stop,
- .freeze = unit_freeze_vtable_common,
- .thaw = unit_thaw_vtable_common,
+ .freezer_action = unit_cgroup_freezer_action,
.get_timeout = scope_get_timeout,
diff --git a/src/core/scope.h b/src/core/scope.h
index c9574a3..1090431 100644
--- a/src/core/scope.h
+++ b/src/core/scope.h
@@ -21,6 +21,7 @@ struct Scope {
CGroupContext cgroup_context;
KillContext kill_context;
+ CGroupRuntime *cgroup_runtime;
ScopeState state, deserialized_state;
ScopeResult result;
diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c
index 62181a6..a67a520 100644
--- a/src/core/selinux-access.c
+++ b/src/core/selinux-access.c
@@ -193,7 +193,6 @@ int mac_selinux_access_check_internal(
assert(message);
assert(permission);
assert(function);
- assert(error);
r = access_init(error);
if (r <= 0)
@@ -248,7 +247,7 @@ int mac_selinux_access_check_internal(
tclass = "system";
}
- sd_bus_creds_get_cmdline(creds, &cmdline);
+ (void) sd_bus_creds_get_cmdline(creds, &cmdline);
cl = strv_join(cmdline, " ");
struct audit_info audit_info = {
@@ -268,7 +267,7 @@ int mac_selinux_access_check_internal(
log_full_errno_zerook(LOG_DEBUG, r,
"SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m",
- scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl)));
+ scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), empty_to_na(cl));
return enforce ? r : 0;
}
diff --git a/src/core/service.c b/src/core/service.c
index ffe92d2..8ec27c4 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -24,6 +24,7 @@
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
+#include "io-util.h"
#include "load-dropin.h"
#include "load-fragment.h"
#include "log.h"
@@ -34,6 +35,7 @@
#include "path-util.h"
#include "process-util.h"
#include "random-util.h"
+#include "selinux-util.h"
#include "serialize.h"
#include "service.h"
#include "signal-util.h"
@@ -49,61 +51,61 @@
#define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__)
static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
- [SERVICE_DEAD] = UNIT_INACTIVE,
- [SERVICE_CONDITION] = UNIT_ACTIVATING,
- [SERVICE_START_PRE] = UNIT_ACTIVATING,
- [SERVICE_START] = UNIT_ACTIVATING,
- [SERVICE_START_POST] = UNIT_ACTIVATING,
- [SERVICE_RUNNING] = UNIT_ACTIVE,
- [SERVICE_EXITED] = UNIT_ACTIVE,
- [SERVICE_RELOAD] = UNIT_RELOADING,
- [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
- [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
- [SERVICE_STOP] = UNIT_DEACTIVATING,
- [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
- [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
- [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
- [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
- [SERVICE_FAILED] = UNIT_FAILED,
- [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVATING,
+ [SERVICE_START_PRE] = UNIT_ACTIVATING,
+ [SERVICE_START] = UNIT_ACTIVATING,
+ [SERVICE_START_POST] = UNIT_ACTIVATING,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+ [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
[SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
- [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
- [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
- [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
- [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+ [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
};
/* For Type=idle we never want to delay any other jobs, hence we
* consider idle jobs active as soon as we start working on them */
static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = {
- [SERVICE_DEAD] = UNIT_INACTIVE,
- [SERVICE_CONDITION] = UNIT_ACTIVE,
- [SERVICE_START_PRE] = UNIT_ACTIVE,
- [SERVICE_START] = UNIT_ACTIVE,
- [SERVICE_START_POST] = UNIT_ACTIVE,
- [SERVICE_RUNNING] = UNIT_ACTIVE,
- [SERVICE_EXITED] = UNIT_ACTIVE,
- [SERVICE_RELOAD] = UNIT_RELOADING,
- [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
- [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
- [SERVICE_STOP] = UNIT_DEACTIVATING,
- [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
- [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
- [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
- [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
- [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
- [SERVICE_FAILED] = UNIT_FAILED,
- [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVE,
+ [SERVICE_START_PRE] = UNIT_ACTIVE,
+ [SERVICE_START] = UNIT_ACTIVE,
+ [SERVICE_START_POST] = UNIT_ACTIVE,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+ [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
[SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
- [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
- [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
- [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
- [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+ [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
};
static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
@@ -114,6 +116,25 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
static void service_enter_reload_by_notify(Service *s);
+static bool SERVICE_STATE_WITH_MAIN_PROCESS(ServiceState state) {
+ return IN_SET(state,
+ SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL);
+}
+
+static bool SERVICE_STATE_WITH_CONTROL_PROCESS(ServiceState state) {
+ return IN_SET(state,
+ SERVICE_CONDITION,
+ SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_CLEANING);
+}
+
static void service_init(Unit *u) {
Service *s = SERVICE(u);
@@ -151,25 +172,17 @@ static void service_init(Unit *u) {
static void service_unwatch_control_pid(Service *s) {
assert(s);
-
- if (!pidref_is_set(&s->control_pid))
- return;
-
- unit_unwatch_pidref(UNIT(s), &s->control_pid);
- pidref_done(&s->control_pid);
+ unit_unwatch_pidref_done(UNIT(s), &s->control_pid);
}
static void service_unwatch_main_pid(Service *s) {
assert(s);
-
- if (!pidref_is_set(&s->main_pid))
- return;
-
- unit_unwatch_pidref(UNIT(s), &s->main_pid);
- pidref_done(&s->main_pid);
+ unit_unwatch_pidref_done(UNIT(s), &s->main_pid);
}
static void service_unwatch_pid_file(Service *s) {
+ assert(s);
+
if (!s->pid_file_pathspec)
return;
@@ -179,42 +192,41 @@ static void service_unwatch_pid_file(Service *s) {
s->pid_file_pathspec = mfree(s->pid_file_pathspec);
}
-static int service_set_main_pidref(Service *s, PidRef *pidref) {
+static int service_set_main_pidref(Service *s, PidRef pidref_consume, const dual_timestamp *start_timestamp) {
+ _cleanup_(pidref_done) PidRef pidref = pidref_consume;
int r;
assert(s);
- /* Takes ownership of the specified pidref on success, but not on failure. */
+ /* Takes ownership of the specified pidref on both success and failure. */
- if (!pidref_is_set(pidref))
+ if (!pidref_is_set(&pidref))
return -ESRCH;
- if (pidref->pid <= 1)
+ if (pidref.pid <= 1)
return -EINVAL;
- if (pidref_is_self(pidref))
+ if (pidref_is_self(&pidref))
return -EINVAL;
- if (pidref_equal(&s->main_pid, pidref) && s->main_pid_known) {
- pidref_done(pidref);
+ if (s->main_pid_known && pidref_equal(&s->main_pid, &pidref))
return 0;
- }
- if (!pidref_equal(&s->main_pid, pidref)) {
+ if (!pidref_equal(&s->main_pid, &pidref)) {
service_unwatch_main_pid(s);
- exec_status_start(&s->main_exec_status, pidref->pid);
+ exec_status_start(&s->main_exec_status, pidref.pid, start_timestamp);
}
- s->main_pid = TAKE_PIDREF(*pidref);
+ s->main_pid = TAKE_PIDREF(pidref);
s->main_pid_known = true;
r = pidref_is_my_child(&s->main_pid);
if (r < 0)
log_unit_warning_errno(UNIT(s), r, "Can't determine if process "PID_FMT" is our child, assuming it is not: %m", s->main_pid.pid);
- else if (r == 0)
+ else if (r == 0) // FIXME: Supervise through pidfd here
log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", s->main_pid.pid);
-
s->main_pid_alien = r <= 0;
+
return 0;
}
@@ -290,7 +302,7 @@ static void service_start_watchdog(Service *s) {
/* Let's process everything else which might be a sign
* of living before we consider a service died. */
- r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE);
+ r = sd_event_source_set_priority(s->watchdog_event_source, EVENT_PRIORITY_SERVICE_WATCHDOG);
}
if (r < 0)
log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m");
@@ -429,7 +441,7 @@ static void service_release_fd_store(Service *s) {
static void service_release_stdio_fd(Service *s) {
assert(s);
- if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stdout_fd < 0)
+ if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0)
return;
log_unit_debug(UNIT(s), "Releasing stdin/stdout/stderr file descriptors.");
@@ -438,10 +450,9 @@ static void service_release_stdio_fd(Service *s) {
s->stdout_fd = asynchronous_close(s->stdout_fd);
s->stderr_fd = asynchronous_close(s->stderr_fd);
}
-static void service_done(Unit *u) {
- Service *s = SERVICE(u);
- assert(s);
+static void service_done(Unit *u) {
+ Service *s = ASSERT_PTR(SERVICE(u));
open_file_free_many(&s->open_files);
@@ -449,6 +460,7 @@ static void service_done(Unit *u) {
s->status_text = mfree(s->status_text);
s->exec_runtime = exec_runtime_free(s->exec_runtime);
+
exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
s->control_command = NULL;
s->main_command = NULL;
@@ -511,7 +523,8 @@ static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do
if (fstat(fd, &st) < 0)
return -errno;
- log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino);
+ log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64,
+ DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino);
if (s->n_fd_store >= s->n_fd_store_max)
/* Our store is full. Use this errno rather than E[NM]FILE to distinguish from the case
@@ -545,17 +558,16 @@ static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do
r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fs->fd, 0, on_fd_store_io, fs);
if (r < 0 && r != -EPERM) /* EPERM indicates fds that aren't pollable, which is OK */
return r;
- else if (r >= 0)
+ if (r >= 0)
(void) sd_event_source_set_description(fs->event_source, "service-fd-store");
}
+ log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname);
+
fs->service = s;
- LIST_PREPEND(fd_store, s->fd_store, fs);
+ LIST_PREPEND(fd_store, s->fd_store, TAKE_PTR(fs));
s->n_fd_store++;
- log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname);
-
- TAKE_PTR(fs);
return 1; /* fd newly stored */
}
@@ -654,9 +666,6 @@ static int service_verify(Service *s) {
if (s->type == SERVICE_ONESHOT && IN_SET(s->restart, SERVICE_RESTART_ALWAYS, SERVICE_RESTART_ON_SUCCESS))
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing.");
- if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status))
- return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing.");
-
if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP)
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. Refusing.");
@@ -856,7 +865,7 @@ static int service_add_extras(Service *s) {
}
static int service_load(Unit *u) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
r = unit_load_fragment_and_dropin(u, true);
@@ -901,21 +910,19 @@ static void service_dump_fdstore(Service *s, FILE *f, const char *prefix) {
"%s%s '%s' (type=%s; dev=" DEVNUM_FORMAT_STR "; inode=%" PRIu64 "; rdev=" DEVNUM_FORMAT_STR "; path=%s; access=%s)\n",
prefix, i == s->fd_store ? "File Descriptor Store Entry:" : " ",
i->fdname,
- inode_type_to_string(st.st_mode),
+ strna(inode_type_to_string(st.st_mode)),
DEVNUM_FORMAT_VAL(st.st_dev),
(uint64_t) st.st_ino,
DEVNUM_FORMAT_VAL(st.st_rdev),
strna(path),
- accmode_to_string(flags));
+ strna(accmode_to_string(flags)));
}
}
static void service_dump(Unit *u, FILE *f, const char *prefix) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
const char *prefix2;
- assert(s);
-
prefix = strempty(prefix);
prefix2 = strjoina(prefix, "\t");
@@ -1016,8 +1023,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
if (!s->exec_command[c])
continue;
- fprintf(f, "%s-> %s:\n",
- prefix, service_exec_command_to_string(c));
+ fprintf(f, "%s%s %s:\n",
+ prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), service_exec_command_to_string(c));
exec_command_dump_list(s->exec_command[c], f, prefix2);
}
@@ -1159,7 +1166,7 @@ static int service_load_pid_file(Service *s, bool may_warn) {
} else
log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pidref.pid);
- r = service_set_main_pidref(s, &pidref);
+ r = service_set_main_pidref(s, TAKE_PIDREF(pidref), /* start_timestamp = */ NULL);
if (r < 0)
return r;
@@ -1189,7 +1196,7 @@ static void service_search_main_pid(Service *s) {
return;
log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid.pid);
- if (service_set_main_pidref(s, &pid) < 0)
+ if (service_set_main_pidref(s, TAKE_PIDREF(pid), /* start_timestamp = */ NULL) < 0)
return;
r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
@@ -1224,22 +1231,12 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_CLEANING))
s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
- if (!IN_SET(state,
- SERVICE_START, SERVICE_START_POST,
- SERVICE_RUNNING,
- SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
- SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
- SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
+ if (!SERVICE_STATE_WITH_MAIN_PROCESS(state)) {
service_unwatch_main_pid(s);
s->main_command = NULL;
}
- if (!IN_SET(state,
- SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
- SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
- SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
- SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
- SERVICE_CLEANING)) {
+ if (!SERVICE_STATE_WITH_CONTROL_PROCESS(state)) {
service_unwatch_control_pid(s);
s->control_command = NULL;
s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
@@ -1326,12 +1323,7 @@ static int service_coldplug(Unit *u) {
if (pidref_is_set(&s->main_pid) &&
pidref_is_unwaited(&s->main_pid) > 0 &&
- (IN_SET(s->deserialized_state,
- SERVICE_START, SERVICE_START_POST,
- SERVICE_RUNNING,
- SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
- SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
- SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
+ SERVICE_STATE_WITH_MAIN_PROCESS(s->deserialized_state)) {
r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
if (r < 0)
return r;
@@ -1339,12 +1331,7 @@ static int service_coldplug(Unit *u) {
if (pidref_is_set(&s->control_pid) &&
pidref_is_unwaited(&s->control_pid) > 0 &&
- IN_SET(s->deserialized_state,
- SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
- SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
- SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
- SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
- SERVICE_CLEANING)) {
+ SERVICE_STATE_WITH_CONTROL_PROCESS(s->deserialized_state)) {
r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
if (r < 0)
return r;
@@ -1357,6 +1344,7 @@ static int service_coldplug(Unit *u) {
SERVICE_DEAD_RESOURCES_PINNED)) {
(void) unit_enqueue_rewatch_pids(u);
(void) unit_setup_exec_runtime(u);
+ (void) unit_setup_cgroup_runtime(u);
}
if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
@@ -1418,13 +1406,12 @@ static int service_collect_fds(
UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_TRIGGERED_BY) {
_cleanup_free_ int *cfds = NULL;
- Socket *sock;
int cn_fds;
-
- if (u->type != UNIT_SOCKET)
- continue;
+ Socket *sock;
sock = SOCKET(u);
+ if (!sock)
+ continue;
cn_fds = socket_collect_fds(sock, &cfds);
if (cn_fds < 0)
@@ -1436,18 +1423,8 @@ static int service_collect_fds(
if (!rfds) {
rfds = TAKE_PTR(cfds);
rn_socket_fds = cn_fds;
- } else {
- int *t;
-
- t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int));
- if (!t)
- return -ENOMEM;
-
- memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int));
-
- rfds = t;
- rn_socket_fds += cn_fds;
- }
+ } else if (!GREEDY_REALLOC_APPEND(rfds, rn_socket_fds, cfds, cn_fds))
+ return -ENOMEM;
r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds);
if (r < 0)
@@ -1510,9 +1487,10 @@ static int service_allocate_exec_fd_event_source(
if (r < 0)
return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
- /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */
+ /* This is a bit higher priority than SIGCHLD, to make sure we don't confuse the case "failed to
+ * start" with the case "succeeded to start, but failed immediately after". */
- r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3);
+ r = sd_event_source_set_priority(source, EVENT_PRIORITY_EXEC_FD);
if (r < 0)
return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
@@ -1602,12 +1580,52 @@ static Service *service_get_triggering_service(Service *s) {
return NULL;
}
+static ExecFlags service_exec_flags(ServiceExecCommand command_id, ExecFlags cred_flag) {
+ /* All service main/control processes honor sandboxing and namespacing options (except those
+ explicitly excluded in service_spawn()) */
+ ExecFlags flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT;
+
+ assert(command_id >= 0);
+ assert(command_id < _SERVICE_EXEC_COMMAND_MAX);
+ assert((cred_flag & ~(EXEC_SETUP_CREDENTIALS_FRESH|EXEC_SETUP_CREDENTIALS)) == 0);
+ assert((cred_flag != 0) == (command_id == SERVICE_EXEC_START));
+
+ /* Control processes spawned before main process also get tty access */
+ if (IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
+ flags |= EXEC_APPLY_TTY_STDIN;
+
+ /* All start phases get access to credentials. ExecStartPre= gets a new credential store upon
+ * every invocation, so that updating credential files through it works. When the first main process
+ * starts, passed creds become stable. Also see 'cred_flag'. */
+ if (command_id == SERVICE_EXEC_START_PRE)
+ flags |= EXEC_SETUP_CREDENTIALS_FRESH;
+ if (command_id == SERVICE_EXEC_START_POST)
+ flags |= EXEC_SETUP_CREDENTIALS;
+
+ if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
+ flags |= EXEC_SETENV_MONITOR_RESULT;
+
+ if (command_id == SERVICE_EXEC_START)
+ return flags|cred_flag|EXEC_PASS_FDS|EXEC_SET_WATCHDOG;
+
+ flags |= EXEC_IS_CONTROL;
+
+ /* Put control processes spawned later than main process under .control sub-cgroup if appropriate */
+ if (!IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE))
+ flags |= EXEC_CONTROL_CGROUP;
+
+ if (IN_SET(command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST))
+ flags |= EXEC_SETENV_RESULT;
+
+ return flags;
+}
+
static int service_spawn_internal(
const char *caller,
Service *s,
ExecCommand *c,
- usec_t timeout,
ExecFlags flags,
+ usec_t timeout,
PidRef *ret_pid) {
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags);
@@ -1615,7 +1633,6 @@ static int service_spawn_internal(
_cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
size_t n_env = 0;
- pid_t pid;
int r;
assert(caller);
@@ -1631,7 +1648,7 @@ static int service_spawn_internal(
assert(!s->exec_fd_event_source);
- if (flags & EXEC_IS_CONTROL) {
+ if (FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL)) {
/* If this is a control process, mask the permissions/chroot application if this is requested. */
if (s->permissions_start_only)
exec_params.flags &= ~EXEC_APPLY_SANDBOXING;
@@ -1639,7 +1656,7 @@ static int service_spawn_internal(
exec_params.flags &= ~EXEC_APPLY_CHROOT;
}
- if ((flags & EXEC_PASS_FDS) ||
+ if (FLAGS_SET(exec_params.flags, EXEC_PASS_FDS) ||
s->exec_context.std_input == EXEC_INPUT_SOCKET ||
s->exec_context.std_output == EXEC_OUTPUT_SOCKET ||
s->exec_context.std_error == EXEC_OUTPUT_SOCKET) {
@@ -1654,10 +1671,12 @@ static int service_spawn_internal(
exec_params.open_files = s->open_files;
+ exec_params.flags |= EXEC_PASS_FDS;
+
log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
}
- if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+ if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
if (r < 0)
return r;
@@ -1671,7 +1690,7 @@ static int service_spawn_internal(
if (!our_env)
return -ENOMEM;
- if (service_exec_needs_notify_socket(s, flags)) {
+ if (service_exec_needs_notify_socket(s, exec_params.flags)) {
if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0)
return -ENOMEM;
@@ -1730,10 +1749,10 @@ static int service_spawn_internal(
Service *env_source = NULL;
const char *monitor_prefix;
- if (flags & EXEC_SETENV_RESULT) {
+ if (FLAGS_SET(exec_params.flags, EXEC_SETENV_RESULT)) {
env_source = s;
monitor_prefix = "";
- } else if (flags & EXEC_SETENV_MONITOR_RESULT) {
+ } else if (FLAGS_SET(exec_params.flags, EXEC_SETENV_MONITOR_RESULT)) {
env_source = service_get_triggering_service(s);
monitor_prefix = "MONITOR_";
}
@@ -1751,18 +1770,15 @@ static int service_spawn_internal(
r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status);
else
r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status));
-
if (r < 0)
return -ENOMEM;
}
if (env_source != s) {
- if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) {
- r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR,
- monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id));
- if (r < 0)
+ if (!sd_id128_is_null(UNIT(env_source)->invocation_id))
+ if (asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR,
+ monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id)) < 0)
return -ENOMEM;
- }
if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0)
return -ENOMEM;
@@ -1806,17 +1822,13 @@ static int service_spawn_internal(
&exec_params,
s->exec_runtime,
&s->cgroup_context,
- &pid);
+ &pidref);
if (r < 0)
return r;
s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
s->exec_fd_hot = false;
- r = pidref_set_pid(&pidref, pid);
- if (r < 0)
- return r;
-
r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
if (r < 0)
return r;
@@ -1864,10 +1876,10 @@ static int cgroup_good(Service *s) {
/* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't
* figure it out */
- if (!UNIT(s)->cgroup_path)
+ if (!s->cgroup_runtime || !s->cgroup_runtime->cgroup_path)
return 0;
- r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path);
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, s->cgroup_runtime->cgroup_path);
if (r < 0)
return r;
@@ -1876,6 +1888,7 @@ static int cgroup_good(Service *s) {
static bool service_shall_restart(Service *s, const char **reason) {
assert(s);
+ assert(reason);
/* Don't restart after manual stops */
if (s->forbid_restart) {
@@ -1891,6 +1904,13 @@ static bool service_shall_restart(Service *s, const char **reason) {
/* Restart if the exit code/status are configured as restart triggers */
if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) {
+ /* Don't allow Type=oneshot services to restart on success. Note that Restart=always/on-success
+ * is already rejected in service_verify. */
+ if (s->type == SERVICE_ONESHOT && s->result == SERVICE_SUCCESS) {
+ *reason = "service type and exit status";
+ return false;
+ }
+
*reason = "forced by exit status";
return true;
}
@@ -1962,7 +1982,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
} else if (s->result == SERVICE_SKIP_CONDITION) {
unit_log_skip(UNIT(s), service_result_to_string(s->result));
end_state = service_determine_dead_state(s);
- restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART;
+ restart_state = _SERVICE_STATE_INVALID; /* Never restart if skipped due to condition failure */
} else {
unit_log_failure(UNIT(s), service_result_to_string(s->result));
end_state = SERVICE_FAILED;
@@ -1984,8 +2004,10 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
if (allow_restart) {
usec_t restart_usec_next;
+ assert(restart_state >= 0 && restart_state < _SERVICE_STATE_MAX);
+
/* We make two state changes here: one that maps to the high-level UNIT_INACTIVE/UNIT_FAILED
- * state (i.e. a state indicating deactivation), and then one that that maps to the
+ * state (i.e. a state indicating deactivation), and then one that maps to the
* high-level UNIT_STARTING state (i.e. a state indicating activation). We do this so that
* external software can watch the state changes and see all service failures, even if they
* are only transitionary and followed by an automatic restart. We have fine-grained
@@ -1999,8 +2021,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
r = service_arm_timer(s, /* relative= */ true, restart_usec_next);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to install restart timer: %m");
- service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
- return;
+ return service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
}
log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0));
@@ -2064,8 +2085,8 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m");
@@ -2118,13 +2139,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
(void) unit_enqueue_rewatch_pids(UNIT(s));
kill_operation = state_to_kill_operation(s, state);
- r = unit_kill_context(
- UNIT(s),
- &s->kill_context,
- kill_operation,
- &s->main_pid,
- &s->control_pid,
- s->main_pid_alien);
+ r = unit_kill_context(UNIT(s), kill_operation);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
goto fail;
@@ -2193,8 +2208,8 @@ static void service_enter_stop(Service *s, ServiceResult f) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop' task: %m");
@@ -2209,6 +2224,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
static bool service_good(Service *s) {
int main_pid_ok;
+
assert(s);
if (s->type == SERVICE_DBUS && !s->bus_name_good)
@@ -2265,6 +2281,7 @@ static void service_enter_running(Service *s, ServiceResult f) {
static void service_enter_start_post(Service *s) {
int r;
+
assert(s);
service_unwatch_control_pid(s);
@@ -2277,8 +2294,8 @@ static void service_enter_start_post(Service *s) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m");
@@ -2387,43 +2404,44 @@ static void service_enter_start(Service *s) {
r = service_spawn(s,
c,
+ service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS_FRESH),
timeout,
- EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT,
&pidref);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start' task: %m");
goto fail;
}
- if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) {
- /* For simple services we immediately start
- * the START_POST binaries. */
+ assert(pidref.pid == c->exec_status.pid);
- (void) service_set_main_pidref(s, &pidref);
- service_enter_start_post(s);
-
- } else if (s->type == SERVICE_FORKING) {
+ switch (s->type) {
- /* For forking services we wait until the start
- * process exited. */
+ case SERVICE_SIMPLE:
+ case SERVICE_IDLE:
+ /* For simple services we immediately start the START_POST binaries. */
+ (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &c->exec_status.start_timestamp);
+ return service_enter_start_post(s);
+ case SERVICE_FORKING:
+ /* For forking services we wait until the start process exited. */
pidref_done(&s->control_pid);
s->control_pid = TAKE_PIDREF(pidref);
- service_set_state(s, SERVICE_START);
-
- } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC)) {
+ return service_set_state(s, SERVICE_START);
+
+ case SERVICE_ONESHOT: /* For oneshot services we wait until the start process exited, too, but it is our main process. */
+ case SERVICE_EXEC:
+ case SERVICE_DBUS:
+ case SERVICE_NOTIFY:
+ case SERVICE_NOTIFY_RELOAD:
+ /* For D-Bus services we know the main pid right away, but wait for the bus name to appear
+ * on the bus. 'notify' and 'exec' services wait for readiness notification and EOF
+ * on exec_fd, respectively. */
+ (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &c->exec_status.start_timestamp);
+ return service_set_state(s, SERVICE_START);
- /* For oneshot services we wait until the start process exited, too, but it is our main process. */
-
- /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
- * bus. 'notify' and 'exec' services are similar. */
-
- (void) service_set_main_pidref(s, &pidref);
- service_set_state(s, SERVICE_START);
- } else
+ default:
assert_not_reached();
-
- return;
+ }
fail:
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
@@ -2447,8 +2465,8 @@ static void service_enter_start_pre(Service *s) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m");
@@ -2484,10 +2502,9 @@ static void service_enter_condition(Service *s) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN,
&s->control_pid);
-
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'exec-condition' task: %m");
goto fail;
@@ -2527,11 +2544,9 @@ static void service_enter_restart(Service *s) {
/* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't
* fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset
* the counter explicitly however via the usual "systemctl reset-failure" logic. */
- s->n_restarts ++;
+ s->n_restarts++;
s->flush_n_restarts = false;
- s->notify_access_override = _NOTIFY_ACCESS_INVALID;
-
log_unit_struct(UNIT(s), LOG_INFO,
"MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR,
LOG_UNIT_INVOCATION_ID(UNIT(s)),
@@ -2595,8 +2610,8 @@ static void service_enter_reload(Service *s) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'reload' task: %m");
@@ -2651,13 +2666,8 @@ static void service_run_next_control(Service *s) {
r = service_spawn(s,
s->control_command,
+ service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
timeout,
- EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|
- (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD) ? EXEC_WRITE_CREDENTIALS : 0)|
- (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)|
- (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)|
- (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)|
- (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0),
&s->control_pid);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn next control task: %m");
@@ -2688,8 +2698,8 @@ static void service_run_next_main(Service *s) {
r = service_spawn(s,
s->main_command,
+ service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS),
s->timeout_start_usec,
- EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
&pidref);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to spawn next main task: %m");
@@ -2697,7 +2707,7 @@ static void service_run_next_main(Service *s) {
return;
}
- (void) service_set_main_pidref(s, &pidref);
+ (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &s->main_command->exec_status.start_timestamp);
}
static int service_start(Unit *u) {
@@ -2755,16 +2765,16 @@ static int service_start(Unit *u) {
s->flush_n_restarts = false;
}
- u->reset_accounting = true;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt)
+ crt->reset_accounting = true;
service_enter_condition(s);
return 1;
}
static int service_stop(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
/* Don't create restart jobs from manual stops. */
s->forbid_restart = true;
@@ -2821,9 +2831,7 @@ static int service_stop(Unit *u) {
}
static int service_reload(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED));
@@ -2832,9 +2840,7 @@ static int service_reload(Unit *u) {
}
static bool service_can_reload(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
return s->exec_command[SERVICE_EXEC_RELOAD] ||
s->type == SERVICE_NOTIFY_RELOAD;
@@ -2858,14 +2864,13 @@ static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const
}
static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) {
+ Service *s = ASSERT_PTR(SERVICE(u));
_cleanup_free_ char *args = NULL, *p = NULL;
- Service *s = SERVICE(u);
const char *type, *key;
ServiceExecCommand id;
size_t length = 0;
unsigned idx;
- assert(s);
assert(f);
if (!command)
@@ -2927,10 +2932,9 @@ static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *c
}
static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
- assert(u);
assert(f);
assert(fds);
@@ -2996,13 +3000,14 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
if (!c)
return log_oom();
- (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll);
+ (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %s", copy, c, one_zero(fs->do_poll));
}
if (s->main_exec_status.pid > 0) {
(void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
(void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+ (void) serialize_dual_timestamp(f, "main-exec-status-handoff", &s->main_exec_status.handoff_timestamp);
if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
(void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
@@ -3033,14 +3038,14 @@ int service_deserialize_exec_command(
const char *key,
const char *value) {
- Service *s = SERVICE(u);
- int r;
- unsigned idx = 0, i;
- bool control, found = false, last = false;
- ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID;
+ Service *s = ASSERT_PTR(SERVICE(u));
ExecCommand *command = NULL;
+ ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID;
_cleanup_free_ char *path = NULL;
_cleanup_strv_free_ char **argv = NULL;
+ unsigned idx = 0, i;
+ bool control, found = false, last = false;
+ int r;
enum ExecCommandState {
STATE_EXEC_COMMAND_TYPE,
@@ -3051,7 +3056,6 @@ int service_deserialize_exec_command(
_STATE_EXEC_COMMAND_INVALID = -EINVAL,
} state;
- assert(s);
assert(key);
assert(value);
@@ -3096,7 +3100,7 @@ int service_deserialize_exec_command(
case STATE_EXEC_COMMAND_ARGS:
r = strv_extend(&argv, arg);
if (r < 0)
- return -ENOMEM;
+ return r;
break;
default:
assert_not_reached();
@@ -3139,10 +3143,9 @@ int service_deserialize_exec_command(
}
static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -3179,10 +3182,10 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
(void) deserialize_pidref(fds, value, &s->control_pid);
} else if (streq(key, "main-pid")) {
- _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ PidRef pidref;
if (!pidref_is_set(&s->main_pid) && deserialize_pidref(fds, value, &pidref) >= 0)
- (void) service_set_main_pidref(s, &pidref);
+ (void) service_set_main_pidref(s, pidref, /* start_timestamp = */ NULL);
} else if (streq(key, "main-pid-known")) {
int b;
@@ -3239,9 +3242,9 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
_cleanup_close_ int fd = -EBADF;
int do_poll;
- r = extract_first_word(&value, &fdv, NULL, 0);
- if (r <= 0) {
- log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
+ r = extract_many_words(&value, " ", EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE, &fdv, &fdn, &fdp);
+ if (r < 2 || r > 3) {
+ log_unit_debug(u, "Failed to deserialize fd-store-fd, ignoring: %s", value);
return 0;
}
@@ -3249,24 +3252,17 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
if (fd < 0)
return 0;
- r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
- if (r <= 0) {
- log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
- return 0;
- }
-
- r = extract_first_word(&value, &fdp, NULL, 0);
- if (r == 0) {
- /* If the value is not present, we assume the default */
- do_poll = 1;
- } else if (r < 0 || (r = safe_atoi(fdp, &do_poll)) < 0) {
- log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\", ignoring: %m", value);
+ do_poll = r == 3 ? parse_boolean(fdp) : true;
+ if (do_poll < 0) {
+ log_unit_debug_errno(u, do_poll,
+ "Failed to deserialize fd-store-fd do_poll, ignoring: %s", fdp);
return 0;
}
r = service_add_fd_store(s, fd, fdn, do_poll);
if (r < 0) {
- log_unit_debug_errno(u, r, "Failed to store deserialized fd %i, ignoring: %m", fd);
+ log_unit_debug_errno(u, r,
+ "Failed to store deserialized fd '%s', ignoring: %m", fdn);
return 0;
}
@@ -3296,6 +3292,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
else if (streq(key, "main-exec-status-exit"))
deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+ else if (streq(key, "main-exec-status-handoff"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.handoff_timestamp);
else if (streq(key, "notify-access-override")) {
NotifyAccess notify_access;
@@ -3383,13 +3381,12 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
}
static UnitActiveState service_active_state(Unit *u) {
+ Service *s = ASSERT_PTR(SERVICE(u));
const UnitActiveState *table;
- assert(u);
-
- table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
+ table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
- return table[SERVICE(u)->state];
+ return table[s->state];
}
static const char *service_sub_state_to_string(Unit *u) {
@@ -3399,9 +3396,7 @@ static const char *service_sub_state_to_string(Unit *u) {
}
static bool service_may_gc(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
/* Never clean up services that still have a process around, even if the service is formally dead. Note that
* unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they
@@ -3422,6 +3417,7 @@ static bool service_may_gc(Unit *u) {
static int service_retry_pid_file(Service *s) {
int r;
+ assert(s);
assert(s->pid_file);
assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
@@ -3438,6 +3434,8 @@ static int service_retry_pid_file(Service *s) {
static int service_watch_pid_file(Service *s) {
int r;
+ assert(s);
+
log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path);
r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io);
@@ -3457,6 +3455,7 @@ static int service_watch_pid_file(Service *s) {
static int service_demand_pid_file(Service *s) {
_cleanup_free_ PathSpec *ps = NULL;
+ assert(s);
assert(s->pid_file);
assert(!s->pid_file_pathspec);
@@ -3485,11 +3484,8 @@ static int service_demand_pid_file(Service *s) {
static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
PathSpec *p = ASSERT_PTR(userdata);
- Service *s;
+ Service *s = ASSERT_PTR(SERVICE(p->unit));
- s = SERVICE(p->unit);
-
- assert(s);
assert(fd >= 0);
assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
assert(s->pid_file_pathspec);
@@ -3515,20 +3511,19 @@ fail:
}
static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
- Service *s = SERVICE(userdata);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(userdata));
log_unit_debug(UNIT(s), "got exec-fd event");
/* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
- * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
- * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
- * parent. We need to be careful however, as there are other reasons that we might cause the child's side of
- * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
- * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
- * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
- * sends a zero byte we'll ignore POLLHUP on the fd again. */
+ * successfully for it. We implement this through a pipe() towards the child, which the kernel
+ * automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on
+ * the pipe in the parent. We need to be careful however, as there are other reasons that we might
+ * cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that
+ * we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the
+ * execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child
+ * with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on
+ * the fd again. */
for (;;) {
uint8_t x;
@@ -3541,8 +3536,7 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
}
- if (n == 0) { /* EOF → the event we are waiting for */
-
+ if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */
s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@@ -3561,16 +3555,13 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
/* A byte was read → this turns on/off the exec fd logic */
assert(n == sizeof(x));
+
s->exec_fd_hot = x;
}
-
- return 0;
}
static void service_notify_cgroup_empty_event(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
log_unit_debug(u, "Control group is empty.");
@@ -3647,7 +3638,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
}
static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
if (managed_oom)
log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
@@ -3702,12 +3693,12 @@ static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
}
static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Service *s = ASSERT_PTR(SERVICE(u));
bool notify_dbus = true;
- Service *s = SERVICE(u);
ServiceResult f;
ExitClean clean_mode;
+ int r;
- assert(s);
assert(pid >= 0);
/* Oneshot services and non-SERVICE_EXEC_START commands should not be
@@ -3918,7 +3909,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
s->control_command->command_next &&
f == SERVICE_SUCCESS) {
- /* There is another command to * execute, so let's do that. */
+ /* There is another command to execute, so let's do that. */
log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state));
service_run_next_control(s);
@@ -3959,7 +3950,6 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
if (s->pid_file) {
bool has_start_post;
- int r;
/* Let's try to load the pid file here if we can.
* The PID file might actually be created by a START_POST
@@ -3986,8 +3976,6 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
if (s->pid_file) {
- int r;
-
r = service_load_pid_file(s, true);
if (r < 0) {
r = service_demand_pid_file(s);
@@ -4076,9 +4064,8 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
- Service *s = SERVICE(userdata);
+ Service *s = ASSERT_PTR(SERVICE(userdata));
- assert(s);
assert(source == s->timer_event_source);
switch (s->state) {
@@ -4275,10 +4262,9 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
}
static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) {
- Service *s = SERVICE(userdata);
+ Service *s = ASSERT_PTR(SERVICE(userdata));
usec_t watchdog_usec;
- assert(s);
assert(source == s->watchdog_event_source);
watchdog_usec = service_get_watchdog_usec(s);
@@ -4295,35 +4281,49 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
return 0;
}
-static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) {
+static void service_force_watchdog(Service *s) {
assert(s);
+ if (!UNIT(s)->manager->service_watchdogs)
+ return;
+
+ log_unit_error(UNIT(s), "Watchdog request (last status: %s)!",
+ s->status_text ?: "<unset>");
+
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+}
+
+static bool service_notify_message_authorized(Service *s, pid_t pid) {
+ assert(s);
+ assert(pid_is_valid(pid));
+
NotifyAccess notify_access = service_get_notify_access(s);
if (notify_access == NOTIFY_NONE) {
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid);
+ /* Warn level only if no notifications are expected */
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled", pid);
return false;
}
if (notify_access == NOTIFY_MAIN && pid != s->main_pid.pid) {
if (pidref_is_set(&s->main_pid))
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
else
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid);
return false;
}
if (notify_access == NOTIFY_EXEC && pid != s->main_pid.pid && pid != s->control_pid.pid) {
if (pidref_is_set(&s->main_pid) && pidref_is_set(&s->control_pid))
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT,
- pid, s->main_pid.pid, s->control_pid.pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT,
+ pid, s->main_pid.pid, s->control_pid.pid);
else if (pidref_is_set(&s->main_pid))
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
else if (pidref_is_set(&s->control_pid))
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid);
else
- log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid);
+ log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid);
return false;
}
@@ -4331,44 +4331,35 @@ static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds)
return true;
}
-static void service_force_watchdog(Service *s) {
- if (!UNIT(s)->manager->service_watchdogs)
- return;
-
- log_unit_error(UNIT(s), "Watchdog request (last status: %s)!",
- s->status_text ?: "<unset>");
-
- service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
-}
-
static void service_notify_message(
Unit *u,
const struct ucred *ucred,
char * const *tags,
FDSet *fds) {
- Service *s = SERVICE(u);
- bool notify_dbus = false;
- usec_t monotonic_usec = USEC_INFINITY;
- const char *e;
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
- assert(u);
assert(ucred);
- if (!service_notify_message_authorized(s, ucred->pid, fds))
+ if (!service_notify_message_authorized(s, ucred->pid))
return;
if (DEBUG_LOGGING) {
- _cleanup_free_ char *cc = NULL;
-
- cc = strv_join(tags, ", ");
+ _cleanup_free_ char *cc = strv_join(tags, ", ");
log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, empty_to_na(cc));
}
+ usec_t monotonic_usec = USEC_INFINITY;
+ bool notify_dbus = false;
+ const char *e;
+
/* Interpret MAINPID= */
e = strv_find_startswith(tags, "MAINPID=");
- if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) {
+ if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_SIGTERM)) {
+
_cleanup_(pidref_done) PidRef new_main_pid = PIDREF_NULL;
r = pidref_set_pidstr(&new_main_pid, e);
@@ -4384,10 +4375,10 @@ static void service_notify_message(
log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid.pid);
r = 1;
} else
- log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid);
+ log_unit_warning(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid);
}
if (r > 0) {
- (void) service_set_main_pidref(s, &new_main_pid);
+ (void) service_set_main_pidref(s, TAKE_PIDREF(new_main_pid), /* start_timestamp = */ NULL);
r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
if (r < 0)
@@ -4585,11 +4576,36 @@ static void service_notify_message(
unit_add_to_dbus_queue(u);
}
+static void service_handoff_timestamp(
+ Unit *u,
+ const struct ucred *ucred,
+ const dual_timestamp *ts) {
+
+ Service *s = ASSERT_PTR(SERVICE(u));
+
+ assert(ucred);
+ assert(ts);
+
+ if (s->main_pid.pid == ucred->pid) {
+ if (s->main_command)
+ exec_status_handoff(&s->main_command->exec_status, ucred, ts);
+
+ exec_status_handoff(&s->main_exec_status, ucred, ts);
+ } else if (s->control_pid.pid == ucred->pid && s->control_command)
+ exec_status_handoff(&s->control_command->exec_status, ucred, ts);
+ else
+ return;
+
+ unit_add_to_dbus_queue(u);
+}
+
static int service_get_timeout(Unit *u, usec_t *timeout) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
uint64_t t;
int r;
+ assert(timeout);
+
if (!s->timer_event_source)
return 0;
@@ -4604,7 +4620,7 @@ static int service_get_timeout(Unit *u, usec_t *timeout) {
}
static usec_t service_get_timeout_start_usec(Unit *u) {
- Service *s = SERVICE(ASSERT_PTR(u));
+ Service *s = ASSERT_PTR(SERVICE(u));
return s->timeout_start_usec;
}
@@ -4624,16 +4640,14 @@ static bool pick_up_pid_from_bus_name(Service *s) {
}
static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) {
+ Service *s = ASSERT_PTR(SERVICE(userdata));
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
const sd_bus_error *e;
- Unit *u = ASSERT_PTR(userdata);
uint32_t pid;
- Service *s;
int r;
assert(reply);
- s = SERVICE(u);
s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
if (!s->bus_name || !pick_up_pid_from_bus_name(s))
@@ -4658,20 +4672,17 @@ static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, s
return 1;
}
- log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid);
+ log_unit_debug(UNIT(s), "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid);
- (void) service_set_main_pidref(s, &pidref);
+ (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), /* start_timestamp = */ NULL);
(void) unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
return 1;
}
static void service_bus_name_owner_change(Unit *u, const char *new_owner) {
-
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
- assert(s);
-
if (new_owner)
log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner);
else
@@ -4721,7 +4732,7 @@ int service_set_socket_fd(
Service *s,
int fd,
Socket *sock,
- SocketPeer *peer,
+ SocketPeer *peer, /* reference to object is donated to us on success */
bool selinux_context_net) {
_cleanup_free_ char *peer_text = NULL;
@@ -4729,6 +4740,7 @@ int service_set_socket_fd(
assert(s);
assert(fd >= 0);
+ assert(sock);
/* This is called by the socket code when instantiating a new service for a stream socket and the socket needs
* to be configured. We take ownership of the passed fd on success. */
@@ -4760,12 +4772,13 @@ int service_set_socket_fd(
return r;
}
- r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT);
+ r = unit_add_two_dependencies(UNIT(s), UNIT_AFTER, UNIT_TRIGGERED_BY, UNIT(sock), false, UNIT_DEPENDENCY_IMPLICIT);
if (r < 0)
- return r;
+ return log_unit_debug_errno(UNIT(s), r,
+ "Failed to add After=/TriggeredBy= dependencies on socket unit: %m");
s->socket_fd = fd;
- s->socket_peer = socket_peer_ref(peer);
+ s->socket_peer = peer;
s->socket_fd_selinux_context_net = selinux_context_net;
unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock));
@@ -4773,9 +4786,7 @@ int service_set_socket_fd(
}
static void service_reset_failed(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
if (s->state == SERVICE_FAILED)
service_set_state(s, service_determine_dead_state(s));
@@ -4787,8 +4798,13 @@ static void service_reset_failed(Unit *u) {
s->flush_n_restarts = false;
}
-static PidRef* service_main_pid(Unit *u) {
- return &ASSERT_PTR(SERVICE(u))->main_pid;
+static PidRef* service_main_pid(Unit *u, bool *ret_is_alien) {
+ Service *s = ASSERT_PTR(SERVICE(u));
+
+ if (ret_is_alien)
+ *ret_is_alien = s->main_pid_alien;
+
+ return &s->main_pid;
}
static PidRef* service_control_pid(Unit *u) {
@@ -4796,9 +4812,7 @@ static PidRef* service_control_pid(Unit *u) {
}
static bool service_needs_console(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
/* We provide our own implementation of this here, instead of relying of the generic implementation
* unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */
@@ -4826,9 +4840,7 @@ static bool service_needs_console(Unit *u) {
}
static int service_exit_status(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
if (s->main_exec_status.pid <= 0 ||
!dual_timestamp_is_set(&s->main_exec_status.exit_timestamp))
@@ -4841,20 +4853,17 @@ static int service_exit_status(Unit *u) {
}
static const char* service_status_text(Unit *u) {
- Service *s = SERVICE(u);
-
- assert(s);
+ Service *s = ASSERT_PTR(SERVICE(u));
return s->status_text;
}
static int service_clean(Unit *u, ExecCleanMask mask) {
+ Service *s = ASSERT_PTR(SERVICE(u));
_cleanup_strv_free_ char **l = NULL;
bool may_clean_fdstore = false;
- Service *s = SERVICE(u);
int r;
- assert(s);
assert(mask != 0);
if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED))
@@ -4910,11 +4919,10 @@ fail:
}
static int service_can_clean(Unit *u, ExecCleanMask *ret) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
ExecCleanMask mask = 0;
int r;
- assert(s);
assert(ret);
r = exec_context_get_clean_mask(&s->exec_context, &mask);
@@ -4928,10 +4936,12 @@ static int service_can_clean(Unit *u, ExecCleanMask *ret) {
return 0;
}
-static const char *service_finished_job(Unit *u, JobType t, JobResult result) {
+static const char* service_finished_job(Unit *u, JobType t, JobResult result) {
+ Service *s = ASSERT_PTR(SERVICE(u));
+
if (t == JOB_START &&
result == JOB_DONE &&
- SERVICE(u)->type == SERVICE_ONESHOT)
+ s->type == SERVICE_ONESHOT)
return "Finished %s.";
/* Fall back to generic */
@@ -4939,11 +4949,9 @@ static const char *service_finished_job(Unit *u, JobType t, JobResult result) {
}
static int service_can_start(Unit *u) {
- Service *s = SERVICE(u);
+ Service *s = ASSERT_PTR(SERVICE(u));
int r;
- assert(s);
-
/* Make sure we don't enter a busy loop of some kind. */
r = unit_test_start_limit(u);
if (r < 0) {
@@ -4955,7 +4963,7 @@ static int service_can_start(Unit *u) {
}
static void service_release_resources(Unit *u) {
- Service *s = SERVICE(ASSERT_PTR(u));
+ Service *s = ASSERT_PTR(SERVICE(u));
/* Invoked by the unit state engine, whenever it realizes that unit is dead and there's no job
* anymore for it, and it hence is a good idea to release resources */
@@ -4978,6 +4986,52 @@ static void service_release_resources(Unit *u) {
service_set_state(s, SERVICE_DEAD);
}
+int service_determine_exec_selinux_label(Service *s, char **ret) {
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ if (!mac_selinux_use())
+ return -ENODATA;
+
+ /* Returns the SELinux label used for execution of the main service binary */
+
+ if (s->exec_context.selinux_context)
+ /* Prefer the explicitly configured label if there is one */
+ return strdup_to(ret, s->exec_context.selinux_context);
+
+ if (s->exec_context.root_image ||
+ s->exec_context.n_extension_images > 0 ||
+ !strv_isempty(s->exec_context.extension_directories)) /* We cannot chase paths through images */
+ return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(ENODATA), "Service with RootImage=, ExtensionImages= or ExtensionDirectories= set, cannot determine socket SELinux label before activation, ignoring.");
+
+ ExecCommand *c = s->exec_command[SERVICE_EXEC_START];
+ if (!c)
+ return -ENODATA;
+
+ _cleanup_free_ char *path = NULL;
+ r = chase(c->path, s->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL);
+ if (r < 0) {
+ log_unit_debug_errno(UNIT(s), r, "Failed to resolve service binary '%s', ignoring.", c->path);
+ return -ENODATA;
+ }
+
+ r = mac_selinux_get_create_label_from_exe(path, ret);
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
+ log_unit_debug_errno(UNIT(s), r, "Reading SELinux label off binary '%s' is not supported, ignoring.", path);
+ return -ENODATA;
+ }
+ if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+ log_unit_debug_errno(UNIT(s), r, "Can't read SELinux label off binary '%s', due to privileges, ignoring.", path);
+ return -ENODATA;
+ }
+ if (r < 0)
+ return log_unit_debug_errno(UNIT(s), r, "Failed to read SELinux label off binary '%s': %m", path);
+
+ return 0;
+}
+
static const char* const service_restart_table[_SERVICE_RESTART_MAX] = {
[SERVICE_RESTART_NO] = "no",
[SERVICE_RESTART_ON_SUCCESS] = "on-success",
@@ -4992,7 +5046,7 @@ DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart);
static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = {
[SERVICE_RESTART_MODE_NORMAL] = "normal",
- [SERVICE_RESTART_MODE_DIRECT] = "direct",
+ [SERVICE_RESTART_MODE_DIRECT] = "direct",
};
DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode);
@@ -5080,6 +5134,7 @@ const UnitVTable service_vtable = {
.cgroup_context_offset = offsetof(Service, cgroup_context),
.kill_context_offset = offsetof(Service, kill_context),
.exec_runtime_offset = offsetof(Service, exec_runtime),
+ .cgroup_runtime_offset = offsetof(Service, cgroup_runtime),
.sections =
"Unit\0"
@@ -5110,8 +5165,7 @@ const UnitVTable service_vtable = {
.clean = service_clean,
.can_clean = service_can_clean,
- .freeze = unit_freeze_vtable_common,
- .thaw = unit_thaw_vtable_common,
+ .freezer_action = unit_cgroup_freezer_action,
.serialize = service_serialize,
.deserialize_item = service_deserialize_item,
@@ -5130,6 +5184,7 @@ const UnitVTable service_vtable = {
.notify_cgroup_empty = service_notify_cgroup_empty_event,
.notify_cgroup_oom = service_notify_cgroup_oom_event,
.notify_message = service_notify_message,
+ .notify_handoff_timestamp = service_handoff_timestamp,
.main_pid = service_main_pid,
.control_pid = service_control_pid,
diff --git a/src/core/service.h b/src/core/service.h
index e85302e..59598f7 100644
--- a/src/core/service.h
+++ b/src/core/service.h
@@ -168,6 +168,8 @@ struct Service {
/* Runtime data of the execution context */
ExecRuntime *exec_runtime;
+ CGroupRuntime *cgroup_runtime;
+
PidRef main_pid, control_pid;
/* if we are a socket activated service instance, store information of the connection/peer/socket */
@@ -255,6 +257,8 @@ void service_release_socket_fd(Service *s);
usec_t service_restart_usec_next(Service *s);
+int service_determine_exec_selinux_label(Service *s, char **ret);
+
const char* service_restart_to_string(ServiceRestart i) _const_;
ServiceRestart service_restart_from_string(const char *s) _pure_;
diff --git a/src/core/show-status.c b/src/core/show-status.c
index 5b003ba..57ad4db 100644
--- a/src/core/show-status.c
+++ b/src/core/show-status.c
@@ -38,13 +38,13 @@ int parse_show_status(const char *v, ShowStatus *ret) {
int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) {
static const char status_indent[] = " "; /* "[" STATUS "] " */
+ static bool prev_ephemeral = false;
static int dumb = -1;
_cleanup_free_ char *s = NULL;
_cleanup_close_ int fd = -EBADF;
struct iovec iovec[7] = {};
int n = 0;
- static bool prev_ephemeral;
assert(format);
@@ -75,7 +75,7 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format
if (c <= 0)
c = 80;
- sl = status ? sizeof(status_indent)-1 : 0;
+ sl = status ? strlen(status_indent) : 0;
emax = c - sl - 1;
if (emax < 3)
diff --git a/src/core/slice.c b/src/core/slice.c
index fb4f23c..4e71976 100644
--- a/src/core/slice.c
+++ b/src/core/slice.c
@@ -16,8 +16,8 @@
#include "unit.h"
static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = {
- [SLICE_DEAD] = UNIT_INACTIVE,
- [SLICE_ACTIVE] = UNIT_ACTIVE
+ [SLICE_DEAD] = UNIT_INACTIVE,
+ [SLICE_ACTIVE] = UNIT_ACTIVE,
};
static void slice_init(Unit *u) {
@@ -27,32 +27,29 @@ static void slice_init(Unit *u) {
u->ignore_on_isolate = true;
}
-static void slice_set_state(Slice *t, SliceState state) {
+static void slice_set_state(Slice *s, SliceState state) {
SliceState old_state;
- assert(t);
- if (t->state != state)
- bus_unit_send_pending_change_signal(UNIT(t), false);
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
- old_state = t->state;
- t->state = state;
+ old_state = s->state;
+ s->state = state;
if (state != old_state)
- log_debug("%s changed %s -> %s",
- UNIT(t)->id,
- slice_state_to_string(old_state),
- slice_state_to_string(state));
+ log_unit_debug(UNIT(s), "Changed %s -> %s",
+ slice_state_to_string(old_state), slice_state_to_string(state));
- unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
}
static int slice_add_parent_slice(Slice *s) {
- Unit *u = UNIT(s);
+ Unit *u = UNIT(ASSERT_PTR(s));
_cleanup_free_ char *a = NULL;
int r;
- assert(s);
-
if (UNIT_GET_SLICE(u))
return 0;
@@ -151,10 +148,9 @@ static int slice_load_system_slice(Unit *u) {
}
static int slice_load(Unit *u) {
- Slice *s = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
int r;
- assert(s);
assert(u->load_state == UNIT_STUB);
r = slice_load_root_slice(u);
@@ -196,36 +192,35 @@ static int slice_load(Unit *u) {
}
static int slice_coldplug(Unit *u) {
- Slice *t = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- assert(t);
- assert(t->state == SLICE_DEAD);
+ assert(s->state == SLICE_DEAD);
- if (t->deserialized_state != t->state)
- slice_set_state(t, t->deserialized_state);
+ if (s->deserialized_state != s->state)
+ slice_set_state(s, s->deserialized_state);
return 0;
}
static void slice_dump(Unit *u, FILE *f, const char *prefix) {
- Slice *t = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- assert(t);
+ assert(s);
assert(f);
+ assert(prefix);
fprintf(f,
"%sSlice State: %s\n",
- prefix, slice_state_to_string(t->state));
+ prefix, slice_state_to_string(s->state));
- cgroup_context_dump(UNIT(t), f, prefix);
+ cgroup_context_dump(u, f, prefix);
}
static int slice_start(Unit *u) {
- Slice *t = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
int r;
- assert(t);
- assert(t->state == SLICE_DEAD);
+ assert(s->state == SLICE_DEAD);
r = unit_acquire_invocation_id(u);
if (r < 0)
@@ -234,27 +229,25 @@ static int slice_start(Unit *u) {
(void) unit_realize_cgroup(u);
(void) unit_reset_accounting(u);
- slice_set_state(t, SLICE_ACTIVE);
+ slice_set_state(s, SLICE_ACTIVE);
return 1;
}
static int slice_stop(Unit *u) {
- Slice *t = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- assert(t);
- assert(t->state == SLICE_ACTIVE);
+ assert(s->state == SLICE_ACTIVE);
/* We do not need to destroy the cgroup explicitly,
* unit_notify() will do that for us anyway. */
- slice_set_state(t, SLICE_DEAD);
+ slice_set_state(s, SLICE_DEAD);
return 1;
}
static int slice_serialize(Unit *u, FILE *f, FDSet *fds) {
- Slice *s = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- assert(s);
assert(f);
assert(fds);
@@ -264,9 +257,8 @@ static int slice_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Slice *s = SLICE(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -276,26 +268,26 @@ static int slice_deserialize_item(Unit *u, const char *key, const char *value, F
state = slice_state_from_string(value);
if (state < 0)
- log_debug("Failed to parse state value %s", value);
+ log_unit_debug(u, "Failed to parse state: %s", value);
else
s->deserialized_state = state;
} else
- log_debug("Unknown serialization key '%s'", key);
+ log_unit_debug(u, "Unknown serialization key: %s", key);
return 0;
}
static UnitActiveState slice_active_state(Unit *u) {
- assert(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- return state_translation_table[SLICE(u)->state];
+ return state_translation_table[s->state];
}
static const char *slice_sub_state_to_string(Unit *u) {
- assert(u);
+ Slice *s = ASSERT_PTR(SLICE(u));
- return slice_state_to_string(SLICE(u)->state);
+ return slice_state_to_string(s->state);
}
static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) {
@@ -347,46 +339,47 @@ static void slice_enumerate_perpetual(Manager *m) {
(void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL);
}
-static bool slice_freezer_action_supported_by_children(Unit *s) {
+static bool slice_can_freeze(Unit *s) {
Unit *member;
assert(s);
- UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
-
- if (member->type == UNIT_SLICE &&
- !slice_freezer_action_supported_by_children(member))
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF)
+ if (!unit_can_freeze(member))
return false;
-
- if (!UNIT_VTABLE(member)->freeze)
- return false;
- }
-
return true;
}
static int slice_freezer_action(Unit *s, FreezerAction action) {
+ FreezerAction child_action;
Unit *member;
int r;
assert(s);
- assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
-
- if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) {
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
+ FREEZER_THAW, FREEZER_PARENT_THAW));
+
+ if (action == FREEZER_FREEZE && !slice_can_freeze(s)) {
+ /* We're intentionally only checking for FREEZER_FREEZE here and ignoring the
+ * FREEZER_PARENT_FREEZE variant. If we're being frozen by our parent, that means someone
+ * has already checked whether we can be frozen further up the call stack. No point in
+ * redoing that work. */
log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice");
return 0;
}
- UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
- if (!member->cgroup_realized)
- continue;
+ if (action == FREEZER_FREEZE)
+ child_action = FREEZER_PARENT_FREEZE;
+ else if (action == FREEZER_THAW)
+ child_action = FREEZER_PARENT_THAW;
+ else
+ child_action = action;
- if (action == FREEZER_FREEZE)
- r = UNIT_VTABLE(member)->freeze(member);
- else if (UNIT_VTABLE(member)->thaw)
- r = UNIT_VTABLE(member)->thaw(member);
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
+ if (UNIT_VTABLE(member)->freezer_action)
+ r = UNIT_VTABLE(member)->freezer_action(member, child_action);
else
- /* Thawing is requested but no corresponding method is available, ignore. */
+ /* Only thawing will reach here, since freezing checks for a method in can_freeze */
r = 0;
if (r < 0)
return r;
@@ -395,27 +388,10 @@ static int slice_freezer_action(Unit *s, FreezerAction action) {
return unit_cgroup_freezer_action(s, action);
}
-static int slice_freeze(Unit *s) {
- assert(s);
-
- return slice_freezer_action(s, FREEZER_FREEZE);
-}
-
-static int slice_thaw(Unit *s) {
- assert(s);
-
- return slice_freezer_action(s, FREEZER_THAW);
-}
-
-static bool slice_can_freeze(Unit *s) {
- assert(s);
-
- return slice_freezer_action_supported_by_children(s);
-}
-
const UnitVTable slice_vtable = {
.object_size = sizeof(Slice),
.cgroup_context_offset = offsetof(Slice, cgroup_context),
+ .cgroup_runtime_offset = offsetof(Slice, cgroup_runtime),
.sections =
"Unit\0"
@@ -436,8 +412,7 @@ const UnitVTable slice_vtable = {
.start = slice_start,
.stop = slice_stop,
- .freeze = slice_freeze,
- .thaw = slice_thaw,
+ .freezer_action = slice_freezer_action,
.can_freeze = slice_can_freeze,
.serialize = slice_serialize,
diff --git a/src/core/slice.h b/src/core/slice.h
index e2f9274..004349d 100644
--- a/src/core/slice.h
+++ b/src/core/slice.h
@@ -11,6 +11,8 @@ struct Slice {
SliceState state, deserialized_state;
CGroupContext cgroup_context;
+
+ CGroupRuntime *cgroup_runtime;
};
extern const UnitVTable slice_vtable;
diff --git a/src/core/socket.c b/src/core/socket.c
index 9adae16..41147d4 100644
--- a/src/core/socket.c
+++ b/src/core/socket.c
@@ -53,29 +53,44 @@ struct SocketPeer {
Socket *socket;
union sockaddr_union peer;
socklen_t peer_salen;
+ struct ucred peer_cred;
};
static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = {
- [SOCKET_DEAD] = UNIT_INACTIVE,
- [SOCKET_START_PRE] = UNIT_ACTIVATING,
- [SOCKET_START_CHOWN] = UNIT_ACTIVATING,
- [SOCKET_START_POST] = UNIT_ACTIVATING,
- [SOCKET_LISTENING] = UNIT_ACTIVE,
- [SOCKET_RUNNING] = UNIT_ACTIVE,
- [SOCKET_STOP_PRE] = UNIT_DEACTIVATING,
+ [SOCKET_DEAD] = UNIT_INACTIVE,
+ [SOCKET_START_PRE] = UNIT_ACTIVATING,
+ [SOCKET_START_CHOWN] = UNIT_ACTIVATING,
+ [SOCKET_START_POST] = UNIT_ACTIVATING,
+ [SOCKET_LISTENING] = UNIT_ACTIVE,
+ [SOCKET_RUNNING] = UNIT_ACTIVE,
+ [SOCKET_STOP_PRE] = UNIT_DEACTIVATING,
[SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING,
[SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING,
- [SOCKET_STOP_POST] = UNIT_DEACTIVATING,
- [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING,
- [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING,
- [SOCKET_FAILED] = UNIT_FAILED,
- [SOCKET_CLEANING] = UNIT_MAINTENANCE,
+ [SOCKET_STOP_POST] = UNIT_DEACTIVATING,
+ [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SOCKET_FAILED] = UNIT_FAILED,
+ [SOCKET_CLEANING] = UNIT_MAINTENANCE,
};
static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
static void flush_ports(Socket *s);
+static bool SOCKET_STATE_WITH_PROCESS(SocketState state) {
+ return IN_SET(state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING);
+}
+
static void socket_init(Unit *u) {
Socket *s = SOCKET(u);
@@ -108,12 +123,7 @@ static void socket_init(Unit *u) {
static void socket_unwatch_control_pid(Socket *s) {
assert(s);
-
- if (!pidref_is_set(&s->control_pid))
- return;
-
- unit_unwatch_pidref(UNIT(s), &s->control_pid);
- pidref_done(&s->control_pid);
+ unit_unwatch_pidref_done(UNIT(s), &s->control_pid);
}
static void socket_cleanup_fd_list(SocketPort *p) {
@@ -144,11 +154,9 @@ void socket_free_ports(Socket *s) {
}
static void socket_done(Unit *u) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
SocketPeer *p;
- assert(s);
-
socket_free_ports(s);
while ((p = set_steal_first(s->peers_by_address)))
@@ -157,6 +165,7 @@ static void socket_done(Unit *u) {
s->peers_by_address = set_free(s->peers_by_address);
s->exec_runtime = exec_runtime_free(s->exec_runtime);
+
exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
s->control_command = NULL;
@@ -221,7 +230,7 @@ static int socket_add_mount_dependencies(Socket *s) {
if (!path)
continue;
- r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
@@ -243,6 +252,7 @@ static int socket_add_device_dependencies(Socket *s) {
static int socket_add_default_dependencies(Socket *s) {
int r;
+
assert(s);
if (!UNIT(s)->default_dependencies)
@@ -263,6 +273,7 @@ static int socket_add_default_dependencies(Socket *s) {
static bool socket_has_exec(Socket *s) {
unsigned i;
+
assert(s);
for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++)
@@ -273,11 +284,9 @@ static bool socket_has_exec(Socket *s) {
}
static int socket_add_extras(Socket *s) {
- Unit *u = UNIT(s);
+ Unit *u = UNIT(ASSERT_PTR(s));
int r;
- assert(s);
-
/* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit
* in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept()
* ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly
@@ -406,11 +415,13 @@ static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) {
assert(s);
if (s->peer.sa.sa_family == AF_INET)
- siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state);
+ siphash24_compress_typesafe(s->peer.in.sin_addr, state);
else if (s->peer.sa.sa_family == AF_INET6)
- siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state);
+ siphash24_compress_typesafe(s->peer.in6.sin6_addr, state);
else if (s->peer.sa.sa_family == AF_VSOCK)
- siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state);
+ siphash24_compress_typesafe(s->peer.vm.svm_cid, state);
+ else if (s->peer.sa.sa_family == AF_UNIX)
+ siphash24_compress_typesafe(s->peer_cred.uid, state);
else
assert_not_reached();
}
@@ -429,6 +440,8 @@ static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) {
return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr));
case AF_VSOCK:
return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid);
+ case AF_UNIX:
+ return CMP(x->peer_cred.uid, y->peer_cred.uid);
}
assert_not_reached();
}
@@ -436,10 +449,9 @@ static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) {
DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func);
static int socket_load(Unit *u) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(u);
assert(u->load_state == UNIT_STUB);
r = unit_load_fragment_and_dropin(u, true);
@@ -457,16 +469,22 @@ static int socket_load(Unit *u) {
return socket_verify(s);
}
-static SocketPeer *socket_peer_new(void) {
+static SocketPeer *socket_peer_dup(const SocketPeer *q) {
SocketPeer *p;
+ assert(q);
+
p = new(SocketPeer, 1);
if (!p)
return NULL;
*p = (SocketPeer) {
.n_ref = 1,
+ .peer = q->peer,
+ .peer_salen = q->peer_salen,
+ .peer_cred = q->peer_cred,
};
+
return p;
}
@@ -483,36 +501,46 @@ DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free);
int socket_acquire_peer(Socket *s, int fd, SocketPeer **ret) {
_cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL;
- SocketPeer sa = {
+ SocketPeer key = {
.peer_salen = sizeof(union sockaddr_union),
+ .peer_cred = UCRED_INVALID,
}, *i;
int r;
- assert(fd >= 0);
assert(s);
+ assert(fd >= 0);
assert(ret);
- if (getpeername(fd, &sa.peer.sa, &sa.peer_salen) < 0)
+ if (getpeername(fd, &key.peer.sa, &key.peer_salen) < 0)
return log_unit_error_errno(UNIT(s), errno, "getpeername() failed: %m");
- if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+ switch (key.peer.sa.sa_family) {
+ case AF_INET:
+ case AF_INET6:
+ case AF_VSOCK:
+ break;
+
+ case AF_UNIX:
+ r = getpeercred(fd, &key.peer_cred);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to get peer credentials of socket: %m");
+ break;
+
+ default:
*ret = NULL;
return 0;
}
- i = set_get(s->peers_by_address, &sa);
+ i = set_get(s->peers_by_address, &key);
if (i) {
*ret = socket_peer_ref(i);
return 1;
}
- remote = socket_peer_new();
+ remote = socket_peer_dup(&key);
if (!remote)
return log_oom();
- remote->peer = sa.peer;
- remote->peer_salen = sa.peer_salen;
-
r = set_ensure_put(&s->peers_by_address, &peer_address_hash_ops, remote);
if (r < 0)
return log_unit_error_errno(UNIT(s), r, "Failed to insert peer info into hash table: %m");
@@ -540,10 +568,9 @@ static const char* listen_lookup(int family, int type) {
}
static void socket_dump(Unit *u, FILE *f, const char *prefix) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
const char *prefix2, *str;
- assert(s);
assert(f);
prefix = strempty(prefix);
@@ -563,6 +590,7 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) {
"%sTransparent: %s\n"
"%sBroadcast: %s\n"
"%sPassCredentials: %s\n"
+ "%sPassFileDescriptorsToExec: %s\n"
"%sPassSecurity: %s\n"
"%sPassPacketInfo: %s\n"
"%sTCPCongestion: %s\n"
@@ -583,6 +611,7 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) {
prefix, yes_no(s->transparent),
prefix, yes_no(s->broadcast),
prefix, yes_no(s->pass_cred),
+ prefix, yes_no(s->pass_fds_to_exec),
prefix, yes_no(s->pass_sec),
prefix, yes_no(s->pass_pktinfo),
prefix, strna(s->tcp_congestion),
@@ -776,8 +805,8 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) {
if (!s->exec_command[c])
continue;
- fprintf(f, "%s-> %s:\n",
- prefix, socket_exec_command_to_string(c));
+ fprintf(f, "%s%s %s:\n",
+ prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), socket_exec_command_to_string(c));
exec_command_dump_list(s->exec_command[c], f, prefix2);
}
@@ -1274,6 +1303,9 @@ static int socket_symlink(Socket *s) {
static int usbffs_write_descs(int fd, Service *s) {
int r;
+ assert(fd >= 0);
+ assert(s);
+
if (!s->usb_function_descriptors || !s->usb_function_strings)
return -EINVAL;
@@ -1339,12 +1371,17 @@ clear:
}
int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
+ int r;
+
/* Figure out what the unit that will be used to handle the connections on the socket looks like.
*
* If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake
* instance name.
*/
+ assert(s);
+ assert(ret);
+
if (UNIT_ISSET(s->service)) {
*ret = UNIT_DEREF(s->service);
return 0;
@@ -1355,7 +1392,6 @@ int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
/* Build the instance name and load the unit */
_cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL;
- int r;
r = unit_name_to_prefix(UNIT(s)->id, &prefix);
if (r < 0)
@@ -1385,50 +1421,26 @@ int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
}
static int socket_determine_selinux_label(Socket *s, char **ret) {
+ Unit *service;
int r;
assert(s);
assert(ret);
- Unit *service;
- ExecCommand *c;
- const char *exec_context;
- _cleanup_free_ char *path = NULL;
-
- r = socket_load_service_unit(s, -1, &service);
- if (r == -ENODATA)
- goto no_label;
+ r = socket_load_service_unit(s, /* cfd= */ -EBADF, &service);
+ if (r == -ENODATA) {
+ *ret = NULL;
+ return 0;
+ }
if (r < 0)
return r;
- exec_context = SERVICE(service)->exec_context.selinux_context;
- if (exec_context) {
- char *con;
-
- con = strdup(exec_context);
- if (!con)
- return -ENOMEM;
-
- *ret = TAKE_PTR(con);
+ r = service_determine_exec_selinux_label(SERVICE(service), ret);
+ if (r == -ENODATA) {
+ *ret = NULL;
return 0;
}
-
- c = SERVICE(service)->exec_command[SERVICE_EXEC_START];
- if (!c)
- goto no_label;
-
- r = chase(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL);
- if (r < 0)
- goto no_label;
-
- r = mac_selinux_get_create_label_from_exe(path, ret);
- if (IN_SET(r, -EPERM, -EOPNOTSUPP))
- goto no_label;
return r;
-
-no_label:
- *ret = NULL;
- return 0;
}
static int socket_address_listen_do(
@@ -1794,6 +1806,7 @@ static int socket_check_open(Socket *s) {
static void socket_set_state(Socket *s, SocketState state) {
SocketState old_state;
+
assert(s);
if (s->state != state)
@@ -1802,18 +1815,7 @@ static void socket_set_state(Socket *s, SocketState state) {
old_state = s->state;
s->state = state;
- if (!IN_SET(state,
- SOCKET_START_PRE,
- SOCKET_START_CHOWN,
- SOCKET_START_POST,
- SOCKET_STOP_PRE,
- SOCKET_STOP_PRE_SIGTERM,
- SOCKET_STOP_PRE_SIGKILL,
- SOCKET_STOP_POST,
- SOCKET_FINAL_SIGTERM,
- SOCKET_FINAL_SIGKILL,
- SOCKET_CLEANING)) {
-
+ if (!SOCKET_STATE_WITH_PROCESS(state)) {
s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
socket_unwatch_control_pid(s);
s->control_command = NULL;
@@ -1841,10 +1843,9 @@ static void socket_set_state(Socket *s, SocketState state) {
}
static int socket_coldplug(Unit *u) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(s);
assert(s->state == SOCKET_DEAD);
if (s->deserialized_state == s->state)
@@ -1852,17 +1853,7 @@ static int socket_coldplug(Unit *u) {
if (pidref_is_set(&s->control_pid) &&
pidref_is_unwaited(&s->control_pid) > 0 &&
- IN_SET(s->deserialized_state,
- SOCKET_START_PRE,
- SOCKET_START_CHOWN,
- SOCKET_START_POST,
- SOCKET_STOP_PRE,
- SOCKET_STOP_PRE_SIGTERM,
- SOCKET_STOP_PRE_SIGKILL,
- SOCKET_STOP_POST,
- SOCKET_FINAL_SIGTERM,
- SOCKET_FINAL_SIGKILL,
- SOCKET_CLEANING)) {
+ SOCKET_STATE_WITH_PROCESS(s->deserialized_state)) {
r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
if (r < 0)
@@ -1911,11 +1902,9 @@ static int socket_coldplug(Unit *u) {
}
static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) {
-
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- pid_t pid;
int r;
assert(s);
@@ -1934,17 +1923,33 @@ static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) {
if (r < 0)
return r;
+ /* Note that ExecStartPre= command doesn't inherit any FDs. It runs before we open listen FDs. */
+ if (s->pass_fds_to_exec) {
+ _cleanup_strv_free_ char **fd_names = NULL;
+ _cleanup_free_ int *fds = NULL;
+ int n_fds;
+
+ n_fds = socket_collect_fds(s, &fds);
+ if (n_fds < 0)
+ return n_fds;
+
+ r = strv_extend_n(&fd_names, socket_fdname(s), n_fds);
+ if (r < 0)
+ return r;
+
+ exec_params.flags |= EXEC_PASS_FDS;
+ exec_params.fds = TAKE_PTR(fds);
+ exec_params.fd_names = TAKE_PTR(fd_names);
+ exec_params.n_socket_fds = n_fds;
+ }
+
r = exec_spawn(UNIT(s),
c,
&s->exec_context,
&exec_params,
s->exec_runtime,
&s->cgroup_context,
- &pid);
- if (r < 0)
- return r;
-
- r = pidref_set_pid(&pidref, pid);
+ &pidref);
if (r < 0)
return r;
@@ -2052,6 +2057,7 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f);
static void socket_enter_stop_post(Socket *s, SocketResult f) {
int r;
+
assert(s);
if (s->result == SOCKET_SUCCESS)
@@ -2094,13 +2100,7 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
if (s->result == SOCKET_SUCCESS)
s->result = f;
- r = unit_kill_context(
- UNIT(s),
- &s->kill_context,
- state_to_kill_operation(s, state),
- /* main_pid= */ NULL,
- &s->control_pid,
- /* main_pid_alien= */ false);
+ r = unit_kill_context(UNIT(s), state_to_kill_operation(s, state));
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
goto fail;
@@ -2134,6 +2134,7 @@ fail:
static void socket_enter_stop_pre(Socket *s, SocketResult f) {
int r;
+
assert(s);
if (s->result == SOCKET_SUCCESS)
@@ -2160,6 +2161,7 @@ static void socket_enter_stop_pre(Socket *s, SocketResult f) {
static void socket_enter_listening(Socket *s) {
int r;
+
assert(s);
if (!s->accept && s->flush_pending) {
@@ -2179,6 +2181,7 @@ static void socket_enter_listening(Socket *s) {
static void socket_enter_start_post(Socket *s) {
int r;
+
assert(s);
socket_unwatch_control_pid(s);
@@ -2235,6 +2238,7 @@ fail:
static void socket_enter_start_pre(Socket *s) {
int r;
+
assert(s);
socket_unwatch_control_pid(s);
@@ -2278,7 +2282,6 @@ static void socket_enter_running(Socket *s, int cfd_in) {
/* Note that this call takes possession of the connection fd passed. It either has to assign it
* somewhere or close it. */
_cleanup_close_ int cfd = cfd_in;
-
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
int r;
@@ -2315,8 +2318,8 @@ static void socket_enter_running(Socket *s, int cfd_in) {
if (!pending) {
if (!UNIT_ISSET(s->service)) {
- r = log_unit_warning_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT),
- "Service to activate vanished, refusing activation.");
+ log_unit_warning(UNIT(s),
+ "Service to activate vanished, refusing activation.");
goto fail;
}
@@ -2347,7 +2350,10 @@ static void socket_enter_running(Socket *s, int cfd_in) {
if (r > 0 && p->n_ref > s->max_connections_per_source) {
_cleanup_free_ char *t = NULL;
- (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t);
+ if (p->peer.sa.sa_family == AF_UNIX)
+ (void) asprintf(&t, "UID " UID_FMT, p->peer_cred.uid);
+ else
+ (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, /* translate_ipv6= */ true, /* include_port= */ false, &t);
log_unit_warning(UNIT(s),
"Too many incoming connections (%u) from source %s, dropping connection.",
@@ -2357,18 +2363,15 @@ static void socket_enter_running(Socket *s, int cfd_in) {
}
r = socket_load_service_unit(s, cfd, &service);
- if (r < 0) {
- if (ERRNO_IS_DISCONNECT(r))
- return;
-
- log_unit_warning_errno(UNIT(s), r, "Failed to load connection service unit: %m");
+ if (ERRNO_IS_NEG_DISCONNECT(r))
+ return;
+ if (r < 0 || UNIT_IS_LOAD_ERROR(service->load_state)) {
+ log_unit_warning_errno(UNIT(s), r < 0 ? r : service->load_error,
+ "Failed to load connection service unit: %m");
goto fail;
}
-
- r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service,
- false, UNIT_DEPENDENCY_IMPLICIT);
- if (r < 0) {
- log_unit_warning_errno(UNIT(s), r, "Failed to add Before=/Triggers= dependencies on connection unit: %m");
+ if (service->load_state == UNIT_MASKED) {
+ log_unit_warning(UNIT(s), "Connection service unit is masked, refusing.");
goto fail;
}
@@ -2383,7 +2386,10 @@ static void socket_enter_running(Socket *s, int cfd_in) {
goto fail;
}
- TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */
+ /* We passed ownership of the fd and socket peer to the service now. */
+ TAKE_FD(cfd);
+ TAKE_PTR(p);
+
s->n_connections++;
r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL);
@@ -2405,13 +2411,9 @@ refuse:
return;
queue_error:
- if (ERRNO_IS_RESOURCE(r))
- log_unit_warning(UNIT(s), "Failed to queue service startup job: %s",
- bus_error_message(&error, r));
- else
- log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s",
- cfd >= 0 ? "template" : "non-template",
- bus_error_message(&error, r));
+ log_unit_warning_errno(UNIT(s), r, "Failed to queue service startup job%s: %s",
+ cfd >= 0 && !ERRNO_IS_RESOURCE(r) ? " (Maybe the service is missing or is a template unit?)" : "",
+ bus_error_message(&error, r));
fail:
socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
@@ -2444,11 +2446,9 @@ static void socket_run_next(Socket *s) {
}
static int socket_start(Unit *u) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(s);
-
/* We cannot fulfill this request right now, try again later
* please! */
if (IN_SET(s->state,
@@ -2496,16 +2496,15 @@ static int socket_start(Unit *u) {
s->result = SOCKET_SUCCESS;
exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
- u->reset_accounting = true;
+ if (s->cgroup_runtime)
+ s->cgroup_runtime->reset_accounting = true;
socket_enter_start_pre(s);
return 1;
}
static int socket_stop(Unit *u) {
- Socket *s = SOCKET(u);
-
- assert(s);
+ Socket *s = ASSERT_PTR(SOCKET(u));
/* Already on it */
if (IN_SET(s->state,
@@ -2540,10 +2539,9 @@ static int socket_stop(Unit *u) {
}
static int socket_serialize(Unit *u, FILE *f, FDSet *fds) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(u);
assert(f);
assert(fds);
@@ -2595,10 +2593,9 @@ static int socket_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(u);
assert(key);
assert(value);
@@ -2836,9 +2833,7 @@ static int socket_deserialize_item(Unit *u, const char *key, const char *value,
}
static void socket_distribute_fds(Unit *u, FDSet *fds) {
- Socket *s = SOCKET(u);
-
- assert(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
LIST_FOREACH(port, p, s->ports) {
int fd;
@@ -2860,15 +2855,15 @@ static void socket_distribute_fds(Unit *u, FDSet *fds) {
}
static UnitActiveState socket_active_state(Unit *u) {
- assert(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
- return state_translation_table[SOCKET(u)->state];
+ return state_translation_table[s->state];
}
static const char *socket_sub_state_to_string(Unit *u) {
- assert(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
- return socket_state_to_string(SOCKET(u)->state);
+ return socket_state_to_string(s->state);
}
int socket_port_to_address(const SocketPort *p, char **ret) {
@@ -2906,7 +2901,6 @@ int socket_port_to_address(const SocketPort *p, char **ret) {
}
const char* socket_port_type_to_string(SocketPort *p) {
-
assert(p);
switch (p->type) {
@@ -2968,9 +2962,7 @@ SocketType socket_port_type_from_string(const char *s) {
}
static bool socket_may_gc(Unit *u) {
- Socket *s = SOCKET(u);
-
- assert(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
return s->n_connections == 0;
}
@@ -3108,10 +3100,9 @@ fail:
}
static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
SocketResult f;
- assert(s);
assert(pid >= 0);
if (pid != s->control_pid.pid)
@@ -3215,9 +3206,8 @@ static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
- Socket *s = SOCKET(userdata);
+ Socket *s = ASSERT_PTR(SOCKET(userdata));
- assert(s);
assert(s->timer_event_source == source);
switch (s->state) {
@@ -3289,12 +3279,11 @@ static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *use
return 0;
}
-int socket_collect_fds(Socket *s, int **fds) {
- size_t k = 0, n = 0;
- int *rfds;
+int socket_collect_fds(Socket *s, int **ret) {
+ size_t n = 0, k = 0;
assert(s);
- assert(fds);
+ assert(ret);
/* Called from the service code for requesting our fds */
@@ -3304,25 +3293,25 @@ int socket_collect_fds(Socket *s, int **fds) {
n += p->n_auxiliary_fds;
}
- if (n <= 0) {
- *fds = NULL;
+ if (n == 0) {
+ *ret = NULL;
return 0;
}
- rfds = new(int, n);
- if (!rfds)
+ int *fds = new(int, n);
+ if (!fds)
return -ENOMEM;
LIST_FOREACH(port, p, s->ports) {
if (p->fd >= 0)
- rfds[k++] = p->fd;
- for (size_t i = 0; i < p->n_auxiliary_fds; ++i)
- rfds[k++] = p->auxiliary_fds[i];
+ fds[k++] = p->fd;
+ FOREACH_ARRAY(i, p->auxiliary_fds, p->n_auxiliary_fds)
+ fds[k++] = *i;
}
assert(k == n);
- *fds = rfds;
+ *ret = fds;
return (int) n;
}
@@ -3353,9 +3342,8 @@ void socket_connection_unref(Socket *s) {
}
static void socket_trigger_notify(Unit *u, Unit *other) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
- assert(u);
assert(other);
/* Filter out invocations with bogus state */
@@ -3390,8 +3378,24 @@ static void socket_trigger_notify(Unit *u, Unit *other) {
socket_set_state(s, SOCKET_RUNNING);
}
+static void socket_handoff_timestamp(
+ Unit *u,
+ const struct ucred *ucred,
+ const dual_timestamp *ts) {
+
+ Socket *s = ASSERT_PTR(SOCKET(u));
+
+ assert(ucred);
+ assert(ts);
+
+ if (s->control_pid.pid == ucred->pid && s->control_command) {
+ exec_status_handoff(&s->control_command->exec_status, ucred, ts);
+ unit_add_to_dbus_queue(u);
+ }
+}
+
static int socket_get_timeout(Unit *u, usec_t *timeout) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
usec_t t;
int r;
@@ -3423,11 +3427,10 @@ static PidRef *socket_control_pid(Unit *u) {
}
static int socket_clean(Unit *u, ExecCleanMask mask) {
+ Socket *s = ASSERT_PTR(SOCKET(u));
_cleanup_strv_free_ char **l = NULL;
- Socket *s = SOCKET(u);
int r;
- assert(s);
assert(mask != 0);
if (s->state != SOCKET_DEAD)
@@ -3467,19 +3470,15 @@ fail:
}
static int socket_can_clean(Unit *u, ExecCleanMask *ret) {
- Socket *s = SOCKET(u);
-
- assert(s);
+ Socket *s = ASSERT_PTR(SOCKET(u));
return exec_context_get_clean_mask(&s->exec_context, ret);
}
static int socket_can_start(Unit *u) {
- Socket *s = SOCKET(u);
+ Socket *s = ASSERT_PTR(SOCKET(u));
int r;
- assert(s);
-
r = unit_test_start_limit(u);
if (r < 0) {
socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT);
@@ -3494,7 +3493,7 @@ static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = {
[SOCKET_EXEC_START_CHOWN] = "ExecStartChown",
[SOCKET_EXEC_START_POST] = "ExecStartPost",
[SOCKET_EXEC_STOP_PRE] = "ExecStopPre",
- [SOCKET_EXEC_STOP_POST] = "ExecStopPost"
+ [SOCKET_EXEC_STOP_POST] = "ExecStopPost",
};
DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand);
@@ -3508,7 +3507,7 @@ static const char* const socket_result_table[_SOCKET_RESULT_MAX] = {
[SOCKET_FAILURE_CORE_DUMP] = "core-dump",
[SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
[SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit",
- [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit"
+ [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit",
};
DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult);
@@ -3552,6 +3551,7 @@ const UnitVTable socket_vtable = {
.cgroup_context_offset = offsetof(Socket, cgroup_context),
.kill_context_offset = offsetof(Socket, kill_context),
.exec_runtime_offset = offsetof(Socket, exec_runtime),
+ .cgroup_runtime_offset = offsetof(Socket, cgroup_runtime),
.sections =
"Unit\0"
@@ -3596,6 +3596,8 @@ const UnitVTable socket_vtable = {
.reset_failed = socket_reset_failed,
+ .notify_handoff_timestamp = socket_handoff_timestamp,
+
.control_pid = socket_control_pid,
.bus_set_property = bus_socket_set_property,
diff --git a/src/core/socket.h b/src/core/socket.h
index 0983e8c..5e3929c 100644
--- a/src/core/socket.h
+++ b/src/core/socket.h
@@ -92,6 +92,7 @@ struct Socket {
CGroupContext cgroup_context;
ExecRuntime *exec_runtime;
+ CGroupRuntime *cgroup_runtime;
/* For Accept=no sockets refers to the one service we'll
* activate. For Accept=yes sockets is either NULL, or filled
@@ -128,6 +129,7 @@ struct Socket {
bool transparent;
bool broadcast;
bool pass_cred;
+ bool pass_fds_to_exec;
bool pass_sec;
bool pass_pktinfo;
SocketTimestamping timestamping;
@@ -170,7 +172,7 @@ int socket_acquire_peer(Socket *s, int fd, SocketPeer **p);
DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref);
/* Called from the service code when collecting fds */
-int socket_collect_fds(Socket *s, int **fds);
+int socket_collect_fds(Socket *s, int **ret);
/* Called from the service code when a per-connection service ended */
void socket_connection_unref(Socket *s);
diff --git a/src/core/swap.c b/src/core/swap.c
index 682c2b9..c4d2ba8 100644
--- a/src/core/swap.c
+++ b/src/core/swap.c
@@ -30,15 +30,15 @@
#include "virt.h"
static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = {
- [SWAP_DEAD] = UNIT_INACTIVE,
- [SWAP_ACTIVATING] = UNIT_ACTIVATING,
- [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE,
- [SWAP_ACTIVE] = UNIT_ACTIVE,
- [SWAP_DEACTIVATING] = UNIT_DEACTIVATING,
+ [SWAP_DEAD] = UNIT_INACTIVE,
+ [SWAP_ACTIVATING] = UNIT_ACTIVATING,
+ [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE,
+ [SWAP_ACTIVE] = UNIT_ACTIVE,
+ [SWAP_DEACTIVATING] = UNIT_DEACTIVATING,
[SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING,
[SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING,
- [SWAP_FAILED] = UNIT_FAILED,
- [SWAP_CLEANING] = UNIT_MAINTENANCE,
+ [SWAP_FAILED] = UNIT_FAILED,
+ [SWAP_CLEANING] = UNIT_MAINTENANCE,
};
static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
@@ -68,9 +68,7 @@ static const char *swap_sub_state_to_string(Unit *u) {
}
static bool swap_may_gc(Unit *u) {
- Swap *s = SWAP(u);
-
- assert(s);
+ Swap *s = ASSERT_PTR(SWAP(u));
if (s->from_proc_swaps)
return false;
@@ -134,10 +132,9 @@ static int swap_set_devnode(Swap *s, const char *devnode) {
}
static void swap_init(Unit *u) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
- assert(s);
- assert(UNIT(s)->load_state == UNIT_STUB);
+ assert(u->load_state == UNIT_STUB);
s->timeout_usec = u->manager->defaults.timeout_start_usec;
@@ -152,18 +149,11 @@ static void swap_init(Unit *u) {
static void swap_unwatch_control_pid(Swap *s) {
assert(s);
-
- if (!pidref_is_set(&s->control_pid))
- return;
-
- unit_unwatch_pidref(UNIT(s), &s->control_pid);
- pidref_done(&s->control_pid);
+ unit_unwatch_pidref_done(UNIT(s), &s->control_pid);
}
static void swap_done(Unit *u) {
- Swap *s = SWAP(u);
-
- assert(s);
+ Swap *s = ASSERT_PTR(SWAP(u));
swap_unset_proc_swaps(s);
swap_set_devnode(s, NULL);
@@ -173,6 +163,7 @@ static void swap_done(Unit *u) {
s->parameters_fragment.options = mfree(s->parameters_fragment.options);
s->exec_runtime = exec_runtime_free(s->exec_runtime);
+
exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
s->control_command = NULL;
@@ -255,6 +246,7 @@ static int swap_verify(Swap *s) {
_cleanup_free_ char *e = NULL;
int r;
+ assert(s);
assert(UNIT(s)->load_state == UNIT_LOADED);
r = unit_name_from_path(s->what, ".swap", &e);
@@ -321,7 +313,7 @@ static int swap_add_extras(Swap *s) {
return r;
}
- r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT);
+ r = unit_add_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
@@ -353,25 +345,22 @@ static int swap_add_extras(Swap *s) {
}
static int swap_load(Unit *u) {
- Swap *s = SWAP(u);
- int r, q = 0;
+ Swap *s = ASSERT_PTR(SWAP(u));
+ int r;
- assert(s);
assert(u->load_state == UNIT_STUB);
/* Load a .swap file */
- bool fragment_optional = s->from_proc_swaps;
- r = unit_load_fragment_and_dropin(u, !fragment_optional);
+ r = unit_load_fragment_and_dropin(u, /* fragment_required = */ !s->from_proc_swaps);
/* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is
* already active. */
if (u->load_state == UNIT_LOADED || s->from_proc_swaps)
- q = swap_add_extras(s);
+ RET_GATHER(r, swap_add_extras(s));
if (r < 0)
return r;
- if (q < 0)
- return q;
+
if (u->load_state != UNIT_LOADED)
return 0;
@@ -385,11 +374,11 @@ static int swap_setup_unit(
int priority,
bool set_flags) {
+ _cleanup_(unit_freep) Unit *new_unit = NULL;
_cleanup_free_ char *e = NULL;
- bool delete = false;
- Unit *u = NULL;
+ Unit *u;
+ Swap *s;
int r;
- SwapParameters *p;
assert(m);
assert(what);
@@ -397,70 +386,61 @@ static int swap_setup_unit(
r = unit_name_from_path(what, ".swap", &e);
if (r < 0)
- return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m");
+ return log_error_errno(r, "Failed to generate unit name from path: %m");
u = manager_get_unit(m, e);
- if (u &&
- SWAP(u)->from_proc_swaps &&
- !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps))
- return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
- "Swap %s appeared twice with different device paths %s and %s",
- e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps);
-
- if (!u) {
- delete = true;
+ if (u) {
+ s = ASSERT_PTR(SWAP(u));
+
+ if (s->from_proc_swaps &&
+ !path_equal(s->parameters_proc_swaps.what, what_proc_swaps))
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST),
+ "Swap appeared twice with different device paths %s and %s, refusing.",
+ s->parameters_proc_swaps.what, what_proc_swaps);
+ } else {
+ r = unit_new_for_name(m, sizeof(Swap), e, &new_unit);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to load swap unit '%s': %m", e);
- r = unit_new_for_name(m, sizeof(Swap), e, &u);
- if (r < 0) {
- log_unit_warning_errno(u, r, "Failed to load swap unit: %m");
- goto fail;
- }
+ u = new_unit;
+ s = ASSERT_PTR(SWAP(u));
- SWAP(u)->what = strdup(what);
- if (!SWAP(u)->what) {
- r = log_oom();
- goto fail;
- }
+ s->what = strdup(what);
+ if (!s->what)
+ return log_oom();
unit_add_to_load_queue(u);
- } else
- delete = false;
+ }
- p = &SWAP(u)->parameters_proc_swaps;
+ SwapParameters *p = &s->parameters_proc_swaps;
if (!p->what) {
p->what = strdup(what_proc_swaps);
- if (!p->what) {
- r = log_oom();
- goto fail;
- }
+ if (!p->what)
+ return log_oom();
}
- /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be
- * loaded. After all we can load it now, from the data in /proc/swaps. */
- if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+ /* The unit is definitely around now, mark it as loaded if it was previously referenced but
+ * could not be loaded. After all we can load it now, from the data in /proc/swaps. */
+ if (UNIT_IS_LOAD_ERROR(u->load_state)) {
u->load_state = UNIT_LOADED;
u->load_error = 0;
}
if (set_flags) {
- SWAP(u)->is_active = true;
- SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps;
+ s->is_active = true;
+ s->just_activated = !s->from_proc_swaps;
}
- SWAP(u)->from_proc_swaps = true;
+ s->from_proc_swaps = true;
p->priority = priority;
p->priority_set = true;
unit_add_to_dbus_queue(u);
- return 0;
+ TAKE_PTR(new_unit);
-fail:
- if (delete)
- unit_free(u);
-
- return r;
+ return 0;
}
static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) {
@@ -541,11 +521,10 @@ static void swap_set_state(Swap *s, SwapState state) {
}
static int swap_coldplug(Unit *u) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
SwapState new_state = SWAP_DEAD;
int r;
- assert(s);
assert(s->state == SWAP_DEAD);
if (s->deserialized_state != s->state)
@@ -569,20 +548,25 @@ static int swap_coldplug(Unit *u) {
return r;
}
- if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED))
+ if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) {
(void) unit_setup_exec_runtime(u);
+ (void) unit_setup_cgroup_runtime(u);
+ }
swap_set_state(s, new_state);
return 0;
}
static void swap_dump(Unit *u, FILE *f, const char *prefix) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
SwapParameters *p;
+ const char *prefix2;
- assert(s);
assert(f);
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
if (s->from_proc_swaps)
p = &s->parameters_proc_swaps;
else if (s->from_fragment)
@@ -628,14 +612,23 @@ static void swap_dump(Unit *u, FILE *f, const char *prefix) {
exec_context_dump(&s->exec_context, f, prefix);
kill_context_dump(&s->kill_context, f, prefix);
cgroup_context_dump(UNIT(s), f, prefix);
+
+ for (SwapExecCommand c = 0; c < _SWAP_EXEC_COMMAND_MAX; c++) {
+ if (!s->exec_command[c].argv)
+ continue;
+
+ fprintf(f, "%s%s %s:\n",
+ prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), swap_exec_command_to_string(c));
+
+ exec_command_dump(s->exec_command + c, f, prefix2);
+ }
+
}
static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) {
-
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
- pid_t pid;
int r;
assert(s);
@@ -660,11 +653,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) {
&exec_params,
s->exec_runtime,
&s->cgroup_context,
- &pid);
- if (r < 0)
- return r;
-
- r = pidref_set_pid(&pidref, pid);
+ &pidref);
if (r < 0)
return r;
@@ -734,13 +723,7 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
if (s->result == SWAP_SUCCESS)
s->result = f;
- r = unit_kill_context(
- UNIT(s),
- &s->kill_context,
- state_to_kill_operation(s, state),
- /* main_pid= */ NULL,
- &s->control_pid,
- /* main_pid_alien= */ false);
+ r = unit_kill_context(UNIT(s), state_to_kill_operation(s, state));
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
goto fail;
@@ -870,7 +853,9 @@ static void swap_cycle_clear(Swap *s) {
s->result = SWAP_SUCCESS;
exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
- UNIT(s)->reset_accounting = true;
+
+ if (s->cgroup_runtime)
+ s->cgroup_runtime->reset_accounting = true;
}
static int swap_start(Unit *u) {
@@ -913,9 +898,7 @@ static int swap_start(Unit *u) {
}
static int swap_stop(Unit *u) {
- Swap *s = SWAP(u);
-
- assert(s);
+ Swap *s = ASSERT_PTR(SWAP(u));
switch (s->state) {
@@ -949,9 +932,8 @@ static int swap_stop(Unit *u) {
}
static int swap_serialize(Unit *u, FILE *f, FDSet *fds) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
- assert(s);
assert(f);
assert(fds);
@@ -966,9 +948,8 @@ static int swap_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
- assert(s);
assert(fds);
if (streq(key, "state")) {
@@ -1009,10 +990,9 @@ static int swap_deserialize_item(Unit *u, const char *key, const char *value, FD
}
static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
SwapResult f;
- assert(s);
assert(pid >= 0);
if (pid != s->control_pid.pid)
@@ -1086,9 +1066,8 @@ static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) {
}
static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
- Swap *s = SWAP(userdata);
+ Swap *s = ASSERT_PTR(SWAP(userdata));
- assert(s);
assert(s->timer_event_source == source);
switch (s->state) {
@@ -1261,12 +1240,10 @@ static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, v
return swap_process_proc_swaps(m);
}
-static Unit *swap_following(Unit *u) {
- Swap *s = SWAP(u);
+static Unit* swap_following(Unit *u) {
+ Swap *s = ASSERT_PTR(SWAP(u));
Swap *first = NULL;
- assert(s);
-
/* If the user configured the swap through /etc/fstab or
* a device unit, follow that. */
@@ -1298,16 +1275,15 @@ static Unit *swap_following(Unit *u) {
return UNIT(first);
}
-static int swap_following_set(Unit *u, Set **_set) {
- Swap *s = SWAP(u);
+static int swap_following_set(Unit *u, Set **ret) {
+ Swap *s = ASSERT_PTR(SWAP(u));
_cleanup_set_free_ Set *set = NULL;
int r;
- assert(s);
- assert(_set);
+ assert(ret);
if (LIST_JUST_US(same_devnode, s)) {
- *_set = NULL;
+ *ret = NULL;
return 0;
}
@@ -1321,7 +1297,7 @@ static int swap_following_set(Unit *u, Set **_set) {
return r;
}
- *_set = TAKE_PTR(set);
+ *ret = TAKE_PTR(set);
return 1;
}
@@ -1358,7 +1334,7 @@ static void swap_enumerate(Manager *m) {
/* Dispatch this before we dispatch SIGCHLD, so that
* we always get the events from /proc/swaps before
* the SIGCHLD of /sbin/swapon. */
- r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+ r = sd_event_source_set_priority(m->swap_event_source, EVENT_PRIORITY_SWAP_TABLE);
if (r < 0) {
log_error_errno(r, "Failed to change /proc/swaps priority: %m");
goto fail;
@@ -1422,28 +1398,22 @@ int swap_process_device_new(Manager *m, sd_device *dev) {
int swap_process_device_remove(Manager *m, sd_device *dev) {
const char *dn;
- int r;
Swap *s;
+ int r;
r = sd_device_get_devname(dev, &dn);
if (r < 0)
return 0;
- while ((s = hashmap_get(m->swaps_by_devnode, dn))) {
- int q;
-
- q = swap_set_devnode(s, NULL);
- if (q < 0)
- r = q;
- }
+ r = 0;
+ while ((s = hashmap_get(m->swaps_by_devnode, dn)))
+ RET_GATHER(r, swap_set_devnode(s, NULL));
return r;
}
static void swap_reset_failed(Unit *u) {
- Swap *s = SWAP(u);
-
- assert(s);
+ Swap *s = ASSERT_PTR(SWAP(u));
if (s->state == SWAP_FAILED)
swap_set_state(s, SWAP_DEAD);
@@ -1452,14 +1422,27 @@ static void swap_reset_failed(Unit *u) {
s->clean_result = SWAP_SUCCESS;
}
+static void swap_handoff_timestamp(Unit *u, const struct ucred *ucred, const dual_timestamp *ts) {
+        Swap *swap = ASSERT_PTR(SWAP(u));
+
+        assert(ucred);
+        assert(ts);
+
+        /* Handoff timestamp hook for swap units: do nothing unless ucred->pid is the PID of our
+         * control process and a control command is currently set. */
+        if (swap->control_pid.pid != ucred->pid || !swap->control_command)
+                return;
+
+        /* Pass the timestamp on to the control command's exec status, then add the unit to the D-Bus queue. */
+        exec_status_handoff(&swap->control_command->exec_status, ucred, ts);
+        unit_add_to_dbus_queue(u);
+}
+
static int swap_get_timeout(Unit *u, usec_t *timeout) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
usec_t t;
int r;
- assert(s);
- assert(u);
-
if (!s->timer_event_source)
return 0;
@@ -1493,11 +1476,10 @@ static PidRef* swap_control_pid(Unit *u) {
}
static int swap_clean(Unit *u, ExecCleanMask mask) {
+ Swap *s = ASSERT_PTR(SWAP(u));
_cleanup_strv_free_ char **l = NULL;
- Swap *s = SWAP(u);
int r;
- assert(s);
assert(mask != 0);
if (s->state != SWAP_DEAD)
@@ -1537,19 +1519,15 @@ fail:
}
static int swap_can_clean(Unit *u, ExecCleanMask *ret) {
- Swap *s = SWAP(u);
-
- assert(s);
+ Swap *s = ASSERT_PTR(SWAP(u));
return exec_context_get_clean_mask(&s->exec_context, ret);
}
static int swap_can_start(Unit *u) {
- Swap *s = SWAP(u);
+ Swap *s = ASSERT_PTR(SWAP(u));
int r;
- assert(s);
-
r = unit_test_start_limit(u);
if (r < 0) {
swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT);
@@ -1605,6 +1583,7 @@ const UnitVTable swap_vtable = {
.cgroup_context_offset = offsetof(Swap, cgroup_context),
.kill_context_offset = offsetof(Swap, kill_context),
.exec_runtime_offset = offsetof(Swap, exec_runtime),
+ .cgroup_runtime_offset = offsetof(Swap, cgroup_runtime),
.sections =
"Unit\0"
@@ -1645,6 +1624,8 @@ const UnitVTable swap_vtable = {
.reset_failed = swap_reset_failed,
+ .notify_handoff_timestamp = swap_handoff_timestamp,
+
.control_pid = swap_control_pid,
.bus_set_property = bus_swap_set_property,
diff --git a/src/core/swap.h b/src/core/swap.h
index ef20f0f..d9bbd37 100644
--- a/src/core/swap.h
+++ b/src/core/swap.h
@@ -70,6 +70,7 @@ struct Swap {
CGroupContext cgroup_context;
ExecRuntime *exec_runtime;
+ CGroupRuntime *cgroup_runtime;
SwapState state, deserialized_state;
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
index 05eb681..1c08aa4 100644
--- a/src/core/system.conf.in
+++ b/src/core/system.conf.in
@@ -26,7 +26,7 @@
#ShowStatus=yes
#CrashChangeVT=no
#CrashShell=no
-#CrashReboot=no
+#CrashAction=freeze
#CtrlAltDelBurstAction=reboot-force
#CPUAffinity=
#NUMAPolicy=default
@@ -39,6 +39,7 @@
#WatchdogDevice=
#CapabilityBoundingSet=
#NoNewPrivileges=no
+#ProtectSystem=auto
#SystemCallArchitectures=
#TimerSlackNSec=
#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
diff --git a/src/core/taint.c b/src/core/taint.c
new file mode 100644
index 0000000..969b37f
--- /dev/null
+++ b/src/core/taint.c
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/utsname.h>
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "clock-util.h"
+#include "errno-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "log.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "strv.h"
+#include "taint.h"
+#include "uid-range.h"
+
+static int short_uid_gid_range(UIDRangeUsernsMode mode) {
+        _cleanup_(uid_range_freep) UIDRange *range = NULL;
+        int r;
+
+        /* Taint check: true if the UID/GID range assigned to this environment doesn't at least cover
+         * 0…65534, i.e. from root to nobody; false if it does or if loading the map is unsupported. */
+
+        r = uid_range_load_userns(/* path= */ NULL, mode, &range);
+        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return false;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load uid_map or gid_map: %m");
+
+        return !uid_range_covers(range, 0, 65535);
+}
+
+char* taint_string(void) {
+        const char *stage[12] = {}; /* 11 possible flags below, plus one slot for the NULL terminator */
+        size_t n = 0;
+
+        /* Returns a "taint string", e.g. "local-hwclock:var-run-bad": the flags detected below, joined
+         * with ":" by strv_join(). Only things that are detected at runtime should be tagged here. For
+         * stuff that is known during compilation, emit a warning in the configuration phase. */
+
+        _cleanup_free_ char *bin = NULL, *usr_sbin = NULL, *var_run = NULL;
+
+        if (readlink_malloc("/bin", &bin) < 0 || !PATH_IN_SET(bin, "usr/bin", "/usr/bin"))
+                stage[n++] = "unmerged-usr";
+
+        /* Note that the check is different from default_PATH(), as we want to taint on uncanonical symlinks
+         * too. */
+        if (readlink_malloc("/usr/sbin", &usr_sbin) < 0 || !PATH_IN_SET(usr_sbin, "bin", "/usr/bin"))
+                stage[n++] = "unmerged-bin";
+
+        if (readlink_malloc("/var/run", &var_run) < 0 || !PATH_IN_SET(var_run, "../run", "/run"))
+                stage[n++] = "var-run-bad";
+
+        if (cg_all_unified() == 0)
+                stage[n++] = "cgroupsv1";
+
+        if (clock_is_localtime(NULL) > 0)
+                stage[n++] = "local-hwclock";
+
+        if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0)
+                stage[n++] = "support-ended";
+
+        struct utsname uts;
+        assert_se(uname(&uts) >= 0);
+        if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+                stage[n++] = "old-kernel";
+
+        _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL;
+        if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 &&
+            !streq(overflowuid, "65534"))
+                stage[n++] = "overflowuid-not-65534";
+        if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 &&
+            !streq(overflowgid, "65534"))
+                stage[n++] = "overflowgid-not-65534";
+
+        if (short_uid_gid_range(UID_RANGE_USERNS_INSIDE) > 0)
+                stage[n++] = "short-uid-range";
+        if (short_uid_gid_range(GID_RANGE_USERNS_INSIDE) > 0)
+                stage[n++] = "short-gid-range";
+
+        assert(n < ELEMENTSOF(stage)); /* stage[n] must stay NULL as terminator; n == 11 (all flags) is valid */
+
+        return strv_join((char**) stage, ":");
+}
diff --git a/src/core/taint.h b/src/core/taint.h
new file mode 100644
index 0000000..2e514e3
--- /dev/null
+++ b/src/core/taint.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+char* taint_string(void); /* ":"-joined list of taint flags detected at runtime, e.g. "local-hwclock:var-run-bad" */
diff --git a/src/core/target.c b/src/core/target.c
index 8f2a331..15866e9 100644
--- a/src/core/target.c
+++ b/src/core/target.c
@@ -11,12 +11,13 @@
#include "unit.h"
static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = {
- [TARGET_DEAD] = UNIT_INACTIVE,
- [TARGET_ACTIVE] = UNIT_ACTIVE
+ [TARGET_DEAD] = UNIT_INACTIVE,
+ [TARGET_ACTIVE] = UNIT_ACTIVE,
};
static void target_set_state(Target *t, TargetState state) {
TargetState old_state;
+
assert(t);
if (t->state != state)
@@ -26,10 +27,8 @@ static void target_set_state(Target *t, TargetState state) {
t->state = state;
if (state != old_state)
- log_debug("%s changed %s -> %s",
- UNIT(t)->id,
- target_state_to_string(old_state),
- target_state_to_string(state));
+ log_unit_debug(UNIT(t), "Changed %s -> %s",
+ target_state_to_string(old_state), target_state_to_string(state));
unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
}
@@ -56,8 +55,8 @@ static int target_add_default_dependencies(Target *t) {
if (n_others < 0)
return n_others;
- for (int i = 0; i < n_others; i++) {
- r = unit_add_default_target_dependency(others[i], UNIT(t));
+ FOREACH_ARRAY(i, others, n_others) {
+ r = unit_add_default_target_dependency(*i, UNIT(t));
if (r < 0)
return r;
}
@@ -70,11 +69,9 @@ static int target_add_default_dependencies(Target *t) {
}
static int target_load(Unit *u) {
- Target *t = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
int r;
- assert(t);
-
r = unit_load_fragment_and_dropin(u, true);
if (r < 0)
return r;
@@ -87,9 +84,8 @@ static int target_load(Unit *u) {
}
static int target_coldplug(Unit *u) {
- Target *t = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- assert(t);
assert(t->state == TARGET_DEAD);
if (t->deserialized_state != t->state)
@@ -99,10 +95,10 @@ static int target_coldplug(Unit *u) {
}
static void target_dump(Unit *u, FILE *f, const char *prefix) {
- Target *t = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- assert(t);
assert(f);
+ assert(prefix);
fprintf(f,
"%sTarget State: %s\n",
@@ -110,10 +106,9 @@ static void target_dump(Unit *u, FILE *f, const char *prefix) {
}
static int target_start(Unit *u) {
- Target *t = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
int r;
- assert(t);
assert(t->state == TARGET_DEAD);
r = unit_acquire_invocation_id(u);
@@ -125,9 +120,8 @@ static int target_start(Unit *u) {
}
static int target_stop(Unit *u) {
- Target *t = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- assert(t);
assert(t->state == TARGET_ACTIVE);
target_set_state(t, TARGET_DEAD);
@@ -135,21 +129,18 @@ static int target_stop(Unit *u) {
}
static int target_serialize(Unit *u, FILE *f, FDSet *fds) {
- Target *s = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- assert(s);
assert(f);
assert(fds);
- (void) serialize_item(f, "state", target_state_to_string(s->state));
+ (void) serialize_item(f, "state", target_state_to_string(t->state));
return 0;
}
static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Target *s = TARGET(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- assert(s);
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -159,26 +150,26 @@ static int target_deserialize_item(Unit *u, const char *key, const char *value,
state = target_state_from_string(value);
if (state < 0)
- log_debug("Failed to parse state value %s", value);
+ log_unit_debug(u, "Failed to parse state: %s", value);
else
- s->deserialized_state = state;
+ t->deserialized_state = state;
} else
- log_debug("Unknown serialization key '%s'", key);
+ log_unit_debug(u, "Unknown serialization key: %s", key);
return 0;
}
static UnitActiveState target_active_state(Unit *u) {
- assert(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- return state_translation_table[TARGET(u)->state];
+ return state_translation_table[t->state];
}
static const char *target_sub_state_to_string(Unit *u) {
- assert(u);
+ Target *t = ASSERT_PTR(TARGET(u));
- return target_state_to_string(TARGET(u)->state);
+ return target_state_to_string(t->state);
}
const UnitVTable target_vtable = {
@@ -213,4 +204,6 @@ const UnitVTable target_vtable = {
[JOB_DONE] = "Stopped target %s.",
},
},
+
+ .notify_supervisor = true,
};
diff --git a/src/core/timer.c b/src/core/timer.c
index 3c41a25..d7ce473 100644
--- a/src/core/timer.c
+++ b/src/core/timer.c
@@ -25,19 +25,18 @@
#include "virt.h"
static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = {
- [TIMER_DEAD] = UNIT_INACTIVE,
+ [TIMER_DEAD] = UNIT_INACTIVE,
[TIMER_WAITING] = UNIT_ACTIVE,
[TIMER_RUNNING] = UNIT_ACTIVE,
[TIMER_ELAPSED] = UNIT_ACTIVE,
- [TIMER_FAILED] = UNIT_FAILED
+ [TIMER_FAILED] = UNIT_FAILED,
};
static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata);
static void timer_init(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(u);
assert(u->load_state == UNIT_STUB);
t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
@@ -58,9 +57,7 @@ void timer_free_values(Timer *t) {
}
static void timer_done(Unit *u) {
- Timer *t = TIMER(u);
-
- assert(t);
+ Timer *t = ASSERT_PTR(TIMER(u));
timer_free_values(t);
@@ -141,7 +138,7 @@ static int timer_setup_persistent(Timer *t) {
if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
- r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
@@ -192,19 +189,18 @@ static uint64_t timer_get_fixed_delay_hash(Timer *t) {
}
siphash24_init(&state, hash_key);
- siphash24_compress(&machine_id, sizeof(sd_id128_t), &state);
+ siphash24_compress_typesafe(machine_id, &state);
siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state);
- siphash24_compress(&uid, sizeof(uid_t), &state);
+ siphash24_compress_typesafe(uid, &state);
siphash24_compress_string(UNIT(t)->id, &state);
return siphash24_finalize(&state);
}
static int timer_load(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
int r;
- assert(u);
assert(u->load_state == UNIT_STUB);
r = unit_load_fragment_and_dropin(u, true);
@@ -231,9 +227,12 @@ static int timer_load(Unit *u) {
}
static void timer_dump(Unit *u, FILE *f, const char *prefix) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
Unit *trigger;
+ assert(f);
+ assert(prefix);
+
trigger = UNIT_TRIGGER(u);
fprintf(f,
@@ -279,6 +278,7 @@ static void timer_dump(Unit *u, FILE *f, const char *prefix) {
static void timer_set_state(Timer *t, TimerState state) {
TimerState old_state;
+
assert(t);
if (t->state != state)
@@ -303,9 +303,8 @@ static void timer_set_state(Timer *t, TimerState state) {
static void timer_enter_waiting(Timer *t, bool time_change);
static int timer_coldplug(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(t);
assert(t->state == TIMER_DEAD);
if (t->deserialized_state == t->state)
@@ -634,10 +633,9 @@ fail:
}
static int timer_start(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
int r;
- assert(t);
assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED));
r = unit_test_trigger_loaded(u);
@@ -682,9 +680,8 @@ static int timer_start(Unit *u) {
}
static int timer_stop(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(t);
assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED));
timer_enter_dead(t, TIMER_SUCCESS);
@@ -692,9 +689,8 @@ static int timer_stop(Unit *u) {
}
static int timer_serialize(Unit *u, FILE *f, FDSet *fds) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(u);
assert(f);
assert(fds);
@@ -711,9 +707,8 @@ static int timer_serialize(Unit *u, FILE *f, FDSet *fds) {
}
static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(u);
assert(key);
assert(value);
assert(fds);
@@ -747,21 +742,19 @@ static int timer_deserialize_item(Unit *u, const char *key, const char *value, F
}
static UnitActiveState timer_active_state(Unit *u) {
- assert(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- return state_translation_table[TIMER(u)->state];
+ return state_translation_table[t->state];
}
static const char *timer_sub_state_to_string(Unit *u) {
- assert(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- return timer_state_to_string(TIMER(u)->state);
+ return timer_state_to_string(t->state);
}
static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) {
- Timer *t = TIMER(userdata);
-
- assert(t);
+ Timer *t = ASSERT_PTR(TIMER(userdata));
if (t->state != TIMER_WAITING)
return 0;
@@ -772,9 +765,8 @@ static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) {
}
static void timer_trigger_notify(Unit *u, Unit *other) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(u);
assert(other);
/* Filter out invocations with bogus state */
@@ -812,9 +804,7 @@ static void timer_trigger_notify(Unit *u, Unit *other) {
}
static void timer_reset_failed(Unit *u) {
- Timer *t = TIMER(u);
-
- assert(t);
+ Timer *t = ASSERT_PTR(TIMER(u));
if (t->state == TIMER_FAILED)
timer_set_state(t, TIMER_DEAD);
@@ -823,11 +813,9 @@ static void timer_reset_failed(Unit *u) {
}
static void timer_time_change(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
usec_t ts;
- assert(u);
-
if (t->state != TIMER_WAITING)
return;
@@ -849,9 +837,7 @@ static void timer_time_change(Unit *u) {
}
static void timer_timezone_change(Unit *u) {
- Timer *t = TIMER(u);
-
- assert(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
if (t->state != TIMER_WAITING)
return;
@@ -866,10 +852,9 @@ static void timer_timezone_change(Unit *u) {
}
static int timer_clean(Unit *u, ExecCleanMask mask) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
int r;
- assert(t);
assert(mask != 0);
if (t->state != TIMER_DEAD)
@@ -892,9 +877,8 @@ static int timer_clean(Unit *u, ExecCleanMask mask) {
}
static int timer_can_clean(Unit *u, ExecCleanMask *ret) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
- assert(t);
assert(ret);
*ret = t->persistent ? EXEC_CLEAN_STATE : 0;
@@ -902,11 +886,9 @@ static int timer_can_clean(Unit *u, ExecCleanMask *ret) {
}
static int timer_can_start(Unit *u) {
- Timer *t = TIMER(u);
+ Timer *t = ASSERT_PTR(TIMER(u));
int r;
- assert(t);
-
r = unit_test_start_limit(u);
if (r < 0) {
timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT);
@@ -917,9 +899,8 @@ static int timer_can_start(Unit *u) {
}
static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) {
- ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details));
- assert(details);
assert(f);
assert(t);
@@ -950,10 +931,9 @@ static int activation_details_timer_deserialize(const char *key, const char *val
}
static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) {
- ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details));
int r;
- assert(details);
assert(strv);
assert(t);
@@ -972,10 +952,9 @@ static int activation_details_timer_append_env(ActivationDetails *details, char
}
static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) {
- ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details));
int r;
- assert(details);
assert(strv);
assert(t);
@@ -1014,7 +993,7 @@ static const char* const timer_base_table[_TIMER_BASE_MAX] = {
[TIMER_STARTUP] = "OnStartupSec",
[TIMER_UNIT_ACTIVE] = "OnUnitActiveSec",
[TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec",
- [TIMER_CALENDAR] = "OnCalendar"
+ [TIMER_CALENDAR] = "OnCalendar",
};
DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase);
diff --git a/src/core/transaction.c b/src/core/transaction.c
index a81c40f..ab6e699 100644
--- a/src/core/transaction.c
+++ b/src/core/transaction.c
@@ -446,10 +446,10 @@ static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsi
* the graph over 'before' edges in the actual job execution order. We traverse over both unit
* ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job
* execution ordering. */
- for (size_t d = 0; d < ELEMENTSOF(directions); d++) {
+ FOREACH_ELEMENT(d, directions) {
Unit *u;
- UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) {
+ UNIT_FOREACH_DEPENDENCY(u, j->unit, *d) {
Job *o;
/* Is there a job for this unit? */
@@ -463,7 +463,7 @@ static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsi
}
/* Cut traversing if the job j is not really *before* o. */
- if (job_compare(j, o, directions[d]) >= 0)
+ if (job_compare(j, o, *d) >= 0)
continue;
r = transaction_verify_order_one(tr, o, j, generation, e);
@@ -964,7 +964,7 @@ int transaction_add_job_and_dependencies(
if (type != JOB_STOP) {
r = bus_unit_validate_load_state(unit, e);
- /* The time-based cache allows to start new units without daemon-reload, but if they are
+ /* The time-based cache allows new units to be started without daemon-reload, but if they are
* already referenced (because of dependencies or ordering) then we have to force a load of
* the fragment. As an optimization, check first if anything in the usual paths was modified
* since the last time the cache was loaded. Also check if the last time an attempt to load
diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c
index 9f95984..f25e2e3 100644
--- a/src/core/unit-printf.c
+++ b/src/core/unit-printf.c
@@ -4,6 +4,7 @@
#include "cgroup-util.h"
#include "format-util.h"
#include "macro.h"
+#include "sd-path.h"
#include "specifier.h"
#include "string-util.h"
#include "strv.h"
@@ -86,68 +87,46 @@ static void bad_specifier(const Unit *u, char specifier) {
static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
const Unit *u = ASSERT_PTR(userdata);
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
bad_specifier(u, specifier);
- if (u->cgroup_path) {
- char *n;
-
- n = strdup(u->cgroup_path);
- if (!n)
- return -ENOMEM;
-
- *ret = n;
- return 0;
- }
+ if (crt && crt->cgroup_path)
+ return strdup_to(ret, crt->cgroup_path);
return unit_default_cgroup_path(u, ret);
}
static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
const Unit *u = ASSERT_PTR(userdata);
- char *n;
bad_specifier(u, specifier);
- n = strdup(u->manager->cgroup_root);
- if (!n)
- return -ENOMEM;
-
- *ret = n;
- return 0;
+ return strdup_to(ret, u->manager->cgroup_root);
}
static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
const Unit *u = ASSERT_PTR(userdata), *slice;
- char *n;
bad_specifier(u, specifier);
slice = UNIT_GET_SLICE(u);
if (slice) {
- if (slice->cgroup_path)
- n = strdup(slice->cgroup_path);
- else
- return unit_default_cgroup_path(slice, ret);
- } else
- n = strdup(u->manager->cgroup_root);
- if (!n)
- return -ENOMEM;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(slice);
- *ret = n;
- return 0;
+ if (crt && crt->cgroup_path)
+ return strdup_to(ret, crt->cgroup_path);
+
+ return unit_default_cgroup_path(slice, ret);
+ }
+
+ return strdup_to(ret, u->manager->cgroup_root);
}
static int specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
const Unit *u = ASSERT_PTR(userdata);
- char *n;
-
- n = strdup(u->manager->prefix[PTR_TO_UINT(data)]);
- if (!n)
- return -ENOMEM;
- *ret = n;
- return 0;
+ return strdup_to(ret, u->manager->prefix[PTR_TO_UINT(data)]);
}
static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
@@ -164,6 +143,14 @@ static int specifier_credentials_dir(char specifier, const void *data, const cha
return 0;
}
+static int specifier_shared_data_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ assert(ret);
+
+ return sd_path_lookup(MANAGER_IS_SYSTEM(u->manager) ? SD_PATH_SYSTEM_SHARED : SD_PATH_USER_SHARED, NULL, ret);
+}
+
int unit_name_printf(const Unit *u, const char* format, char **ret) {
/*
* This will use the passed string as format string and replace the following specifiers (which should all be
@@ -208,6 +195,7 @@ int unit_full_printf_full(const Unit *u, const char *format, size_t max_length,
*
* %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME)
* %d: the credentials directory ($CREDENTIALS_DIRECTORY)
+ * %D: the shared data root (e.g. /usr/share or $XDG_DATA_HOME)
* %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME)
* %L: the log directory root (e.g. /var/log or $XDG_STATE_HOME/log)
* %S: the state directory root (e.g. /var/lib or $XDG_STATE_HOME)
@@ -245,6 +233,7 @@ int unit_full_printf_full(const Unit *u, const char *format, size_t max_length,
{ 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) },
{ 'd', specifier_credentials_dir, NULL },
+ { 'D', specifier_shared_data_dir, NULL },
{ 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) },
{ 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) },
{ 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) },
diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c
index fe4221c..175e327 100644
--- a/src/core/unit-serialize.c
+++ b/src/core/unit-serialize.c
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include "bpf-restrict-ifaces.h"
#include "bpf-socket-bind.h"
#include "bus-util.h"
#include "dbus.h"
@@ -7,29 +8,11 @@
#include "fileio.h"
#include "format-util.h"
#include "parse-util.h"
-#include "restrict-ifaces.h"
#include "serialize.h"
#include "string-table.h"
#include "unit-serialize.h"
#include "user-util.h"
-static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
- _cleanup_free_ char *s = NULL;
- int r;
-
- assert(f);
- assert(key);
-
- if (mask == 0)
- return 0;
-
- r = cg_mask_to_string(mask, &s);
- if (r < 0)
- return log_error_errno(r, "Failed to format cgroup mask: %m");
-
- return serialize_item(f, key, s);
-}
-
/* Make sure our values fit in the bitfield. */
assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
@@ -69,40 +52,6 @@ static int deserialize_markers(Unit *u, const char *value) {
}
}
-static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
- [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes",
- [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
- [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes",
- [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets",
-};
-
-DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
-
-static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
- [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base",
- [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base",
- [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base",
- [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
-};
-
-DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
-
-static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
- [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last",
- [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last",
- [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last",
- [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
-};
-
-DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
-
-static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
- [CGROUP_MEMORY_PEAK] = "memory-accounting-peak",
- [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
-};
-
-DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
-
int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
int r;
@@ -158,48 +107,7 @@ int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
(void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval);
(void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst);
- (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base);
- if (u->cpu_usage_last != NSEC_INFINITY)
- (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
-
- if (u->managed_oom_kill_last > 0)
- (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last);
-
- if (u->oom_kill_last > 0)
- (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
-
- for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
- (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, u->io_accounting_base[im]);
-
- if (u->io_accounting_last[im] != UINT64_MAX)
- (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, u->io_accounting_last[im]);
- }
-
- for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
- uint64_t v;
-
- r = unit_get_memory_accounting(u, metric, &v);
- if (r >= 0)
- (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
- }
-
- if (u->cgroup_path)
- (void) serialize_item(f, "cgroup", u->cgroup_path);
-
- (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized);
- (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask);
- (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask);
- (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask);
-
- (void) bpf_serialize_socket_bind(u, f, fds);
-
- (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed);
- (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed);
- (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed);
- (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed);
- (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed);
-
- (void) serialize_restrict_network_interfaces(u, f, fds);
+ (void) cgroup_runtime_serialize(u, f, fds);
if (uid_is_valid(u->ref_uid))
(void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid);
@@ -214,14 +122,6 @@ int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
bus_track_serialize(u->bus_track, f, "ref");
- for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
- uint64_t v;
-
- r = unit_get_ip_accounting(u, m, &v);
- if (r >= 0)
- (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
- }
-
if (!switching_root) {
if (u->job) {
fputs("job\n", f);
@@ -297,7 +197,6 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
for (;;) {
_cleanup_free_ char *l = NULL;
- ssize_t m;
size_t k;
char *v;
@@ -380,76 +279,7 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst))
continue;
- else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) ||
- MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, u->oom_kill_last))
- continue;
-
- else if (streq(l, "cgroup")) {
- r = unit_set_cgroup_path(u, v);
- if (r < 0)
- log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
-
- (void) unit_watch_cgroup(u);
- (void) unit_watch_cgroup_memory(u);
-
- continue;
-
- } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask))
- continue;
-
- else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask))
- continue;
-
- else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
- int fd;
-
- fd = deserialize_fd(fds, v);
- if (fd >= 0)
- (void) bpf_socket_bind_add_initial_link_fd(u, fd);
- continue;
-
- } else if (streq(l, "ip-bpf-ingress-installed")) {
- (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed);
- continue;
- } else if (streq(l, "ip-bpf-egress-installed")) {
- (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed);
- continue;
- } else if (streq(l, "bpf-device-control-installed")) {
- (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed);
- continue;
-
- } else if (streq(l, "ip-bpf-custom-ingress-installed")) {
- (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed);
- continue;
- } else if (streq(l, "ip-bpf-custom-egress-installed")) {
- (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed);
- continue;
-
- } else if (streq(l, "restrict-ifaces-bpf-fd")) {
- int fd;
-
- fd = deserialize_fd(fds, v);
- if (fd >= 0)
- (void) restrict_network_interfaces_add_initial_link_fd(u, fd);
-
- continue;
-
- } else if (streq(l, "ref-uid")) {
+ else if (streq(l, "ref-uid")) {
uid_t uid;
r = parse_uid(v, &uid);
@@ -499,55 +329,6 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
continue;
}
- m = memory_accounting_metric_field_last_from_string(l);
- if (m >= 0) {
- uint64_t c;
-
- r = safe_atou64(v, &c);
- if (r < 0)
- log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", v);
- else
- u->memory_accounting_last[m] = c;
- continue;
- }
-
- /* Check if this is an IP accounting metric serialization field */
- m = ip_accounting_metric_field_from_string(l);
- if (m >= 0) {
- uint64_t c;
-
- r = safe_atou64(v, &c);
- if (r < 0)
- log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v);
- else
- u->ip_accounting_extra[m] = c;
- continue;
- }
-
- m = io_accounting_metric_field_base_from_string(l);
- if (m >= 0) {
- uint64_t c;
-
- r = safe_atou64(v, &c);
- if (r < 0)
- log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v);
- else
- u->io_accounting_base[m] = c;
- continue;
- }
-
- m = io_accounting_metric_field_last_from_string(l);
- if (m >= 0) {
- uint64_t c;
-
- r = safe_atou64(v, &c);
- if (r < 0)
- log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v);
- else
- u->io_accounting_last[m] = c;
- continue;
- }
-
r = exec_shared_runtime_deserialize_compat(u, l, v, fds);
if (r < 0) {
log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l);
@@ -556,6 +337,13 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
/* Returns positive if key was handled by the call */
continue;
+ r = cgroup_runtime_deserialize_one(u, l, v, fds);
+ if (r < 0) {
+ log_unit_warning(u, "Failed to deserialize cgroup runtime parameter '%s', ignoring.", l);
+ continue;
+ } else if (r > 0)
+ continue; /* positive return: the key was handled by the call */
+
if (UNIT_VTABLE(u)->deserialize_item) {
r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds);
if (r < 0)
@@ -574,7 +362,9 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
/* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
* applied after we are done. For that we invalidate anything already realized, so that we can
* realize it again. */
- if (u->cgroup_realized) {
+ CGroupRuntime *crt;
+ crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->cgroup_realized) {
unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
unit_invalidate_cgroup_bpf(u);
}
@@ -661,8 +451,8 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
prefix2 = strjoina(prefix, "\t");
fprintf(f,
- "%s-> Unit %s:\n",
- prefix, u->id);
+ "%s%s Unit %s:\n",
+ prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), u->id);
SET_FOREACH(t, u->aliases)
fprintf(f, "%s\tAlias: %s\n", prefix, t);
@@ -707,23 +497,25 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
}
if (UNIT_HAS_CGROUP_CONTEXT(u)) {
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+
fprintf(f,
"%s\tSlice: %s\n"
"%s\tCGroup: %s\n"
"%s\tCGroup realized: %s\n",
prefix, strna(unit_slice_name(u)),
- prefix, strna(u->cgroup_path),
- prefix, yes_no(u->cgroup_realized));
+ prefix, strna(crt ? crt->cgroup_path : NULL),
+ prefix, yes_no(crt ? crt->cgroup_realized : false));
- if (u->cgroup_realized_mask != 0) {
+ if (crt && crt->cgroup_realized_mask != 0) {
_cleanup_free_ char *s = NULL;
- (void) cg_mask_to_string(u->cgroup_realized_mask, &s);
+ (void) cg_mask_to_string(crt->cgroup_realized_mask, &s);
fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s));
}
- if (u->cgroup_enabled_mask != 0) {
+ if (crt && crt->cgroup_enabled_mask != 0) {
_cleanup_free_ char *s = NULL;
- (void) cg_mask_to_string(u->cgroup_enabled_mask, &s);
+ (void) cg_mask_to_string(crt->cgroup_enabled_mask, &s);
fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s));
}
@@ -831,21 +623,26 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
}
}
- if (!hashmap_isempty(u->requires_mounts_for)) {
- UnitDependencyInfo di;
- const char *path;
+ for (UnitMountDependencyType type = 0; type < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; type++)
+ if (!hashmap_isempty(u->mounts_for[type])) {
+ UnitDependencyInfo di;
+ const char *path;
- HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
- bool space = false;
+ HASHMAP_FOREACH_KEY(di.data, path, u->mounts_for[type]) {
+ bool space = false;
- fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path);
+ fprintf(f,
+ "%s\t%s: %s (",
+ prefix,
+ unit_mount_dependency_type_to_string(type),
+ path);
- print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
- print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+ print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+ print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
- fputs(")\n", f);
+ fputs(")\n", f);
+ }
}
- }
if (u->load_state == UNIT_LOADED) {
diff --git a/src/core/unit.c b/src/core/unit.c
index 2fc9f5a..2d40618 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -67,27 +67,29 @@
#endif
/* Thresholds for logging at INFO level about resource consumption */
-#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC)
-#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL)
-#define MENTIONWORTHY_IP_BYTES (0ULL)
+#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC)
+#define MENTIONWORTHY_MEMORY_BYTES (64 * U64_MB)
+#define MENTIONWORTHY_IO_BYTES (1 * U64_MB)
+#define MENTIONWORTHY_IP_BYTES UINT64_C(0)
-/* Thresholds for logging at INFO level about resource consumption */
-#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */
-#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */
-#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */
+/* Thresholds for logging at NOTICE level about resource consumption */
+#define NOTICEWORTHY_CPU_NSEC (10 * NSEC_PER_MINUTE)
+#define NOTICEWORTHY_MEMORY_BYTES (512 * U64_MB)
+#define NOTICEWORTHY_IO_BYTES (10 * U64_MB)
+#define NOTICEWORTHY_IP_BYTES (128 * U64_MB)
const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = {
- [UNIT_SERVICE] = &service_vtable,
- [UNIT_SOCKET] = &socket_vtable,
- [UNIT_TARGET] = &target_vtable,
- [UNIT_DEVICE] = &device_vtable,
- [UNIT_MOUNT] = &mount_vtable,
+ [UNIT_SERVICE] = &service_vtable,
+ [UNIT_SOCKET] = &socket_vtable,
+ [UNIT_TARGET] = &target_vtable,
+ [UNIT_DEVICE] = &device_vtable,
+ [UNIT_MOUNT] = &mount_vtable,
[UNIT_AUTOMOUNT] = &automount_vtable,
- [UNIT_SWAP] = &swap_vtable,
- [UNIT_TIMER] = &timer_vtable,
- [UNIT_PATH] = &path_vtable,
- [UNIT_SLICE] = &slice_vtable,
- [UNIT_SCOPE] = &scope_vtable,
+ [UNIT_SWAP] = &swap_vtable,
+ [UNIT_TIMER] = &timer_vtable,
+ [UNIT_PATH] = &path_vtable,
+ [UNIT_SLICE] = &slice_vtable,
+ [UNIT_SCOPE] = &scope_vtable,
};
Unit* unit_new(Manager *m, size_t size) {
@@ -107,29 +109,13 @@ Unit* unit_new(Manager *m, size_t size) {
u->unit_file_preset = -1;
u->on_failure_job_mode = JOB_REPLACE;
u->on_success_job_mode = JOB_FAIL;
- u->cgroup_control_inotify_wd = -1;
- u->cgroup_memory_inotify_wd = -1;
u->job_timeout = USEC_INFINITY;
u->job_running_timeout = USEC_INFINITY;
u->ref_uid = UID_INVALID;
u->ref_gid = GID_INVALID;
- u->cpu_usage_last = NSEC_INFINITY;
-
- unit_reset_memory_accounting_last(u);
- unit_reset_io_accounting_last(u);
-
- u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
u->failure_action_exit_status = u->success_action_exit_status = -1;
- u->ip_accounting_ingress_map_fd = -EBADF;
- u->ip_accounting_egress_map_fd = -EBADF;
-
- u->ipv4_allow_map_fd = -EBADF;
- u->ipv6_allow_map_fd = -EBADF;
- u->ipv4_deny_map_fd = -EBADF;
- u->ipv6_deny_map_fd = -EBADF;
-
u->last_section_private = -1;
u->start_ratelimit = (const RateLimit) {
@@ -137,7 +123,13 @@ Unit* unit_new(Manager *m, size_t size) {
m->defaults.start_limit_burst,
};
- u->auto_start_stop_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_SEC, .burst = 16 };
+ u->auto_start_stop_ratelimit = (const RateLimit) {
+ .interval = 10 * USEC_PER_SEC,
+ .burst = 16
+ };
+
+ unit_reset_memory_accounting_last(u);
+ unit_reset_io_accounting_last(u);
return u;
}
@@ -251,12 +243,12 @@ int unit_add_name(Unit *u, const char *text) {
if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) {
if (!u->instance)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
- "instance is not set when adding name '%s': %m", text);
+ "Instance is not set when adding name '%s'.", text);
r = unit_name_replace_instance(text, u->instance, &name);
if (r < 0)
return log_unit_debug_errno(u, r,
- "failed to build instance name from '%s': %m", text);
+ "Failed to build instance name from '%s': %m", text);
} else {
name = strdup(text);
if (!name)
@@ -268,47 +260,47 @@ int unit_add_name(Unit *u, const char *text) {
if (hashmap_contains(u->manager->units, name))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
- "unit already exist when adding name '%s': %m", name);
+ "Unit already exist when adding name '%s'.", name);
if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
- "name '%s' is invalid: %m", name);
+ "Name '%s' is invalid.", name);
t = unit_name_to_type(name);
if (t < 0)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
- "failed to derive unit type from name '%s': %m", name);
+ "failed to derive unit type from name '%s'.", name);
if (u->type != _UNIT_TYPE_INVALID && t != u->type)
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
- "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m",
+ "Unit type is illegal: u->type(%d) and t(%d) for name '%s'.",
u->type, t, name);
r = unit_name_to_instance(name, &instance);
if (r < 0)
- return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name);
+ return log_unit_debug_errno(u, r, "Failed to extract instance from name '%s': %m", name);
if (instance && !unit_type_may_template(t))
- return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name);
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "Templates are not allowed for name '%s'.", name);
/* Ensure that this unit either has no instance, or that the instance matches. */
if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
- "cannot add name %s, the instances don't match (\"%s\" != \"%s\").",
+ "Cannot add name %s, the instances don't match (\"%s\" != \"%s\").",
name, instance, u->instance);
if (u->id && !unit_type_may_alias(t))
return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
- "cannot add name %s, aliases are not allowed for %s units.",
+ "Cannot add name %s, aliases are not allowed for %s units.",
name, unit_type_to_string(t));
if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES)
- return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m");
+ return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "Cannot add name, manager has too many units.");
/* Add name to the global hashmap first, because that's easier to undo */
r = hashmap_put(u->manager->units, name, u);
if (r < 0)
- return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text);
+ return log_unit_debug_errno(u, r, "Add unit to hashmap failed for name '%s': %m", text);
if (u->id) {
r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */
@@ -475,7 +467,7 @@ bool unit_may_gc(Unit *u) {
break;
case COLLECT_INACTIVE_OR_FAILED:
- if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+ if (!UNIT_IS_INACTIVE_OR_FAILED(state))
return false;
break;
@@ -488,16 +480,11 @@ bool unit_may_gc(Unit *u) {
if (unit_success_failure_handler_has_jobs(u))
return false;
- if (u->cgroup_path) {
- /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
- * around. Units with active processes should never be collected. */
-
- r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
- if (r < 0)
- log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
- if (r <= 0)
- return false;
- }
+ /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
+ * around. Units with active processes should never be collected. */
+ r = unit_cgroup_is_empty(u);
+ if (r <= 0 && r != -ENXIO)
+ return false; /* ENXIO means: currently not realized */
if (!UNIT_VTABLE(u)->may_gc)
return true;
@@ -689,38 +676,39 @@ static void unit_remove_transient(Unit *u) {
}
}
-static void unit_free_requires_mounts_for(Unit *u) {
+static void unit_free_mounts_for(Unit *u) {
assert(u);
- for (;;) {
- _cleanup_free_ char *path = NULL;
+ for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) {
+ for (;;) {
+ _cleanup_free_ char *path = NULL;
+
+ path = hashmap_steal_first_key(u->mounts_for[t]);
+ if (!path)
+ break;
- path = hashmap_steal_first_key(u->requires_mounts_for);
- if (!path)
- break;
- else {
char s[strlen(path) + 1];
PATH_FOREACH_PREFIX_MORE(s, path) {
char *y;
Set *x;
- x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y);
+ x = hashmap_get2(u->manager->units_needing_mounts_for[t], s, (void**) &y);
if (!x)
continue;
(void) set_remove(x, u);
if (set_isempty(x)) {
- (void) hashmap_remove(u->manager->units_requiring_mounts_for, y);
+ assert_se(hashmap_remove(u->manager->units_needing_mounts_for[t], y));
free(y);
set_free(x);
}
}
}
- }
- u->requires_mounts_for = hashmap_free(u->requires_mounts_for);
+ u->mounts_for[t] = hashmap_free(u->mounts_for[t]);
+ }
}
static void unit_done(Unit *u) {
@@ -769,7 +757,7 @@ Unit* unit_free(Unit *u) {
u->deserialized_refs = strv_free(u->deserialized_refs);
u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
- unit_free_requires_mounts_for(u);
+ unit_free_mounts_for(u);
SET_FOREACH(t, u->aliases)
hashmap_remove_value(u->manager->units, t, u);
@@ -801,12 +789,6 @@ Unit* unit_free(Unit *u) {
if (u->on_console)
manager_unref_console(u->manager);
- fdset_free(u->initial_socket_bind_link_fds);
-#if BPF_FRAMEWORK
- bpf_link_free(u->ipv4_socket_bind_link);
- bpf_link_free(u->ipv6_socket_bind_link);
-#endif
-
unit_release_cgroup(u);
if (!MANAGER_IS_RELOADING(u->manager))
@@ -863,16 +845,6 @@ Unit* unit_free(Unit *u) {
bpf_firewall_close(u);
- hashmap_free(u->bpf_foreign_by_key);
-
- bpf_program_free(u->bpf_device_control_installed);
-
-#if BPF_FRAMEWORK
- bpf_link_free(u->restrict_ifaces_ingress_bpf_link);
- bpf_link_free(u->restrict_ifaces_egress_bpf_link);
-#endif
- fdset_free(u->initial_restric_ifaces_link_fds);
-
condition_free_list(u->conditions);
condition_free_list(u->asserts);
@@ -902,32 +874,6 @@ FreezerState unit_freezer_state(Unit *u) {
return u->freezer_state;
}
-int unit_freezer_state_kernel(Unit *u, FreezerState *ret) {
- char *values[1] = {};
- int r;
-
- assert(u);
-
- r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
- STRV_MAKE("frozen"), values);
- if (r < 0)
- return r;
-
- r = _FREEZER_STATE_INVALID;
-
- if (values[0]) {
- if (streq(values[0], "0"))
- r = FREEZER_RUNNING;
- else if (streq(values[0], "1"))
- r = FREEZER_FROZEN;
- }
-
- free(values[0]);
- *ret = r;
-
- return 0;
-}
-
UnitActiveState unit_active_state(Unit *u) {
assert(u);
@@ -1277,20 +1223,24 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
/* Unlike unit_add_dependency() or friends, this always returns 0 on success. */
- if (c->working_directory && !c->working_directory_missing_ok) {
- r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE);
+ if (c->working_directory) {
+ r = unit_add_mounts_for(
+ u,
+ c->working_directory,
+ UNIT_DEPENDENCY_FILE,
+ c->working_directory_missing_ok ? UNIT_MOUNT_WANTS : UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
if (c->root_directory) {
- r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS);
if (r < 0)
return r;
}
if (c->root_image) {
- r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS);
if (r < 0)
return r;
}
@@ -1299,14 +1249,14 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
if (!u->manager->prefix[dt])
continue;
- for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+ FOREACH_ARRAY(i, c->directories[dt].items, c->directories[dt].n_items) {
_cleanup_free_ char *p = NULL;
- p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path);
+ p = path_join(u->manager->prefix[dt], i->path);
if (!p)
return -ENOMEM;
- r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, p, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES);
if (r < 0)
return r;
}
@@ -1326,16 +1276,11 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
}
if (c->private_tmp) {
-
- /* FIXME: for now we make a special case for /tmp and add a weak dependency on
- * tmp.mount so /tmp being masked is supported. However there's no reason to treat
- * /tmp specifically and masking other mount units should be handled more
- * gracefully too, see PR#16894. */
- r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, "/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS);
if (r < 0)
return r;
- r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE);
+ r = unit_add_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS);
if (r < 0)
return r;
@@ -1366,23 +1311,26 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
* is run first. */
if (c->log_namespace) {
- _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL;
-
- r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit);
- if (r < 0)
- return r;
+ static const struct {
+ const char *template;
+ UnitType type;
+ } deps[] = {
+ { "systemd-journald", UNIT_SOCKET, },
+ { "systemd-journald-varlink", UNIT_SOCKET, },
+ { "systemd-journald-sync", UNIT_SERVICE, },
+ };
- r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE);
- if (r < 0)
- return r;
+ FOREACH_ELEMENT(i, deps) {
+ _cleanup_free_ char *unit = NULL;
- r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit);
- if (r < 0)
- return r;
+ r = unit_name_build_from_type(i->template, c->log_namespace, i->type, &unit);
+ if (r < 0)
+ return r;
- r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE);
- if (r < 0)
- return r;
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, unit, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
} else {
r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE);
if (r < 0)
@@ -1515,6 +1463,7 @@ int unit_add_default_target_dependency(Unit *u, Unit *target) {
static int unit_add_slice_dependencies(Unit *u) {
Unit *slice;
+
assert(u);
if (!UNIT_HAS_CGROUP_CONTEXT(u))
@@ -1526,8 +1475,12 @@ static int unit_add_slice_dependencies(Unit *u) {
UnitDependencyMask mask = u->type == UNIT_SLICE ? UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE;
slice = UNIT_GET_SLICE(u);
- if (slice)
+ if (slice) {
+ if (!IN_SET(slice->freezer_state, FREEZER_RUNNING, FREEZER_THAWING))
+ u->freezer_state = FREEZER_FROZEN_BY_PARENT;
+
return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, slice, true, mask);
+ }
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
return 0;
@@ -1536,51 +1489,72 @@ static int unit_add_slice_dependencies(Unit *u) {
}
static int unit_add_mount_dependencies(Unit *u) {
- UnitDependencyInfo di;
- const char *path;
bool changed = false;
int r;
assert(u);
- HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
- char prefix[strlen(path) + 1];
+ for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) {
+ UnitDependencyInfo di;
+ const char *path;
- PATH_FOREACH_PREFIX_MORE(prefix, path) {
- _cleanup_free_ char *p = NULL;
- Unit *m;
+ HASHMAP_FOREACH_KEY(di.data, path, u->mounts_for[t]) {
- r = unit_name_from_path(prefix, ".mount", &p);
- if (r == -EINVAL)
- continue; /* If the path cannot be converted to a mount unit name, then it's
- * not manageable as a unit by systemd, and hence we don't need a
- * dependency on it. Let's thus silently ignore the issue. */
- if (r < 0)
- return r;
+ char prefix[strlen(ASSERT_PTR(path)) + 1];
- m = manager_get_unit(u->manager, p);
- if (!m) {
- /* Make sure to load the mount unit if it exists. If so the dependencies on
- * this unit will be added later during the loading of the mount unit. */
- (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m);
- continue;
- }
- if (m == u)
- continue;
+ PATH_FOREACH_PREFIX_MORE(prefix, path) {
+ _cleanup_free_ char *p = NULL;
+ Unit *m;
- if (m->load_state != UNIT_LOADED)
- continue;
+ r = unit_name_from_path(prefix, ".mount", &p);
+ if (r == -EINVAL)
+ continue; /* If the path cannot be converted to a mount unit name,
+ * then it's not manageable as a unit by systemd, and
+ * hence we don't need a dependency on it. Let's thus
+ * silently ignore the issue. */
+ if (r < 0)
+ return r;
- r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask);
- if (r < 0)
- return r;
- changed = changed || r > 0;
+ m = manager_get_unit(u->manager, p);
+ if (!m) {
+ /* Make sure to load the mount unit if it exists. If so the
+ * dependencies on this unit will be added later during the loading
+ * of the mount unit. */
+ (void) manager_load_unit_prepare(
+ u->manager,
+ p,
+ /* path= */NULL,
+ /* e= */NULL,
+ &m);
+ continue;
+ }
+ if (m == u)
+ continue;
- if (m->fragment_path) {
- r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask);
+ if (m->load_state != UNIT_LOADED)
+ continue;
+
+ r = unit_add_dependency(
+ u,
+ UNIT_AFTER,
+ m,
+ /* add_reference= */ true,
+ di.origin_mask);
if (r < 0)
return r;
changed = changed || r > 0;
+
+ if (m->fragment_path) {
+ r = unit_add_dependency(
+ u,
+ unit_mount_dependency_type_to_dependency_type(t),
+ m,
+ /* add_reference= */ true,
+ di.origin_mask);
+ if (r < 0)
+ return r;
+ changed = changed || r > 0;
+ }
}
}
}
@@ -1959,6 +1933,10 @@ int unit_start(Unit *u, ActivationDetails *details) {
return unit_start(following, details);
}
+ /* Check to make sure the unit isn't frozen */
+ if (u->freezer_state != FREEZER_RUNNING)
+ return -EDEADLK;
+
/* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */
if (UNIT_VTABLE(u)->can_start) {
r = UNIT_VTABLE(u)->can_start(u);
@@ -1975,7 +1953,6 @@ int unit_start(Unit *u, ActivationDetails *details) {
* waits for a holdoff timer to elapse before it will start again. */
unit_add_to_dbus_queue(u);
- unit_cgroup_freezer_action(u, FREEZER_THAW);
if (!u->activation_details) /* Older details object wins */
u->activation_details = activation_details_ref(details);
@@ -2010,6 +1987,7 @@ bool unit_can_isolate(Unit *u) {
* -EBADR: This unit type does not support stopping.
* -EALREADY: Unit is already stopped.
* -EAGAIN: An operation is already in progress. Retry later.
+ * -EDEADLK: Unit is frozen.
*/
int unit_stop(Unit *u) {
UnitActiveState state;
@@ -2027,11 +2005,14 @@ int unit_stop(Unit *u) {
return unit_stop(following);
}
+ /* Check to make sure the unit isn't frozen */
+ if (u->freezer_state != FREEZER_RUNNING)
+ return -EDEADLK;
+
if (!UNIT_VTABLE(u)->stop)
return -EBADR;
unit_add_to_dbus_queue(u);
- unit_cgroup_freezer_action(u, FREEZER_THAW);
return UNIT_VTABLE(u)->stop(u);
}
@@ -2056,6 +2037,7 @@ bool unit_can_stop(Unit *u) {
* -EBADR: This unit type does not support reloading.
* -ENOEXEC: Unit is not started.
* -EAGAIN: An operation is already in progress. Retry later.
+ * -EDEADLK: Unit is frozen.
*/
int unit_reload(Unit *u) {
UnitActiveState state;
@@ -2082,6 +2064,10 @@ int unit_reload(Unit *u) {
return unit_reload(following);
}
+ /* Check to make sure the unit isn't frozen */
+ if (u->freezer_state != FREEZER_RUNNING)
+ return -EDEADLK;
+
unit_add_to_dbus_queue(u);
if (!UNIT_VTABLE(u)->reload) {
@@ -2090,8 +2076,6 @@ int unit_reload(Unit *u) {
return 0;
}
- unit_cgroup_freezer_action(u, FREEZER_THAW);
-
return UNIT_VTABLE(u)->reload(u);
}
@@ -2238,16 +2222,16 @@ static void retroactively_start_dependencies(Unit *u) {
UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) /* Requires= + BindsTo= */
if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
!UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
- manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL);
+ (void) manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL);
UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_FAIL) /* Wants= */
if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
!UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
- manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL);
+ (void) manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL);
UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) /* Conflicts= (and inverse) */
if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
- manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+ (void) manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
}
static void retroactively_stop_dependencies(Unit *u) {
@@ -2259,7 +2243,7 @@ static void retroactively_stop_dependencies(Unit *u) {
/* Pull down units which are bound to us recursively if enabled */
UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) /* BoundBy= */
if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
- manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+ (void) manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
}
void unit_start_on_failure(
@@ -2291,7 +2275,7 @@ void unit_start_on_failure(
log_unit_warning_errno(
u, r, "Failed to enqueue %s job, ignoring: %s",
dependency_name, bus_error_message(&error, r));
- n_jobs ++;
+ n_jobs++;
}
if (n_jobs >= 0)
@@ -2318,273 +2302,179 @@ static int raise_level(int log_level, bool condition_info, bool condition_notice
}
static int unit_log_resources(Unit *u) {
- struct iovec iovec[1 + 2 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4];
- bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false;
- _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL;
- int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */
- size_t n_message_parts = 0, n_iovec = 0;
- char* message_parts[1 + 2 + 2 + 2 + 1], *t;
- nsec_t nsec = NSEC_INFINITY;
- uint64_t memory_peak = UINT64_MAX, memory_swap_peak = UINT64_MAX;
- int r;
- const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
- [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES",
- [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS",
- [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES",
- [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS",
- };
- const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
- [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES",
- [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES",
- [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS",
- [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS",
+
+ static const struct {
+ const char *journal_field;
+ const char *message_suffix;
+ } memory_fields[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
+ [CGROUP_MEMORY_PEAK] = { "MEMORY_PEAK", "memory peak" },
+ [CGROUP_MEMORY_SWAP_PEAK] = { "MEMORY_SWAP_PEAK", "memory swap peak" },
+ }, ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = { "IP_METRIC_INGRESS_BYTES", "incoming IP traffic" },
+ [CGROUP_IP_EGRESS_BYTES] = { "IP_METRIC_EGRESS_BYTES", "outgoing IP traffic" },
+ [CGROUP_IP_INGRESS_PACKETS] = { "IP_METRIC_INGRESS_PACKETS", NULL },
+ [CGROUP_IP_EGRESS_PACKETS] = { "IP_METRIC_EGRESS_PACKETS", NULL },
+ }, io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = { "IO_METRIC_READ_BYTES", "read from disk" },
+ [CGROUP_IO_WRITE_BYTES] = { "IO_METRIC_WRITE_BYTES", "written to disk" },
+ [CGROUP_IO_READ_OPERATIONS] = { "IO_METRIC_READ_OPERATIONS", NULL },
+ [CGROUP_IO_WRITE_OPERATIONS] = { "IO_METRIC_WRITE_OPERATIONS", NULL },
};
+ struct iovec *iovec = NULL;
+ size_t n_iovec = 0;
+ _cleanup_free_ char *message = NULL, *t = NULL;
+ nsec_t cpu_nsec = NSEC_INFINITY;
+ int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */
+
assert(u);
+ CLEANUP_ARRAY(iovec, n_iovec, iovec_array_free);
+
+ iovec = new(struct iovec, 1 + (_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1) +
+ _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4);
+ if (!iovec)
+ return log_oom();
+
/* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource
* accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced
* information and the complete data in structured fields. */
- (void) unit_get_cpu_usage(u, &nsec);
- if (nsec != NSEC_INFINITY) {
+ (void) unit_get_cpu_usage(u, &cpu_nsec);
+ if (cpu_nsec != NSEC_INFINITY) {
/* Format the CPU time for inclusion in the structured log message */
- if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) {
- r = log_oom();
- goto finish;
- }
- iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+ if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, cpu_nsec) < 0)
+ return log_oom();
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t));
/* Format the CPU time for inclusion in the human language message string */
- t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time");
- if (!t) {
- r = log_oom();
- goto finish;
- }
-
- message_parts[n_message_parts++] = t;
+ if (strextendf_with_separator(&message, ", ",
+ "Consumed %s CPU time",
+ FORMAT_TIMESPAN(cpu_nsec / NSEC_PER_USEC, USEC_PER_MSEC)) < 0)
+ return log_oom();
log_level = raise_level(log_level,
- nsec > MENTIONWORTHY_CPU_NSEC,
- nsec > NOTICEWORTHY_CPU_NSEC);
+ cpu_nsec > MENTIONWORTHY_CPU_NSEC,
+ cpu_nsec > NOTICEWORTHY_CPU_NSEC);
}
- (void) unit_get_memory_accounting(u, CGROUP_MEMORY_PEAK, &memory_peak);
- if (memory_peak != UINT64_MAX) {
- /* Format peak memory for inclusion in the structured log message */
- if (asprintf(&t, "MEMORY_PEAK=%" PRIu64, memory_peak) < 0) {
- r = log_oom();
- goto finish;
- }
- iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+ for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
+ uint64_t value = UINT64_MAX;
- /* Format peak memory for inclusion in the human language message string */
- t = strjoin(FORMAT_BYTES(memory_peak), " memory peak");
- if (!t) {
- r = log_oom();
- goto finish;
- }
- message_parts[n_message_parts++] = t;
- }
+ assert(memory_fields[metric].journal_field);
+ assert(memory_fields[metric].message_suffix);
- (void) unit_get_memory_accounting(u, CGROUP_MEMORY_SWAP_PEAK, &memory_swap_peak);
- if (memory_swap_peak != UINT64_MAX) {
- /* Format peak swap memory for inclusion in the structured log message */
- if (asprintf(&t, "MEMORY_SWAP_PEAK=%" PRIu64, memory_swap_peak) < 0) {
- r = log_oom();
- goto finish;
- }
- iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+ (void) unit_get_memory_accounting(u, metric, &value);
+ if (value == UINT64_MAX)
+ continue;
- /* Format peak swap memory for inclusion in the human language message string */
- t = strjoin(FORMAT_BYTES(memory_swap_peak), " memory swap peak");
- if (!t) {
- r = log_oom();
- goto finish;
- }
- message_parts[n_message_parts++] = t;
+ if (asprintf(&t, "%s=%" PRIu64, memory_fields[metric].journal_field, value) < 0)
+ return log_oom();
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t));
+
+ /* If value is 0, we don't log it in the MESSAGE= field. */
+ if (value == 0)
+ continue;
+
+ if (strextendf_with_separator(&message, ", ", "%s %s",
+ FORMAT_BYTES(value), memory_fields[metric].message_suffix) < 0)
+ return log_oom();
+
+ log_level = raise_level(log_level,
+ value > MENTIONWORTHY_MEMORY_BYTES,
+ value > NOTICEWORTHY_MEMORY_BYTES);
}
for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) {
uint64_t value = UINT64_MAX;
- assert(io_fields[k]);
+ assert(io_fields[k].journal_field);
(void) unit_get_io_accounting(u, k, k > 0, &value);
if (value == UINT64_MAX)
continue;
- have_io_accounting = true;
- if (value > 0)
- any_io = true;
-
/* Format IO accounting data for inclusion in the structured log message */
- if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) {
- r = log_oom();
- goto finish;
- }
- iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+ if (asprintf(&t, "%s=%" PRIu64, io_fields[k].journal_field, value) < 0)
+ return log_oom();
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t));
+
+ /* If value is 0, we don't log it in the MESSAGE= field. */
+ if (value == 0)
+ continue;
/* Format the IO accounting data for inclusion in the human language message string, but only
* for the bytes counters (and not for the operations counters) */
- if (k == CGROUP_IO_READ_BYTES) {
- assert(!rr);
- rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk");
- if (!rr) {
- r = log_oom();
- goto finish;
- }
- } else if (k == CGROUP_IO_WRITE_BYTES) {
- assert(!wr);
- wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk");
- if (!wr) {
- r = log_oom();
- goto finish;
- }
- }
+ if (io_fields[k].message_suffix) {
+ if (strextendf_with_separator(&message, ", ", "%s %s",
+ FORMAT_BYTES(value), io_fields[k].message_suffix) < 0)
+ return log_oom();
- if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES))
log_level = raise_level(log_level,
value > MENTIONWORTHY_IO_BYTES,
value > NOTICEWORTHY_IO_BYTES);
- }
-
- if (have_io_accounting) {
- if (any_io) {
- if (rr)
- message_parts[n_message_parts++] = TAKE_PTR(rr);
- if (wr)
- message_parts[n_message_parts++] = TAKE_PTR(wr);
-
- } else {
- char *k;
-
- k = strdup("no IO");
- if (!k) {
- r = log_oom();
- goto finish;
- }
-
- message_parts[n_message_parts++] = k;
}
}
for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
uint64_t value = UINT64_MAX;
- assert(ip_fields[m]);
+ assert(ip_fields[m].journal_field);
(void) unit_get_ip_accounting(u, m, &value);
if (value == UINT64_MAX)
continue;
- have_ip_accounting = true;
- if (value > 0)
- any_traffic = true;
-
/* Format IP accounting data for inclusion in the structured log message */
- if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) {
- r = log_oom();
- goto finish;
- }
- iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
-
- /* Format the IP accounting data for inclusion in the human language message string, but only for the
- * bytes counters (and not for the packets counters) */
- if (m == CGROUP_IP_INGRESS_BYTES) {
- assert(!igress);
- igress = strjoin("received ", strna(FORMAT_BYTES(value)), " IP traffic");
- if (!igress) {
- r = log_oom();
- goto finish;
- }
- } else if (m == CGROUP_IP_EGRESS_BYTES) {
- assert(!egress);
- egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic");
- if (!egress) {
- r = log_oom();
- goto finish;
- }
- }
+ if (asprintf(&t, "%s=%" PRIu64, ip_fields[m].journal_field, value) < 0)
+ return log_oom();
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t));
+
+ /* If value is 0, we don't log it in the MESSAGE= field. */
+ if (value == 0)
+ continue;
+
+ /* Format the IP accounting data for inclusion in the human language message string, but only
+ * for the bytes counters (and not for the packets counters) */
+ if (ip_fields[m].message_suffix) {
+ if (strextendf_with_separator(&message, ", ", "%s %s",
+ FORMAT_BYTES(value), ip_fields[m].message_suffix) < 0)
+ return log_oom();
- if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
log_level = raise_level(log_level,
value > MENTIONWORTHY_IP_BYTES,
value > NOTICEWORTHY_IP_BYTES);
- }
-
- /* This check is here because it is the earliest point following all possible log_level assignments. If
- * log_level is assigned anywhere after this point, move this check. */
- if (!unit_log_level_test(u, log_level)) {
- r = 0;
- goto finish;
- }
-
- if (have_ip_accounting) {
- if (any_traffic) {
- if (igress)
- message_parts[n_message_parts++] = TAKE_PTR(igress);
- if (egress)
- message_parts[n_message_parts++] = TAKE_PTR(egress);
-
- } else {
- char *k;
-
- k = strdup("no IP traffic");
- if (!k) {
- r = log_oom();
- goto finish;
- }
-
- message_parts[n_message_parts++] = k;
}
}
+ /* This check is here because it is the earliest point following all possible log_level assignments.
+ * (If log_level is assigned anywhere after this point, move this check.) */
+ if (!unit_log_level_test(u, log_level))
+ return 0;
+
/* Is there any accounting data available at all? */
if (n_iovec == 0) {
- r = 0;
- goto finish;
- }
-
- if (n_message_parts == 0)
- t = strjoina("MESSAGE=", u->id, ": Completed.");
- else {
- _cleanup_free_ char *joined = NULL;
-
- message_parts[n_message_parts] = NULL;
-
- joined = strv_join(message_parts, ", ");
- if (!joined) {
- r = log_oom();
- goto finish;
- }
-
- joined[0] = ascii_toupper(joined[0]);
- t = strjoina("MESSAGE=", u->id, ": ", joined, ".");
+ assert(!message);
+ return 0;
}
- /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them,
- * and hence don't increase n_iovec for them */
- iovec[n_iovec] = IOVEC_MAKE_STRING(t);
- iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR);
-
- t = strjoina(u->manager->unit_log_field, u->id);
- iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t);
-
- t = strjoina(u->manager->invocation_log_field, u->invocation_id_string);
- iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t);
+ t = strjoin("MESSAGE=", u->id, ": ", message ?: "Completed", ".");
+ if (!t)
+ return log_oom();
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t));
- log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4);
- r = 0;
+ if (!set_iovec_string_field(iovec, &n_iovec, "MESSAGE_ID=", SD_MESSAGE_UNIT_RESOURCES_STR))
+ return log_oom();
-finish:
- free_many_charp(message_parts, n_message_parts);
+ if (!set_iovec_string_field(iovec, &n_iovec, u->manager->unit_log_field, u->id))
+ return log_oom();
- for (size_t i = 0; i < n_iovec; i++)
- free(iovec[i].iov_base);
+ if (!set_iovec_string_field(iovec, &n_iovec, u->manager->invocation_log_field, u->invocation_id_string))
+ return log_oom();
- return r;
+ log_unit_struct_iovec(u, log_level, iovec, n_iovec);
+ return 0;
}
static void unit_update_on_console(Unit *u) {
@@ -2796,12 +2686,14 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
unit_emit_audit_start(u);
manager_send_unit_plymouth(m, u);
+ manager_send_unit_supervisor(m, u, /* active= */ true);
}
if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) {
/* This unit just stopped/failed. */
unit_emit_audit_stop(u, ns);
+ manager_send_unit_supervisor(m, u, /* active= */ false);
unit_log_resources(u);
}
@@ -2859,7 +2751,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
}
}
-int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive) {
+int unit_watch_pidref(Unit *u, const PidRef *pid, bool exclusive) {
_cleanup_(pidref_freep) PidRef *pid_dup = NULL;
int r;
@@ -2943,7 +2835,7 @@ int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) {
return unit_watch_pidref(u, &pidref, exclusive);
}
-void unit_unwatch_pidref(Unit *u, PidRef *pid) {
+void unit_unwatch_pidref(Unit *u, const PidRef *pid) {
assert(u);
assert(pidref_is_set(pid));
@@ -3005,6 +2897,16 @@ void unit_unwatch_all_pids(Unit *u) {
u->pids = set_free(u->pids);
}
+void unit_unwatch_pidref_done(Unit *u, PidRef *pidref) { /* Unwatches the referenced PID for this unit, then passes the reference to pidref_done(). */
+ assert(u);
+
+ if (!pidref_is_set(pidref)) /* Nothing is referenced, so there is nothing to unwatch or finish. */
+ return;
+
+ unit_unwatch_pidref(u, pidref);
+ pidref_done(pidref); /* NOTE(review): presumably releases/resets the PidRef; confirm against the PidRef API. */
+}
+
static void unit_tidy_watch_pids(Unit *u) {
PidRef *except1, *except2, *e;
@@ -3030,7 +2932,7 @@ static int on_rewatch_pids_event(sd_event_source *s, void *userdata) {
assert(s);
unit_tidy_watch_pids(u);
- unit_watch_all_pids(u);
+ (void) unit_watch_all_pids(u);
/* If the PID set is empty now, then let's finish this off. */
unit_synthesize_cgroup_empty_event(u);
@@ -3043,7 +2945,8 @@ int unit_enqueue_rewatch_pids(Unit *u) {
assert(u);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (!crt || !crt->cgroup_path)
return -ENOENT;
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
@@ -3063,7 +2966,7 @@ int unit_enqueue_rewatch_pids(Unit *u) {
if (r < 0)
return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m");
- r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
+ r = sd_event_source_set_priority(s, EVENT_PRIORITY_REWATCH_PIDS);
if (r < 0)
return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m");
@@ -3288,8 +3191,8 @@ int unit_add_dependency(
if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
return 0;
- /* Note that ordering a device unit after a unit is permitted since it allows to start its job
- * running timeout at a specific time. */
+ /* Note that ordering a device unit after a unit is permitted since it allows its job running
+ * timeout to be started at a specific time. */
if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) {
log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id);
return 0;
@@ -3529,8 +3432,11 @@ int unit_set_slice(Unit *u, Unit *slice) {
return 0;
/* Disallow slice changes if @u is already bound to cgroups */
- if (UNIT_GET_SLICE(u) && u->cgroup_realized)
- return -EBUSY;
+ if (UNIT_GET_SLICE(u)) {
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->cgroup_realized)
+ return -EBUSY;
+ }
/* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */
if (UNIT_GET_SLICE(u))
@@ -4019,28 +3925,25 @@ void unit_notify_cgroup_oom(Unit *u, bool managed_oom) {
UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom);
}
-static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) {
- _cleanup_set_free_ Set *pid_set = NULL;
+static int unit_pid_set(Unit *u, Set **pid_set) {
int r;
- pid_set = set_new(NULL);
- if (!pid_set)
- return NULL;
+ assert(u);
+ assert(pid_set);
+
+ set_clear(*pid_set); /* This updates input. */
/* Exclude the main/control pids from being killed via the cgroup */
- if (main_pid > 0) {
- r = set_put(pid_set, PID_TO_PTR(main_pid));
- if (r < 0)
- return NULL;
- }
- if (control_pid > 0) {
- r = set_put(pid_set, PID_TO_PTR(control_pid));
- if (r < 0)
- return NULL;
- }
+ PidRef *pid;
+ FOREACH_ARGUMENT(pid, unit_main_pid(u), unit_control_pid(u))
+ if (pidref_is_set(pid)) {
+ r = set_ensure_put(pid_set, NULL, PID_TO_PTR(pid->pid));
+ if (r < 0)
+ return r;
+ }
- return TAKE_PTR(pid_set);
+ return 0;
}
static int kill_common_log(const PidRef *pid, int signo, void *userdata) {
@@ -4074,13 +3977,55 @@ static int kill_or_sigqueue(PidRef* pidref, int signo, int code, int value) {
}
}
+static int unit_kill_one(
+ Unit *u,
+ PidRef *pidref,
+ const char *type,
+ int signo,
+ int code,
+ int value,
+ sd_bus_error *ret_error) { /* Sends one signal to a single process of the unit via kill_or_sigqueue().
+  * 'type' is only used as a label in the messages (the visible callers pass "control" and "main").
+  * 'ret_error' is optional: it is filled in on failure only when non-NULL.
+  * Returns 1 if the signal was delivered, 0 if there was nothing to signal (unset PidRef, or the
+  * call reported -ESRCH), and otherwise whatever log_unit_warning_errno() returns for the failure. */
+
+ int r;
+
+ assert(u);
+ assert(type);
+
+ if (!pidref_is_set(pidref)) /* No process of this kind to signal. */
+ return 0;
+
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(pidref, &comm); /* Best effort: only used in the messages below, which wrap it in strna(). */
+
+ r = kill_or_sigqueue(pidref, signo, code, value);
+ if (r == -ESRCH) /* Not treated as a failure, but nothing was killed either. */
+ return 0;
+ if (r < 0) {
+ /* Report this failure both to the logs and to the client */
+ if (ret_error)
+ sd_bus_error_set_errnof(
+ ret_error, r,
+ "Failed to send signal SIG%s to %s process " PID_FMT " (%s): %m",
+ signal_to_string(signo), type, pidref->pid, strna(comm));
+
+ return log_unit_warning_errno(
+ u, r,
+ "Failed to send signal SIG%s to %s process " PID_FMT " (%s) on client request: %m",
+ signal_to_string(signo), type, pidref->pid, strna(comm));
+ }
+
+ log_unit_info(u, "Sent signal SIG%s to %s process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), type, pidref->pid, strna(comm));
+ return 1; /* killed */
+}
+
int unit_kill(
Unit *u,
KillWho who,
int signo,
int code,
int value,
- sd_bus_error *error) {
+ sd_bus_error *ret_error) {
PidRef *main_pid, *control_pid;
bool killed = false;
@@ -4100,110 +4045,71 @@ int unit_kill(
control_pid = unit_control_pid(u);
if (!UNIT_HAS_CGROUP_CONTEXT(u) && !main_pid && !control_pid)
- return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing.");
+ return sd_bus_error_setf(ret_error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing.");
if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) {
if (!main_pid)
- return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
+ return sd_bus_error_setf(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
if (!pidref_is_set(main_pid))
- return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
+ return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
}
if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) {
if (!control_pid)
- return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
+ return sd_bus_error_setf(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
if (!pidref_is_set(control_pid))
- return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
+ return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
}
- if (pidref_is_set(control_pid) &&
- IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
- _cleanup_free_ char *comm = NULL;
- (void) pidref_get_comm(control_pid, &comm);
-
- r = kill_or_sigqueue(control_pid, signo, code, value);
- if (r < 0) {
- ret = r;
-
- /* Report this failure both to the logs and to the client */
- sd_bus_error_set_errnof(
- error, r,
- "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m",
- signal_to_string(signo), control_pid->pid, strna(comm));
- log_unit_warning_errno(
- u, r,
- "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m",
- signal_to_string(signo), control_pid->pid, strna(comm));
- } else {
- log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.",
- signal_to_string(signo), control_pid->pid, strna(comm));
- killed = true;
- }
+ if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+ r = unit_kill_one(u, control_pid, "control", signo, code, value, ret_error);
+ RET_GATHER(ret, r);
+ killed = killed || r > 0;
}
- if (pidref_is_set(main_pid) &&
- IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
- _cleanup_free_ char *comm = NULL;
- (void) pidref_get_comm(main_pid, &comm);
-
- r = kill_or_sigqueue(main_pid, signo, code, value);
- if (r < 0) {
- if (ret == 0) {
- ret = r;
-
- sd_bus_error_set_errnof(
- error, r,
- "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m",
- signal_to_string(signo), main_pid->pid, strna(comm));
- }
-
- log_unit_warning_errno(
- u, r,
- "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m",
- signal_to_string(signo), main_pid->pid, strna(comm));
-
- } else {
- log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.",
- signal_to_string(signo), main_pid->pid, strna(comm));
- killed = true;
- }
+ if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+ r = unit_kill_one(u, main_pid, "main", signo, code, value, ret >= 0 ? ret_error : NULL);
+ RET_GATHER(ret, r);
+ killed = killed || r > 0;
}
/* Note: if we shall enqueue rather than kill we won't do this via the cgroup mechanism, since it
* doesn't really make much sense (and given that enqueued values are a relatively expensive
* resource, and we shouldn't allow us to be subjects for such allocation sprees) */
- if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path && code == SI_USER) {
- _cleanup_set_free_ Set *pid_set = NULL;
+ if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && code == SI_USER) {
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
- /* Exclude the main/control pids from being killed via the cgroup */
- pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
- if (!pid_set)
- return log_oom();
+ if (crt && crt->cgroup_path) {
+ _cleanup_set_free_ Set *pid_set = NULL;
- r = cg_kill_recursive(u->cgroup_path, signo, 0, pid_set, kill_common_log, u);
- if (r < 0) {
- if (!IN_SET(r, -ESRCH, -ENOENT)) {
- if (ret == 0) {
- ret = r;
+ /* Exclude the main/control pids from being killed via the cgroup */
+ r = unit_pid_set(u, &pid_set);
+ if (r < 0)
+ return log_oom();
+ r = cg_kill_recursive(crt->cgroup_path, signo, 0, pid_set, kill_common_log, u);
+ if (r < 0 && !IN_SET(r, -ESRCH, -ENOENT)) {
+ if (ret >= 0)
sd_bus_error_set_errnof(
- error, r,
+ ret_error, r,
"Failed to send signal SIG%s to auxiliary processes: %m",
signal_to_string(signo));
- }
log_unit_warning_errno(
u, r,
"Failed to send signal SIG%s to auxiliary processes on client request: %m",
signal_to_string(signo));
+
+ RET_GATHER(ret, r);
}
- } else
- killed = true;
+
+ killed = killed || r >= 0;
+ }
}
/* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */
- if (ret == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
- return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");
+ if (ret >= 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
+ return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");
return ret;
}
@@ -4316,6 +4222,21 @@ static int user_from_unit_name(Unit *u, char **ret) {
return 0;
}
+static int unit_verify_contexts(const Unit *u, const ExecContext *ec) { /* Refuses unsupported ExecContext setting combinations.
+  * Returns 0 when 'ec' is NULL or passes both checks below; otherwise returns the result of
+  * log_unit_error_errno() invoked with a synthetic ENOEXEC. */
+ assert(u);
+
+ if (!ec) /* No exec context was passed in, so there is nothing to verify. */
+ return 0;
+
+ if (MANAGER_IS_USER(u->manager) && ec->dynamic_user) /* Check 1: dynamic_user set while MANAGER_IS_USER() holds. */
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "DynamicUser= enabled for user unit, which is not supported. Refusing.");
+
+ if (ec->dynamic_user && ec->working_directory_home) /* Check 2: dynamic_user combined with working_directory_home. */
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory=~ is not allowed under DynamicUser=yes. Refusing.");
+
+ return 0;
+}
+
int unit_patch_contexts(Unit *u) {
CGroupContext *cc;
ExecContext *ec;
@@ -4337,16 +4258,14 @@ int unit_patch_contexts(Unit *u) {
return -ENOMEM;
}
- if (MANAGER_IS_USER(u->manager) &&
- !ec->working_directory) {
-
+ if (MANAGER_IS_USER(u->manager) && !ec->working_directory) {
r = get_home_dir(&ec->working_directory);
if (r < 0)
return r;
- /* Allow user services to run, even if the
- * home directory is missing */
- ec->working_directory_missing_ok = true;
+ if (!ec->working_directory_home)
+ /* If home directory is implied by us, allow it to be missing. */
+ ec->working_directory_missing_ok = true;
}
if (ec->private_devices)
@@ -4390,8 +4309,8 @@ int unit_patch_contexts(Unit *u) {
ec->restrict_suid_sgid = true;
}
- for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
- exec_directory_sort(ec->directories + dt);
+ FOREACH_ARRAY(d, ec->directories, _EXEC_DIRECTORY_TYPE_MAX)
+ exec_directory_sort(d);
}
cc = unit_get_cgroup_context(u);
@@ -4441,7 +4360,7 @@ int unit_patch_contexts(Unit *u) {
}
}
- return 0;
+ return unit_verify_contexts(u, ec);
}
ExecContext *unit_get_exec_context(const Unit *u) {
@@ -4458,7 +4377,7 @@ ExecContext *unit_get_exec_context(const Unit *u) {
return (ExecContext*) ((uint8_t*) u + offset);
}
-KillContext *unit_get_kill_context(Unit *u) {
+KillContext *unit_get_kill_context(const Unit *u) {
size_t offset;
assert(u);
@@ -4472,7 +4391,7 @@ KillContext *unit_get_kill_context(Unit *u) {
return (KillContext*) ((uint8_t*) u + offset);
}
-CGroupContext *unit_get_cgroup_context(Unit *u) {
+CGroupContext *unit_get_cgroup_context(const Unit *u) {
size_t offset;
if (u->type < 0)
@@ -4485,7 +4404,7 @@ CGroupContext *unit_get_cgroup_context(Unit *u) {
return (CGroupContext*) ((uint8_t*) u + offset);
}
-ExecRuntime *unit_get_exec_runtime(Unit *u) {
+ExecRuntime *unit_get_exec_runtime(const Unit *u) {
size_t offset;
if (u->type < 0)
@@ -4498,6 +4417,19 @@ ExecRuntime *unit_get_exec_runtime(Unit *u) {
return *(ExecRuntime**) ((uint8_t*) u + offset);
}
+CGroupRuntime *unit_get_cgroup_runtime(const Unit *u) {
+ size_t offset;
+ /* Returns the CGroupRuntime pointer stored in the unit object, or NULL if u->type is negative or
+ * the unit type's vtable records no cgroup runtime offset. The stored pointer may itself still be
+ * NULL as long as unit_setup_cgroup_runtime() has not allocated the object, hence callers have to
+ * check the result. This function only reads the pointer; it does not allocate anything itself. */
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->cgroup_runtime_offset;
+ if (offset <= 0) /* offset is unsigned, so this is "== 0": the unit type has no CGroupRuntime */
+ return NULL;
+
+ return *(CGroupRuntime**) ((uint8_t*) u + offset);
+}
+
static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) {
assert(u);
@@ -4820,26 +4752,57 @@ static int operation_to_signal(
}
}
-int unit_kill_context(
+static int unit_kill_context_one(
Unit *u,
- KillContext *c,
- KillOperation k,
- PidRef* main_pid,
- PidRef* control_pid,
- bool main_pid_alien) {
+ const PidRef *pidref,
+ const char *type,
+ bool is_alien,
+ int sig,
+ bool send_sighup,
+ cg_kill_log_func_t log_func) {
+ int r;
+
+ assert(u);
+ assert(type);
+
+ /* Signals one process of the unit: 'pidref' is the process, 'type' is only used to label it in the
+ * warning below, and 'log_func' (optional) is invoked right before the signal is sent. If
+ * 'send_sighup' is set, a SIGHUP is sent as well once the first signal went through.
+ *
+ * This returns > 0 if it makes sense to wait for SIGCHLD for the process, == 0 if not. If sending
+ * the signal fails with anything but -ESRCH, a warning is logged and the result of
+ * log_unit_warning_errno() is returned instead; unit_kill_context() only checks for > 0.
+ *
+ * NOTE(review): 'is_alien' presumably marks a process that is not our child, for which no SIGCHLD
+ * will arrive — confirm against the main_pid() implementations. */
+
+ if (!pidref_is_set(pidref)) /* Nothing to signal */
+ return 0;
+
+ if (log_func)
+ log_func(pidref, sig, u);
+
+ r = pidref_kill_and_sigcont(pidref, sig);
+ if (r == -ESRCH) /* Same return value as on success, but the SIGHUP below is skipped */
+ return !is_alien;
+ if (r < 0) {
+ _cleanup_free_ char *comm = NULL;
+
+ (void) pidref_get_comm(pidref, &comm);
+ return log_unit_warning_errno(u, r, "Failed to kill %s process " PID_FMT " (%s), ignoring: %m", type, pidref->pid, strna(comm));
+ }
+
+ if (send_sighup)
+ (void) pidref_kill(pidref, SIGHUP);
+
+ return !is_alien;
+}
+
+int unit_kill_context(Unit *u, KillOperation k) {
bool wait_for_exit = false, send_sighup;
cg_kill_log_func_t log_func = NULL;
int sig, r;
assert(u);
- assert(c);
/* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0
* if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common()
* which is used for user-requested killing of unit processes. */
- if (c->kill_mode == KILL_NONE)
+ KillContext *c = unit_get_kill_context(u);
+ if (!c || c->kill_mode == KILL_NONE)
return 0;
bool noteworthy;
@@ -4852,61 +4815,33 @@ int unit_kill_context(
IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) &&
sig != SIGHUP;
- if (pidref_is_set(main_pid)) {
- if (log_func)
- log_func(main_pid, sig, u);
-
- r = pidref_kill_and_sigcont(main_pid, sig);
- if (r < 0 && r != -ESRCH) {
- _cleanup_free_ char *comm = NULL;
- (void) pidref_get_comm(main_pid, &comm);
+ bool is_alien;
+ PidRef *main_pid = unit_main_pid_full(u, &is_alien);
+ r = unit_kill_context_one(u, main_pid, "main", is_alien, sig, send_sighup, log_func);
+ wait_for_exit = wait_for_exit || r > 0;
- log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid->pid, strna(comm));
- } else {
- if (!main_pid_alien)
- wait_for_exit = true;
+ r = unit_kill_context_one(u, unit_control_pid(u), "control", /* is_alien = */ false, sig, send_sighup, log_func);
+ wait_for_exit = wait_for_exit || r > 0;
- if (r != -ESRCH && send_sighup)
- (void) pidref_kill(main_pid, SIGHUP);
- }
- }
-
- if (pidref_is_set(control_pid)) {
- if (log_func)
- log_func(control_pid, sig, u);
-
- r = pidref_kill_and_sigcont(control_pid, sig);
- if (r < 0 && r != -ESRCH) {
- _cleanup_free_ char *comm = NULL;
- (void) pidref_get_comm(control_pid, &comm);
-
- log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid->pid, strna(comm));
- } else {
- wait_for_exit = true;
-
- if (r != -ESRCH && send_sighup)
- (void) pidref_kill(control_pid, SIGHUP);
- }
- }
-
- if (u->cgroup_path &&
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->cgroup_path &&
(c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) {
_cleanup_set_free_ Set *pid_set = NULL;
/* Exclude the main/control pids from being killed via the cgroup */
- pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
- if (!pid_set)
- return -ENOMEM;
+ r = unit_pid_set(u, &pid_set);
+ if (r < 0)
+ return r;
r = cg_kill_recursive(
- u->cgroup_path,
+ crt->cgroup_path,
sig,
CGROUP_SIGCONT|CGROUP_IGNORE_SELF,
pid_set,
log_func, u);
if (r < 0) {
if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT))
- log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(crt->cgroup_path));
} else if (r > 0) {
@@ -4922,14 +4857,12 @@ int unit_kill_context(
wait_for_exit = true;
if (send_sighup) {
- set_free(pid_set);
-
- pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
- if (!pid_set)
- return -ENOMEM;
+ r = unit_pid_set(u, &pid_set);
+ if (r < 0)
+ return r;
(void) cg_kill_recursive(
- u->cgroup_path,
+ crt->cgroup_path,
SIGHUP,
CGROUP_IGNORE_SELF,
pid_set,
@@ -4942,11 +4875,16 @@ int unit_kill_context(
return wait_for_exit;
}
-int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) {
+int unit_add_mounts_for(Unit *u, const char *path, UnitDependencyMask mask, UnitMountDependencyType type) {
+ Hashmap **unit_map, **manager_map;
int r;
assert(u);
assert(path);
+ assert(type >= 0 && type < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX);
+
+ unit_map = &u->mounts_for[type];
+ manager_map = &u->manager->units_needing_mounts_for[type];
/* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these
* paths in the unit (from the path to the UnitDependencyInfo structure indicating how to the
@@ -4956,7 +4894,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask)
if (!path_is_absolute(path))
return -EINVAL;
- if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */
+ if (hashmap_contains(*unit_map, path)) /* Exit quickly if the path is already covered. */
return 0;
/* Use the canonical form of the path as the stored key. We call path_is_normalized()
@@ -4975,7 +4913,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask)
.origin_mask = mask
};
- r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data);
+ r = hashmap_ensure_put(unit_map, &path_hash_ops, p, di.data);
if (r < 0)
return r;
assert(r > 0);
@@ -4985,11 +4923,11 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask)
PATH_FOREACH_PREFIX_MORE(prefix, path) {
Set *x;
- x = hashmap_get(u->manager->units_requiring_mounts_for, prefix);
+ x = hashmap_get(*manager_map, prefix);
if (!x) {
_cleanup_free_ char *q = NULL;
- r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops);
+ r = hashmap_ensure_allocated(manager_map, &path_hash_ops);
if (r < 0)
return r;
@@ -5001,7 +4939,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask)
if (!x)
return -ENOMEM;
- r = hashmap_put(u->manager->units_requiring_mounts_for, q, x);
+ r = hashmap_put(*manager_map, q, x);
if (r < 0) {
set_free(x);
return r;
@@ -5035,8 +4973,7 @@ int unit_setup_exec_runtime(Unit *u) {
if (*rt)
return 0;
- ec = unit_get_exec_context(u);
- assert(ec);
+ ec = ASSERT_PTR(unit_get_exec_context(u));
r = unit_get_transitive_dependency_set(u, UNIT_ATOM_JOINS_NAMESPACE_OF, &units);
if (r < 0)
@@ -5073,6 +5010,21 @@ int unit_setup_exec_runtime(Unit *u) {
return r;
}
+CGroupRuntime *unit_setup_cgroup_runtime(Unit *u) {
+ size_t offset;
+
+ assert(u);
+ /* Returns the unit's CGroupRuntime object, creating it with cgroup_runtime_new() and storing it in
+ * the unit if there is none yet. May only be called for unit types whose vtable records a cgroup
+ * runtime offset (asserted below). Returns NULL if cgroup_runtime_new() returns NULL; the caller
+ * unit_fork_helper_process() turns that into -ENOMEM. */
+ offset = UNIT_VTABLE(u)->cgroup_runtime_offset;
+ assert(offset > 0);
+
+ CGroupRuntime **rt = (CGroupRuntime**) ((uint8_t*) u + offset);
+ if (*rt) /* Already set up, reuse */
+ return *rt;
+
+ return (*rt = cgroup_runtime_new());
+}
+
bool unit_type_supported(UnitType t) {
static int8_t cache[_UNIT_TYPE_MAX] = {}; /* -1: disabled, 1: enabled: 0: don't know */
int r;
@@ -5178,12 +5130,14 @@ PidRef* unit_control_pid(Unit *u) {
return NULL;
}
-PidRef* unit_main_pid(Unit *u) {
+PidRef* unit_main_pid_full(Unit *u, bool *ret_is_alien) { /* Returns what the unit type's main_pid() callback returns, or NULL if the type has no such callback */
assert(u);
if (UNIT_VTABLE(u)->main_pid)
- return UNIT_VTABLE(u)->main_pid(u);
+ return UNIT_VTABLE(u)->main_pid(u, ret_is_alien); /* the callback is handed ret_is_alien and is presumably responsible for filling it in — confirm per unit type */
+ if (ret_is_alien) /* Optional out-parameter: unit_main_pid() passes NULL here */
+ *ret_is_alien = false;
return NULL;
}
@@ -5393,7 +5347,6 @@ int unit_acquire_invocation_id(Unit *u) {
}
int unit_set_exec_params(Unit *u, ExecParameters *p) {
- const char *confirm_spawn;
int r;
assert(u);
@@ -5406,19 +5359,17 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
p->runtime_scope = u->manager->runtime_scope;
- confirm_spawn = manager_get_confirm_spawn(u->manager);
- if (confirm_spawn) {
- p->confirm_spawn = strdup(confirm_spawn);
- if (!p->confirm_spawn)
- return -ENOMEM;
- }
+ r = strdup_to(&p->confirm_spawn, manager_get_confirm_spawn(u->manager));
+ if (r < 0)
+ return r;
p->cgroup_supported = u->manager->cgroup_supported;
p->prefix = u->manager->prefix;
SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager));
/* Copy parameters from unit */
- p->cgroup_path = u->cgroup_path;
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ p->cgroup_path = crt ? crt->cgroup_path : NULL;
SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u));
p->received_credentials_directory = u->manager->received_credentials_directory;
@@ -5428,17 +5379,18 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
p->fallback_smack_process_label = u->manager->defaults.smack_process_label;
- if (u->manager->restrict_fs && p->bpf_outer_map_fd < 0) {
- int fd = lsm_bpf_map_restrict_fs_fd(u);
+ if (u->manager->restrict_fs && p->bpf_restrict_fs_map_fd < 0) {
+ int fd = bpf_restrict_fs_map_fd(u);
if (fd < 0)
return fd;
- p->bpf_outer_map_fd = fd;
+ p->bpf_restrict_fs_map_fd = fd;
}
p->user_lookup_fd = u->manager->user_lookup_fds[1];
+ p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
- p->cgroup_id = u->cgroup_id;
+ p->cgroup_id = crt ? crt->cgroup_id : 0;
p->invocation_id = u->invocation_id;
sd_id128_to_string(p->invocation_id, p->invocation_id_string);
p->unit_id = strdup(u->id);
@@ -5460,6 +5412,10 @@ int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) {
(void) unit_realize_cgroup(u);
+ CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
+ if (!crt)
+ return -ENOMEM;
+
r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid);
if (r < 0)
return r;
@@ -5482,10 +5438,10 @@ int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) {
(void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE);
(void) ignore_signals(SIGPIPE);
- if (u->cgroup_path) {
- r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL);
+ if (crt->cgroup_path) {
+ r = cg_attach_everywhere(u->manager->cgroup_supported, crt->cgroup_path, 0, NULL, NULL);
if (r < 0) {
- log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path));
+ log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(crt->cgroup_path));
_exit(EXIT_CGROUP);
}
}
@@ -5880,9 +5836,10 @@ int unit_prepare_exec(Unit *u) {
(void) unit_realize_cgroup(u);
- if (u->reset_accounting) {
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+ if (crt && crt->reset_accounting) {
(void) unit_reset_accounting(u);
- u->reset_accounting = false;
+ crt->reset_accounting = false;
}
unit_export_state_files(u);
@@ -5942,11 +5899,13 @@ int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) {
(void) unit_pick_cgroup_path(u);
- if (!u->cgroup_path)
+ CGroupRuntime *crt = unit_get_cgroup_runtime(u);
+
+ if (!crt || !crt->cgroup_path)
return 0;
return cg_kill_recursive(
- u->cgroup_path,
+ crt->cgroup_path,
/* sig= */ 0,
/* flags= */ 0,
/* set= */ NULL,
@@ -5976,7 +5935,7 @@ bool unit_needs_console(Unit *u) {
return exec_context_may_touch_console(ec);
}
-int unit_pid_attachable(Unit *u, PidRef *pid, sd_bus_error *error) {
+int unit_pid_attachable(Unit *u, const PidRef *pid, sd_bus_error *error) {
int r;
assert(u);
@@ -6213,19 +6172,98 @@ bool unit_can_isolate_refuse_manual(Unit *u) {
return unit_can_isolate(u) && !u->refuse_manual_start;
}
+void unit_next_freezer_state(Unit *u, FreezerAction action, FreezerState *ret, FreezerState *ret_target) {
+ Unit *slice;
+ FreezerState curr, parent, next, tgt;
+
+ assert(u);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE,
+ FREEZER_THAW, FREEZER_PARENT_THAW));
+ assert(ret);
+ assert(ret_target);
+
+ /* This function determines the correct freezer state transitions for a unit
+ * given the action being requested. It returns the next state, and also the "target",
+ * which is either FREEZER_FROZEN or FREEZER_RUNNING, depending on what actual state we
+ * ultimately want to achieve.
+ *
+ * The next state is stored in *ret and the target in *ret_target; both pointers must be
+ * non-NULL. The decision is based on u->freezer_state and on the freezer state of the unit's
+ * slice. This function does not assign u->freezer_state itself. */
+
+ curr = u->freezer_state;
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ parent = slice->freezer_state;
+ else /* No slice: behave as if the parent were running */
+ parent = FREEZER_RUNNING;
+
+ if (action == FREEZER_FREEZE) {
+ /* We always "promote" a freeze initiated by parent into a normal freeze */
+ if (IN_SET(curr, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT))
+ next = FREEZER_FROZEN;
+ else
+ next = FREEZER_FREEZING;
+ } else if (action == FREEZER_THAW) {
+ /* Thawing is the most complicated operation here, because we can't thaw a unit
+ * if its parent is frozen. So we instead "demote" a normal freeze into a freeze
+ * initiated by parent if the parent is frozen */
+ if (IN_SET(curr, FREEZER_RUNNING, FREEZER_THAWING, FREEZER_FREEZING_BY_PARENT, FREEZER_FROZEN_BY_PARENT))
+ next = curr;
+ else if (curr == FREEZER_FREEZING) {
+ if (IN_SET(parent, FREEZER_RUNNING, FREEZER_THAWING))
+ next = FREEZER_THAWING;
+ else
+ next = FREEZER_FREEZING_BY_PARENT;
+ } else {
+ assert(curr == FREEZER_FROZEN);
+ if (IN_SET(parent, FREEZER_RUNNING, FREEZER_THAWING))
+ next = FREEZER_THAWING;
+ else
+ next = FREEZER_FROZEN_BY_PARENT;
+ }
+ } else if (action == FREEZER_PARENT_FREEZE) {
+ /* We need to avoid accidentally demoting units frozen manually */
+ if (IN_SET(curr, FREEZER_FREEZING, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT))
+ next = curr;
+ else
+ next = FREEZER_FREEZING_BY_PARENT;
+ } else {
+ assert(action == FREEZER_PARENT_THAW);
+
+ /* We don't want to thaw units from a parent if they were frozen
+ * manually, so for such units this action is a no-op */
+ if (IN_SET(curr, FREEZER_RUNNING, FREEZER_FREEZING, FREEZER_FROZEN))
+ next = curr;
+ else
+ next = FREEZER_THAWING;
+ }
+ /* The target is derived from 'next' via freezer_state_finish() (presumably the state a transition
+ * settles in — confirm), with FREEZER_FROZEN_BY_PARENT folded into FREEZER_FROZEN, so that only the
+ * two values asserted below remain. */
+ tgt = freezer_state_finish(next);
+ if (tgt == FREEZER_FROZEN_BY_PARENT)
+ tgt = FREEZER_FROZEN;
+ assert(IN_SET(tgt, FREEZER_RUNNING, FREEZER_FROZEN));
+
+ *ret = next;
+ *ret_target = tgt;
+}
+
bool unit_can_freeze(Unit *u) {
assert(u);
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return false; /* The root slice and the init scope are never reported as freezable */
+ /* Otherwise defer to the unit type: use its can_freeze() callback if it has one, else the unit
+ * counts as freezable iff the type implements freezer_action() (non-NULL pointer converts to true). */
if (UNIT_VTABLE(u)->can_freeze)
return UNIT_VTABLE(u)->can_freeze(u);
- return UNIT_VTABLE(u)->freeze;
+ return UNIT_VTABLE(u)->freezer_action;
}
void unit_frozen(Unit *u) {
assert(u);
- u->freezer_state = FREEZER_FROZEN;
+ u->freezer_state = u->freezer_state == FREEZER_FREEZING_BY_PARENT
+ ? FREEZER_FROZEN_BY_PARENT
+ : FREEZER_FROZEN;
+ /* The assignment above keeps track of who asked for the freeze: a unit that was in
+ * FREEZER_FREEZING_BY_PARENT ends up in FREEZER_FROZEN_BY_PARENT, any other state ends up in
+ * FREEZER_FROZEN. Now log the resulting state and hand over to the bus code. */
+ log_unit_debug(u, "Unit now %s.", freezer_state_to_string(u->freezer_state));
bus_unit_send_pending_freezer_message(u, false);
}
@@ -6235,19 +6273,19 @@ void unit_thawed(Unit *u) {
u->freezer_state = FREEZER_RUNNING;
+ log_unit_debug(u, "Unit thawed.");
+
bus_unit_send_pending_freezer_message(u, false);
}
-static int unit_freezer_action(Unit *u, FreezerAction action) {
+int unit_freezer_action(Unit *u, FreezerAction action) {
UnitActiveState s;
- int (*method)(Unit*);
int r;
assert(u);
assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
- method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw;
- if (!method || !cg_freezer_supported())
+ if (!cg_freezer_supported() || !unit_can_freeze(u))
return -EOPNOTSUPP;
if (u->job)
@@ -6260,36 +6298,21 @@ static int unit_freezer_action(Unit *u, FreezerAction action) {
if (s != UNIT_ACTIVE)
return -EHOSTDOWN;
- if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) ||
- (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW))
+ if (action == FREEZER_FREEZE && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT))
return -EALREADY;
+ if (action == FREEZER_THAW && u->freezer_state == FREEZER_THAWING)
+ return -EALREADY;
+ if (action == FREEZER_THAW && IN_SET(u->freezer_state, FREEZER_FREEZING_BY_PARENT, FREEZER_FROZEN_BY_PARENT))
+ return -ECHILD;
- r = method(u);
+ r = UNIT_VTABLE(u)->freezer_action(u, action);
if (r <= 0)
return r;
- assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING));
-
+ assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING));
return 1;
}
-int unit_freeze(Unit *u) {
- return unit_freezer_action(u, FREEZER_FREEZE);
-}
-
-int unit_thaw(Unit *u) {
- return unit_freezer_action(u, FREEZER_THAW);
-}
-
-/* Wrappers around low-level cgroup freezer operations common for service and scope units */
-int unit_freeze_vtable_common(Unit *u) {
- return unit_cgroup_freezer_action(u, FREEZER_FREEZE);
-}
-
-int unit_thaw_vtable_common(Unit *u) {
- return unit_cgroup_freezer_action(u, FREEZER_THAW);
-}
-
Condition *unit_find_failed_condition(Unit *u) {
Condition *failed_trigger = NULL;
bool has_succeeded_trigger = false;
@@ -6310,7 +6333,7 @@ Condition *unit_find_failed_condition(Unit *u) {
}
static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
- [COLLECT_INACTIVE] = "inactive",
+ [COLLECT_INACTIVE] = "inactive",
[COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",
};
@@ -6460,7 +6483,7 @@ int unit_compare_priority(Unit *a, Unit *b) {
}
const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = {
- [UNIT_PATH] = &activation_details_path_vtable,
+ [UNIT_PATH] = &activation_details_path_vtable,
[UNIT_TIMER] = &activation_details_timer_vtable,
};
@@ -6596,11 +6619,7 @@ int activation_details_append_pair(ActivationDetails *details, char ***strv) {
return 0;
if (!isempty(details->trigger_unit_name)) {
- r = strv_extend(strv, "trigger_unit");
- if (r < 0)
- return r;
-
- r = strv_extend(strv, details->trigger_unit_name);
+ r = strv_extend_many(strv, "trigger_unit", details->trigger_unit_name);
if (r < 0)
return r;
}
@@ -6615,3 +6634,24 @@ int activation_details_append_pair(ActivationDetails *details, char ***strv) {
}
DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free);
+/* String form of each UnitMountDependencyType; the names match the WantsMountsFor=/RequiresMountsFor= settings. */
+static const char* const unit_mount_dependency_type_table[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX] = {
+ [UNIT_MOUNT_WANTS] = "WantsMountsFor",
+ [UNIT_MOUNT_REQUIRES] = "RequiresMountsFor",
+};
+/* Presumably provides unit_mount_dependency_type_to_string()/_from_string() as declared in unit.h — confirm against the macro. */
+DEFINE_STRING_TABLE_LOOKUP(unit_mount_dependency_type, UnitMountDependencyType);
+/* Maps a mount dependency type to the unit dependency it stands for: UNIT_MOUNT_WANTS → UNIT_WANTS,
+ * UNIT_MOUNT_REQUIRES → UNIT_REQUIRES. Any other value is a programming error (assert_not_reached()). */
+UnitDependency unit_mount_dependency_type_to_dependency_type(UnitMountDependencyType t) {
+ switch (t) {
+
+ case UNIT_MOUNT_WANTS:
+ return UNIT_WANTS;
+
+ case UNIT_MOUNT_REQUIRES:
+ return UNIT_REQUIRES;
+
+ default:
+ assert_not_reached();
+ }
+}
diff --git a/src/core/unit.h b/src/core/unit.h
index 60bc2e3..b135fec 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
+#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/socket.h>
@@ -8,6 +9,14 @@
#include "sd-id128.h"
+/* The kind of dependency a WantsMountsFor=/RequiresMountsFor= path creates.
+ * Circular dependency with manager.h, needs to be defined before local includes */
+typedef enum UnitMountDependencyType {
+ UNIT_MOUNT_WANTS, /* translated to UNIT_WANTS by unit_mount_dependency_type_to_dependency_type() */
+ UNIT_MOUNT_REQUIRES, /* translated to UNIT_REQUIRES by the same function */
+ _UNIT_MOUNT_DEPENDENCY_TYPE_MAX, /* number of valid values; sizes Unit.mounts_for[] */
+ _UNIT_MOUNT_DEPENDENCY_TYPE_INVALID = -EINVAL, /* negative, so it is rejected by "type >= 0" range checks; uses <errno.h> */
+} UnitMountDependencyType;
+
#include "bpf-program.h"
#include "cgroup.h"
#include "condition.h"
@@ -55,7 +64,11 @@ static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) {
}
static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) {
- return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED;
+ return t >= 0 && t < _UNIT_LOAD_STATE_MAX && !IN_SET(t, UNIT_STUB, UNIT_MERGED); /* any in-range load state except UNIT_STUB and UNIT_MERGED */
+}
+/* True if 't' is UNIT_NOT_FOUND, UNIT_BAD_SETTING or UNIT_ERROR, i.e. one of the load states this helper classifies as an error. */
+static inline bool UNIT_IS_LOAD_ERROR(UnitLoadState t) {
+ return IN_SET(t, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR);
}
/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We
@@ -199,6 +212,7 @@ struct UnitRef {
LIST_FIELDS(UnitRef, refs_by_target);
};
+/* The generic, dynamic definition of the unit */
typedef struct Unit {
Manager *manager;
@@ -216,9 +230,9 @@ typedef struct Unit {
* Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */
Hashmap *dependencies;
- /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the
- * UnitDependencyInfo type */
- Hashmap *requires_mounts_for;
+ /* Similar, for RequiresMountsFor= and WantsMountsFor= path dependencies. The key is the path, the
+ * value the UnitDependencyInfo type */
+ Hashmap *mounts_for[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX];
char *description;
char **documentation;
@@ -361,74 +375,6 @@ typedef struct Unit {
UnitFileState unit_file_state;
PresetAction unit_file_preset;
- /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
- nsec_t cpu_usage_base;
- nsec_t cpu_usage_last; /* the most recently read value */
-
- /* Most recently read value of memory accounting metrics */
- uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1];
-
- /* The current counter of OOM kills initiated by systemd-oomd */
- uint64_t managed_oom_kill_last;
-
- /* The current counter of the oom_kill field in the memory.events cgroup attribute */
- uint64_t oom_kill_last;
-
- /* Where the io.stat data was at the time the unit was started */
- uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
- uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */
-
- /* Counterparts in the cgroup filesystem */
- char *cgroup_path;
- uint64_t cgroup_id;
- CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
- CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
- CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */
- CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
-
- /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
- int cgroup_control_inotify_wd;
- int cgroup_memory_inotify_wd;
-
- /* Device Controller BPF program */
- BPFProgram *bpf_device_control_installed;
-
- /* IP BPF Firewalling/accounting */
- int ip_accounting_ingress_map_fd;
- int ip_accounting_egress_map_fd;
- uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
-
- int ipv4_allow_map_fd;
- int ipv6_allow_map_fd;
- int ipv4_deny_map_fd;
- int ipv6_deny_map_fd;
- BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
- BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
-
- Set *ip_bpf_custom_ingress;
- Set *ip_bpf_custom_ingress_installed;
- Set *ip_bpf_custom_egress;
- Set *ip_bpf_custom_egress_installed;
-
- /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd,
- * attached to unit cgroup by provided program fd and attach type. */
- Hashmap *bpf_foreign_by_key;
-
- FDSet *initial_socket_bind_link_fds;
-#if BPF_FRAMEWORK
- /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and
- * responsible for allowing or denying a unit to bind(2) to a socket
- * address. */
- struct bpf_link *ipv4_socket_bind_link;
- struct bpf_link *ipv6_socket_bind_link;
-#endif
-
- FDSet *initial_restric_ifaces_link_fds;
-#if BPF_FRAMEWORK
- struct bpf_link *restrict_ifaces_ingress_bpf_link;
- struct bpf_link *restrict_ifaces_egress_bpf_link;
-#endif
-
/* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
* ones which might have appeared. */
sd_event_source *rewatch_pids_event_source;
@@ -499,12 +445,6 @@ typedef struct Unit {
bool in_audit:1;
bool on_console:1;
- bool cgroup_realized:1;
- bool cgroup_members_mask_valid:1;
-
- /* Reset cgroup accounting next time we fork something off */
- bool reset_accounting:1;
-
bool start_limit_hit:1;
/* Did we already invoke unit_coldplug() for this unit? */
@@ -520,9 +460,6 @@ typedef struct Unit {
bool exported_log_ratelimit_interval:1;
bool exported_log_ratelimit_burst:1;
- /* Whether we warned about clamping the CPU quota period */
- bool warned_clamping_cpu_quota_period:1;
-
/* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If
* == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */
signed int last_section_private:2;
@@ -568,6 +505,7 @@ static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) {
#include "kill.h"
+/* The static const, immutable data about a specific unit type */
typedef struct UnitVTable {
/* How much memory does an object of this unit type need */
size_t object_size;
@@ -584,11 +522,14 @@ typedef struct UnitVTable {
* KillContext is found, if the unit type has that */
size_t kill_context_offset;
- /* If greater than 0, the offset into the object where the
- * pointer to ExecSharedRuntime is found, if the unit type has
- * that */
+ /* If greater than 0, the offset into the object where the pointer to ExecRuntime is found, if
+ * the unit type has that */
size_t exec_runtime_offset;
+ /* If greater than 0, the offset into the object where the pointer to CGroupRuntime is found, if the
+ * unit type has that */
+ size_t cgroup_runtime_offset;
+
/* The name of the configuration file section with the private settings of this unit */
const char *private_section;
@@ -633,9 +574,9 @@ typedef struct UnitVTable {
/* Clear out the various runtime/state/cache/logs/configuration data */
int (*clean)(Unit *u, ExecCleanMask m);
- /* Freeze the unit */
- int (*freeze)(Unit *u);
- int (*thaw)(Unit *u);
+ /* Freeze or thaw the unit. Returns > 0 to indicate that the request will be handled asynchronously; unit_frozen
+ * or unit_thawed should be called once the operation is done. Returns 0 if done successfully, or < 0 on error. */
+ int (*freezer_action)(Unit *u, FreezerAction a);
bool (*can_freeze)(Unit *u);
/* Return which kind of data can be cleaned */
@@ -691,6 +632,9 @@ typedef struct UnitVTable {
/* Called whenever a process of this unit sends us a message */
void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds);
+ /* Called whenever we learn a handoff timestamp */
+ void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts);
+
/* Called whenever a name this Unit registered for comes or goes away. */
void (*bus_name_owner_change)(Unit *u, const char *new_owner);
@@ -722,10 +666,10 @@ typedef struct UnitVTable {
/* Returns the start timeout of a unit */
usec_t (*get_timeout_start_usec)(Unit *u);
- /* Returns the main PID if there is any defined, or 0. */
- PidRef* (*main_pid)(Unit *u);
+ /* Returns the main PID if there is any defined, or NULL. */
+ PidRef* (*main_pid)(Unit *u, bool *ret_is_alien);
- /* Returns the control PID if there is any defined, or 0. */
+ /* Returns the control PID if there is any defined, or NULL. */
PidRef* (*control_pid)(Unit *u);
/* Returns true if the unit currently needs access to the console */
@@ -794,6 +738,9 @@ typedef struct UnitVTable {
/* If true, we'll notify plymouth about this unit */
bool notify_plymouth;
+ /* If true, we'll notify a surrounding VMM/container manager about this unit becoming available */
+ bool notify_supervisor;
+
/* The audit events to generate on start + stop (or 0 if none shall be generated) */
int audit_start_message_type;
int audit_stop_message_type;
@@ -903,7 +850,6 @@ bool unit_has_name(const Unit *u, const char *name);
UnitActiveState unit_active_state(Unit *u);
FreezerState unit_freezer_state(Unit *u);
-int unit_freezer_state_kernel(Unit *u, FreezerState *ret);
const char* unit_sub_state_to_string(Unit *u);
@@ -916,17 +862,18 @@ int unit_start(Unit *u, ActivationDetails *details);
int unit_stop(Unit *u);
int unit_reload(Unit *u);
-int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *error);
+int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *ret_error);
void unit_notify_cgroup_oom(Unit *u, bool managed_oom);
void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success);
-int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive);
+int unit_watch_pidref(Unit *u, const PidRef *pid, bool exclusive);
int unit_watch_pid(Unit *u, pid_t pid, bool exclusive);
-void unit_unwatch_pidref(Unit *u, PidRef *pid);
+void unit_unwatch_pidref(Unit *u, const PidRef *pid);
void unit_unwatch_pid(Unit *u, pid_t pid);
void unit_unwatch_all_pids(Unit *u);
+void unit_unwatch_pidref_done(Unit *u, PidRef *pidref);
int unit_enqueue_rewatch_pids(Unit *u);
void unit_dequeue_rewatch_pids(Unit *u);
@@ -984,12 +931,14 @@ void unit_ref_unset(UnitRef *ref);
int unit_patch_contexts(Unit *u);
ExecContext *unit_get_exec_context(const Unit *u) _pure_;
-KillContext *unit_get_kill_context(Unit *u) _pure_;
-CGroupContext *unit_get_cgroup_context(Unit *u) _pure_;
+KillContext *unit_get_kill_context(const Unit *u) _pure_;
+CGroupContext *unit_get_cgroup_context(const Unit *u) _pure_;
-ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_;
+ExecRuntime *unit_get_exec_runtime(const Unit *u) _pure_;
+CGroupRuntime *unit_get_cgroup_runtime(const Unit *u) _pure_;
int unit_setup_exec_runtime(Unit *u);
+CGroupRuntime *unit_setup_cgroup_runtime(Unit *u);
const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf);
char* unit_concat_strv(char **l, UnitWriteFlags flags);
@@ -997,11 +946,11 @@ char* unit_concat_strv(char **l, UnitWriteFlags flags);
int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data);
int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5);
-int unit_kill_context(Unit *u, KillContext *c, KillOperation k, PidRef *main_pid, PidRef *control_pid, bool main_pid_alien);
+int unit_kill_context(Unit *u, KillOperation k);
int unit_make_transient(Unit *u);
-int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask);
+int unit_add_mounts_for(Unit *u, const char *path, UnitDependencyMask mask, UnitMountDependencyType type);
bool unit_type_supported(UnitType t);
@@ -1012,7 +961,10 @@ bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit);
bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit);
PidRef* unit_control_pid(Unit *u);
-PidRef* unit_main_pid(Unit *u);
+PidRef* unit_main_pid_full(Unit *u, bool *ret_is_alien);
+static inline PidRef* unit_main_pid(Unit *u) { /* Convenience wrapper around unit_main_pid_full() for callers that only want the PidRef */
+ return unit_main_pid_full(u, NULL); /* NULL is passed for ret_is_alien, so no "is alien" flag is requested */
+}
void unit_warn_if_dir_nonempty(Unit *u, const char* where);
int unit_fail_if_noncanonical(Unit *u, const char* where);
@@ -1046,7 +998,7 @@ int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func);
bool unit_needs_console(Unit *u);
-int unit_pid_attachable(Unit *unit, PidRef *pid, sd_bus_error *error);
+int unit_pid_attachable(Unit *unit, const PidRef *pid, sd_bus_error *error);
static inline bool unit_has_job_type(Unit *u, JobType type) {
return u && u->job && u->job->type == type;
@@ -1086,21 +1038,21 @@ bool unit_can_stop_refuse_manual(Unit *u);
bool unit_can_isolate_refuse_manual(Unit *u);
bool unit_can_freeze(Unit *u);
-int unit_freeze(Unit *u);
+int unit_freezer_action(Unit *u, FreezerAction action);
+void unit_next_freezer_state(Unit *u, FreezerAction a, FreezerState *ret, FreezerState *ret_tgt);
void unit_frozen(Unit *u);
-
-int unit_thaw(Unit *u);
void unit_thawed(Unit *u);
-int unit_freeze_vtable_common(Unit *u);
-int unit_thaw_vtable_common(Unit *u);
-
Condition *unit_find_failed_condition(Unit *u);
int unit_arm_timer(Unit *u, sd_event_source **source, bool relative, usec_t usec, sd_event_time_handler_t handler);
int unit_compare_priority(Unit *a, Unit *b);
+UnitMountDependencyType unit_mount_dependency_type_from_string(const char *s) _const_;
+const char* unit_mount_dependency_type_to_string(UnitMountDependencyType t) _const_;
+UnitDependency unit_mount_dependency_type_to_dependency_type(UnitMountDependencyType t) _pure_;
+
/* Macros which append UNIT= or USER_UNIT= to the message */
#define log_unit_full_errno_zerook(unit, level, error, ...) \