From efeb864cb547a2cbf96dc0053a8bdb4d9190b364 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 12 Jun 2024 05:50:45 +0200 Subject: Merging upstream version 256. Signed-off-by: Daniel Baumann --- src/core/automount.c | 75 +- src/core/bpf-devices.c | 82 +- src/core/bpf-firewall.c | 160 ++-- src/core/bpf-foreign.c | 21 +- src/core/bpf-lsm.c | 320 ------- src/core/bpf-lsm.h | 28 - src/core/bpf-restrict-fs.c | 324 +++++++ src/core/bpf-restrict-fs.h | 23 + src/core/bpf-restrict-ifaces.c | 223 +++++ src/core/bpf-restrict-ifaces.h | 16 + src/core/bpf-socket-bind.c | 51 +- src/core/bpf-socket-bind.h | 2 +- src/core/bpf-util.c | 3 +- src/core/cgroup.c | 1646 ++++++++++++++++++++++++++------- src/core/cgroup.h | 132 ++- src/core/core-varlink.c | 105 ++- src/core/core-varlink.h | 4 - src/core/crash-handler.c | 8 +- src/core/dbus-cgroup.c | 21 +- src/core/dbus-execute.c | 117 +-- src/core/dbus-execute.h | 1 + src/core/dbus-job.c | 25 +- src/core/dbus-manager.c | 407 +++++--- src/core/dbus-mount.c | 31 +- src/core/dbus-scope.c | 24 +- src/core/dbus-service.c | 2 - src/core/dbus-socket.c | 4 + src/core/dbus-unit.c | 158 ++-- src/core/dbus-util.c | 7 +- src/core/dbus-util.h | 3 +- src/core/dbus.c | 92 +- src/core/device.c | 75 +- src/core/dynamic-user.c | 49 +- src/core/emergency-action.c | 32 +- src/core/emergency-action.h | 6 +- src/core/exec-credential.c | 256 ++--- src/core/exec-credential.h | 4 +- src/core/exec-invoke.c | 649 +++++++------ src/core/execute-serialize.c | 131 +-- src/core/execute.c | 239 +++-- src/core/execute.h | 175 ++-- src/core/executor.c | 5 +- src/core/fuzz-execute-serialize.c | 2 +- src/core/generator-setup.c | 12 +- src/core/import-creds.c | 17 +- src/core/job.c | 42 +- src/core/job.h | 1 + src/core/kmod-setup.c | 48 +- src/core/load-fragment-gperf.gperf.in | 15 +- src/core/load-fragment.c | 353 +++---- src/core/load-fragment.h | 4 +- src/core/main.c | 246 ++++- src/core/main.h | 14 +- src/core/manager-dump.c | 2 +- src/core/manager-serialize.c 
| 97 +- src/core/manager.c | 671 ++++++++------ src/core/manager.h | 63 +- src/core/meson.build | 7 +- src/core/mount.c | 353 ++++--- src/core/mount.h | 1 + src/core/namespace.c | 333 ++++--- src/core/path.c | 81 +- src/core/restrict-ifaces.c | 200 ---- src/core/restrict-ifaces.h | 16 - src/core/scope.c | 95 +- src/core/scope.h | 1 + src/core/selinux-access.c | 5 +- src/core/service.c | 787 ++++++++-------- src/core/service.h | 4 + src/core/show-status.c | 4 +- src/core/slice.c | 147 ++- src/core/slice.h | 2 + src/core/socket.c | 382 ++++---- src/core/socket.h | 4 +- src/core/swap.c | 257 +++-- src/core/swap.h | 1 + src/core/system.conf.in | 3 +- src/core/taint.c | 85 ++ src/core/taint.h | 4 + src/core/target.c | 57 +- src/core/timer.c | 89 +- src/core/transaction.c | 8 +- src/core/unit-printf.c | 59 +- src/core/unit-serialize.c | 279 +----- src/core/unit.c | 1264 +++++++++++++------------ src/core/unit.h | 166 ++-- 86 files changed, 6932 insertions(+), 5085 deletions(-) delete mode 100644 src/core/bpf-lsm.c delete mode 100644 src/core/bpf-lsm.h create mode 100644 src/core/bpf-restrict-fs.c create mode 100644 src/core/bpf-restrict-fs.h create mode 100644 src/core/bpf-restrict-ifaces.c create mode 100644 src/core/bpf-restrict-ifaces.h delete mode 100644 src/core/restrict-ifaces.c delete mode 100644 src/core/restrict-ifaces.h create mode 100644 src/core/taint.c create mode 100644 src/core/taint.h (limited to 'src/core') diff --git a/src/core/automount.c b/src/core/automount.c index 14bf7e6..6cb9d52 100644 --- a/src/core/automount.c +++ b/src/core/automount.c @@ -38,10 +38,10 @@ #include "unit.h" static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = { - [AUTOMOUNT_DEAD] = UNIT_INACTIVE, + [AUTOMOUNT_DEAD] = UNIT_INACTIVE, [AUTOMOUNT_WAITING] = UNIT_ACTIVE, [AUTOMOUNT_RUNNING] = UNIT_ACTIVE, - [AUTOMOUNT_FAILED] = UNIT_FAILED + [AUTOMOUNT_FAILED] = UNIT_FAILED, }; static int open_dev_autofs(Manager *m); @@ -51,10 +51,8 @@ static void 
automount_stop_expire(Automount *a); static int automount_send_ready(Automount *a, Set *tokens, int status); static void automount_init(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); - assert(a); - assert(u); assert(u->load_state == UNIT_STUB); a->pipe_fd = -EBADF; @@ -88,9 +86,7 @@ static void unmount_autofs(Automount *a) { } static void automount_done(Unit *u) { - Automount *a = AUTOMOUNT(u); - - assert(a); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); unmount_autofs(a); @@ -126,7 +122,7 @@ static int automount_add_mount_dependencies(Automount *a) { if (r < 0) return r; - return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT); + return unit_add_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES); } static int automount_add_default_dependencies(Automount *a) { @@ -227,10 +223,9 @@ static int automount_add_extras(Automount *a) { } static int automount_load(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(u); assert(u->load_state == UNIT_STUB); /* Load a .automount file */ @@ -250,6 +245,7 @@ static int automount_load(Unit *u) { static void automount_set_state(Automount *a, AutomountState state) { AutomountState old_state; + assert(a); if (a->state != state) @@ -271,10 +267,9 @@ static void automount_set_state(Automount *a, AutomountState state) { } static int automount_coldplug(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(a); assert(a->state == AUTOMOUNT_DEAD); if (a->deserialized_state == a->state) @@ -310,9 +305,7 @@ static int automount_coldplug(Unit *u) { } static void automount_dump(Unit *u, FILE *f, const char *prefix) { - Automount *a = AUTOMOUNT(u); - - assert(a); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); fprintf(f, "%sAutomount State: %s\n" @@ -478,30 +471,22 @@ static int automount_send_ready(Automount *a, Set *tokens, int status) { r = 0; /* Autofs thankfully does not 
hand out 0 as a token */ - while ((token = PTR_TO_UINT(set_steal_first(tokens)))) { - int k; - + while ((token = PTR_TO_UINT(set_steal_first(tokens)))) /* Autofs fun fact: * - * if you pass a positive status code here, kernels - * prior to 4.12 will freeze! Yay! */ - - k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, - ioctl_fd, - token, - status); - if (k < 0) - r = k; - } + * if you pass a positive status code here, kernels prior to 4.12 will freeze! Yay! */ + RET_GATHER(r, autofs_send_ready(UNIT(a)->manager->dev_autofs_fd, + ioctl_fd, + token, + status)); return r; } static void automount_trigger_notify(Unit *u, Unit *other) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(a); assert(other); /* Filter out invocations with bogus state */ @@ -697,11 +682,10 @@ static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) { } static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) { + Automount *a = ASSERT_PTR(AUTOMOUNT(userdata)); _cleanup_close_ int ioctl_fd = -EBADF; - Automount *a = AUTOMOUNT(userdata); int r; - assert(a); assert(source == a->expire_event_source); ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id); @@ -815,13 +799,12 @@ fail: } static int automount_start(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(a); assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED)); - if (path_is_mount_point(a->where, NULL, 0) > 0) + if (path_is_mount_point(a->where) > 0) return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where); r = unit_test_trigger_loaded(u); @@ -838,9 +821,8 @@ static int automount_start(Unit *u) { } static int automount_stop(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); - assert(a); assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)); automount_enter_dead(a, 
AUTOMOUNT_SUCCESS); @@ -848,11 +830,10 @@ static int automount_stop(Unit *u) { } static int automount_serialize(Unit *u, FILE *f, FDSet *fds) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); void *p; int r; - assert(a); assert(f); assert(fds); @@ -873,10 +854,9 @@ static int automount_serialize(Unit *u, FILE *f, FDSet *fds) { } static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(a); assert(fds); if (streq(key, "state")) { @@ -958,13 +938,12 @@ static bool automount_may_gc(Unit *u) { } static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) { + Automount *a = ASSERT_PTR(AUTOMOUNT(userdata)); _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; union autofs_v5_packet_union packet; - Automount *a = AUTOMOUNT(userdata); Unit *trigger; int r; - assert(a); assert(fd == a->pipe_fd); if (events & (EPOLLHUP|EPOLLERR)) { @@ -1048,9 +1027,7 @@ static void automount_shutdown(Manager *m) { } static void automount_reset_failed(Unit *u) { - Automount *a = AUTOMOUNT(u); - - assert(a); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); if (a->state == AUTOMOUNT_FAILED) automount_set_state(a, AUTOMOUNT_DEAD); @@ -1068,11 +1045,9 @@ static bool automount_supported(void) { } static int automount_can_start(Unit *u) { - Automount *a = AUTOMOUNT(u); + Automount *a = ASSERT_PTR(AUTOMOUNT(u)); int r; - assert(a); - r = unit_test_start_limit(u); if (r < 0) { automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT); diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c index 06d2146..8484dbc 100644 --- a/src/core/bpf-devices.c +++ b/src/core/bpf-devices.c @@ -24,15 +24,15 @@ assert_cc((unsigned) BPF_DEVCG_ACC_WRITE == (unsigned) CGROUP_DEVICE_WRITE); static int bpf_prog_allow_list_device( BPFProgram *prog, char type, - int major, - int minor, + unsigned major, + unsigned minor, 
CGroupDevicePermissions p) { int r; assert(prog); - log_trace("%s: %c %d:%d %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p)); + log_trace("%s: %c %u:%u %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p)); if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) return -EINVAL; @@ -56,22 +56,22 @@ static int bpf_prog_allow_list_device( else r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); if (r < 0) - log_error_errno(r, "Extending device control BPF program failed: %m"); + return log_error_errno(r, "Extending device control BPF program failed: %m"); - return r; + return 1; /* return 1 → we did something */ } static int bpf_prog_allow_list_major( BPFProgram *prog, char type, - int major, + unsigned major, CGroupDevicePermissions p) { int r; assert(prog); - log_trace("%s: %c %d:* %s", __func__, type, major, cgroup_device_permissions_to_string(p)); + log_trace("%s: %c %u:* %s", __func__, type, major, cgroup_device_permissions_to_string(p)); if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX) return -EINVAL; @@ -94,9 +94,9 @@ static int bpf_prog_allow_list_major( else r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); if (r < 0) - log_error_errno(r, "Extending device control BPF program failed: %m"); + return log_error_errno(r, "Extending device control BPF program failed: %m"); - return r; + return 1; /* return 1 → we did something */ } static int bpf_prog_allow_list_class( @@ -130,9 +130,9 @@ static int bpf_prog_allow_list_class( else r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); if (r < 0) - log_error_errno(r, "Extending device control BPF program failed: %m"); + return log_error_errno(r, "Extending device control BPF program failed: %m"); - return r; + return 1; /* return 1 → we did something */ } int bpf_devices_cgroup_init( @@ -165,8 +165,10 @@ int bpf_devices_cgroup_init( assert(ret); - if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list) + if (policy == 
CGROUP_DEVICE_POLICY_AUTO && !allow_list) { + *ret = NULL; return 0; + } r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog); if (r < 0) @@ -179,8 +181,7 @@ int bpf_devices_cgroup_init( } *ret = TAKE_PTR(prog); - - return 0; + return 1; } int bpf_devices_apply_policy( @@ -307,8 +308,8 @@ static int allow_list_device_pattern( BPFProgram *prog, const char *path, char type, - const unsigned *maj, - const unsigned *min, + unsigned major, + unsigned minor, CGroupDevicePermissions p) { assert(IN_SET(type, 'b', 'c')); @@ -317,10 +318,10 @@ static int allow_list_device_pattern( if (!prog) return 0; - if (maj && min) - return bpf_prog_allow_list_device(prog, type, *maj, *min, p); - else if (maj) - return bpf_prog_allow_list_major(prog, type, *maj, p); + if (major != UINT_MAX && minor != UINT_MAX) + return bpf_prog_allow_list_device(prog, type, major, minor, p); + else if (major != UINT_MAX) + return bpf_prog_allow_list_major(prog, type, major, p); else return bpf_prog_allow_list_class(prog, type, p); @@ -328,10 +329,10 @@ static int allow_list_device_pattern( char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4]; int r; - if (maj && min) - xsprintf(buf, "%c %u:%u %s", type, *maj, *min, cgroup_device_permissions_to_string(p)); - else if (maj) - xsprintf(buf, "%c %u:* %s", type, *maj, cgroup_device_permissions_to_string(p)); + if (major != UINT_MAX && minor != UINT_MAX) + xsprintf(buf, "%c %u:%u %s", type, major, minor, cgroup_device_permissions_to_string(p)); + else if (major != UINT_MAX) + xsprintf(buf, "%c %u:* %s", type, major, cgroup_device_permissions_to_string(p)); else xsprintf(buf, "%c *:* %s", type, cgroup_device_permissions_to_string(p)); @@ -371,8 +372,14 @@ int bpf_devices_allow_list_device( return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); struct stat st; - if (stat(node, &st) < 0) + if (stat(node, &st) < 0) { + if (errno == ENOENT) { + log_debug_errno(errno, "Device '%s' does not exist, skipping.", node); + 
return 0; /* returning 0 means → skipped */ + } + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); + } if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node); @@ -381,8 +388,7 @@ int bpf_devices_allow_list_device( rdev = (dev_t) st.st_rdev; } - unsigned maj = major(rdev), min = minor(rdev); - return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, p); + return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', major(rdev), minor(rdev), p); } int bpf_devices_allow_list_major( @@ -392,7 +398,7 @@ int bpf_devices_allow_list_major( char type, CGroupDevicePermissions permissions) { - unsigned maj; + unsigned major; int r; assert(path); @@ -401,12 +407,12 @@ int bpf_devices_allow_list_major( if (streq(name, "*")) /* If the name is a wildcard, then apply this list to all devices of this type */ - return allow_list_device_pattern(prog, path, type, NULL, NULL, permissions); + return allow_list_device_pattern(prog, path, type, /* major= */ UINT_MAX, /* minor= */ UINT_MAX, permissions); - if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) + if (safe_atou(name, &major) >= 0 && DEVICE_MAJOR_VALID(major)) /* The name is numeric and suitable as major. In that case, let's take its major, and create * the entry directly. 
*/ - return allow_list_device_pattern(prog, path, type, &maj, NULL, permissions); + return allow_list_device_pattern(prog, path, type, major, /* minor= */ UINT_MAX, permissions); _cleanup_fclose_ FILE *f = NULL; bool good = false, any = false; @@ -450,10 +456,10 @@ int bpf_devices_allow_list_major( continue; *w = 0; - r = safe_atou(p, &maj); + r = safe_atou(p, &major); if (r < 0) continue; - if (maj <= 0) + if (major <= 0) continue; w++; @@ -462,15 +468,15 @@ int bpf_devices_allow_list_major( if (fnmatch(name, w, 0) != 0) continue; - any = true; - (void) allow_list_device_pattern(prog, path, type, &maj, NULL, permissions); + if (allow_list_device_pattern(prog, path, type, major, /* minor= */ UINT_MAX, permissions) > 0) + any = true; } if (!any) return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Device allow list pattern \"%s\" did not match anything.", name); - return 0; + return any; } int bpf_devices_allow_list_static( @@ -492,13 +498,13 @@ int bpf_devices_allow_list_static( NULSTR_FOREACH_PAIR(node, acc, auto_devices) { k = bpf_devices_allow_list_device(prog, path, node, cgroup_device_permissions_from_string(acc)); - if (r >= 0 && k < 0) + if ((r >= 0 && k < 0) || (r >= 0 && k > 0)) r = k; } /* PTS (/dev/pts) devices may not be duplicated, but accessed */ k = bpf_devices_allow_list_major(prog, path, "pts", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE); - if (r >= 0 && k < 0) + if ((r >= 0 && k < 0) || (r >= 0 && k > 0)) r = k; return r; diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c index 66773e1..185ed7d 100644 --- a/src/core/bpf-firewall.c +++ b/src/core/bpf-firewall.c @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +/* Make sure the net/if.h header is included before any linux/ one */ +#include #include #include #include #include #include #include -#include #include #include #include @@ -196,19 +197,26 @@ static int bpf_firewall_compile_bpf( _cleanup_(bpf_program_freep) BPFProgram *p = NULL; int accounting_map_fd, r; bool 
access_enabled; + CGroupRuntime *crt; assert(u); assert(ret); + crt = unit_get_cgroup_runtime(u); + if (!crt) { + *ret = NULL; + return 0; + } + accounting_map_fd = is_ingress ? - u->ip_accounting_ingress_map_fd : - u->ip_accounting_egress_map_fd; + crt->ip_accounting_ingress_map_fd : + crt->ip_accounting_egress_map_fd; access_enabled = - u->ipv4_allow_map_fd >= 0 || - u->ipv6_allow_map_fd >= 0 || - u->ipv4_deny_map_fd >= 0 || - u->ipv6_deny_map_fd >= 0 || + crt->ipv4_allow_map_fd >= 0 || + crt->ipv6_allow_map_fd >= 0 || + crt->ipv4_deny_map_fd >= 0 || + crt->ipv6_deny_map_fd >= 0 || ip_allow_any || ip_deny_any; @@ -234,26 +242,26 @@ static int bpf_firewall_compile_bpf( * - Otherwise, access will be granted */ - if (u->ipv4_deny_map_fd >= 0) { - r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (crt->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, crt->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); if (r < 0) return r; } - if (u->ipv6_deny_map_fd >= 0) { - r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (crt->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, crt->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); if (r < 0) return r; } - if (u->ipv4_allow_map_fd >= 0) { - r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); + if (crt->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, crt->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); if (r < 0) return r; } - if (u->ipv6_allow_map_fd >= 0) { - r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (crt->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, crt->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); if (r < 0) return r; } @@ -495,37 +503,36 @@ static int bpf_firewall_prepare_access_maps( return 0; } -static int bpf_firewall_prepare_accounting_maps(Unit *u, bool 
enabled, int *fd_ingress, int *fd_egress) { +static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, CGroupRuntime *crt) { int r; assert(u); - assert(fd_ingress); - assert(fd_egress); + assert(crt); if (enabled) { - if (*fd_ingress < 0) { + if (crt->ip_accounting_ingress_map_fd < 0) { char *name = strjoina("I_", u->id); r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); if (r < 0) return r; - *fd_ingress = r; + crt->ip_accounting_ingress_map_fd = r; } - if (*fd_egress < 0) { + if (crt->ip_accounting_egress_map_fd < 0) { char *name = strjoina("E_", u->id); r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); if (r < 0) return r; - *fd_egress = r; + crt->ip_accounting_egress_map_fd = r; } } else { - *fd_ingress = safe_close(*fd_ingress); - *fd_egress = safe_close(*fd_egress); + crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd); + crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd); - zero(u->ip_accounting_extra); + zero(crt->ip_accounting_extra); } return 0; @@ -535,6 +542,7 @@ int bpf_firewall_compile(Unit *u) { const char *ingress_name = NULL, *egress_name = NULL; bool ip_allow_any = false, ip_deny_any = false; CGroupContext *cc; + CGroupRuntime *crt; int r, supported; assert(u); @@ -543,6 +551,10 @@ int bpf_firewall_compile(Unit *u) { if (!cc) return -EINVAL; + crt = unit_setup_cgroup_runtime(u); + if (!crt) + return -ENOMEM; + supported = bpf_firewall_supported(); if (supported < 0) return supported; @@ -569,14 +581,14 @@ int bpf_firewall_compile(Unit *u) { * but we reuse the accounting maps. 
That way the firewall in effect always maps to the actual * configuration, but we don't flush out the accounting unnecessarily */ - u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); - u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); + crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress); + crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress); - u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); - u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd); + crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd); - u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); - u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd); + crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd); if (u->type != UNIT_SLICE) { /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf @@ -585,24 +597,24 @@ int bpf_firewall_compile(Unit *u) { * means that all configure IP access rules *will* take effect on processes, even though we never * compile them for inner nodes. 
*/ - r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any); + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &crt->ipv4_allow_map_fd, &crt->ipv6_allow_map_fd, &ip_allow_any); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m"); - r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any); + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &crt->ipv4_deny_map_fd, &crt->ipv6_deny_map_fd, &ip_deny_any); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m"); } - r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, crt); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m"); - r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any); + r = bpf_firewall_compile_bpf(u, ingress_name, true, &crt->ip_bpf_ingress, ip_allow_any, ip_deny_any); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m"); - r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any); + r = bpf_firewall_compile_bpf(u, egress_name, false, &crt->ip_bpf_egress, ip_allow_any, ip_deny_any); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m"); @@ -634,6 +646,7 @@ static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set int bpf_firewall_load_custom(Unit *u) { CGroupContext *cc; + CGroupRuntime *crt; int r, supported; assert(u); @@ -641,6 +654,9 @@ int bpf_firewall_load_custom(Unit *u) { cc = unit_get_cgroup_context(u); if (!cc) return 0; + crt = unit_get_cgroup_runtime(u); + if 
(!crt) + return 0; if (!(cc->ip_filters_ingress || cc->ip_filters_egress)) return 0; @@ -653,10 +669,10 @@ int bpf_firewall_load_custom(Unit *u) { return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); - r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress); + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &crt->ip_bpf_custom_ingress); if (r < 0) return r; - r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress); + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &crt->ip_bpf_custom_egress); if (r < 0) return r; @@ -686,6 +702,7 @@ int bpf_firewall_install(Unit *u) { _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL; _cleanup_free_ char *path = NULL; CGroupContext *cc; + CGroupRuntime *crt; int r, supported; uint32_t flags; @@ -694,9 +711,12 @@ int bpf_firewall_install(Unit *u) { cc = unit_get_cgroup_context(u); if (!cc) return -EINVAL; - if (!u->cgroup_path) + crt = unit_get_cgroup_runtime(u); + if (!crt) + return -EINVAL; + if (!crt->cgroup_path) return -EINVAL; - if (!u->cgroup_realized) + if (!crt->cgroup_realized) return -EINVAL; supported = bpf_firewall_supported(); @@ -709,11 +729,11 @@ int bpf_firewall_install(Unit *u) { return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units."); if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && - (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress))) + (!set_isempty(crt->ip_bpf_custom_ingress) || !set_isempty(crt->ip_bpf_custom_egress))) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + r = 
cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &path); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m"); @@ -724,44 +744,44 @@ int bpf_firewall_install(Unit *u) { * after attaching the new programs, so that there's no time window where neither program is * attached. (There will be a program where both are attached, but that's OK, since this is a * security feature where we rather want to lock down too much than too little */ - ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed); - ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed); + ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed); + ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed); } else { /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly * detach them) right before attaching the new program, to minimize the time window when we * don't account for IP traffic. */ - u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); - u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); + crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed); + crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed); } - if (u->ip_bpf_egress) { - r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); + if (crt->ip_bpf_egress) { + r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path); /* Remember that this BPF program is installed now. 
*/ - u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress); + crt->ip_bpf_egress_installed = TAKE_PTR(crt->ip_bpf_egress); } - if (u->ip_bpf_ingress) { - r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); + if (crt->ip_bpf_ingress) { + r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); if (r < 0) return log_unit_error_errno(u, r, "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path); - u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress); + crt->ip_bpf_ingress_installed = TAKE_PTR(crt->ip_bpf_ingress); } /* And now, definitely get rid of the old programs, and detach them */ ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall); ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall); - r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed); + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &crt->ip_bpf_custom_egress, &crt->ip_bpf_custom_egress_installed); if (r < 0) return r; - r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed); + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &crt->ip_bpf_custom_ingress, &crt->ip_bpf_custom_ingress_installed); if (r < 0) return r; @@ -954,21 +974,25 @@ void emit_bpf_firewall_warning(Unit *u) { void bpf_firewall_close(Unit *u) { assert(u); - u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd); - u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + + crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd); + crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd); - u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); - u->ipv6_allow_map_fd = 
safe_close(u->ipv6_allow_map_fd); - u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); - u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd); + crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd); + crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd); + crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd); - u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); - u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); - u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); - u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); + crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress); + crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed); + crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress); + crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed); - u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress); - u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress); - u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed); - u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed); + crt->ip_bpf_custom_ingress = set_free(crt->ip_bpf_custom_ingress); + crt->ip_bpf_custom_egress = set_free(crt->ip_bpf_custom_egress); + crt->ip_bpf_custom_ingress_installed = set_free(crt->ip_bpf_custom_ingress_installed); + crt->ip_bpf_custom_egress_installed = set_free(crt->ip_bpf_custom_egress_installed); } diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c index cff2f61..851cc42 100644 --- a/src/core/bpf-foreign.c +++ b/src/core/bpf-foreign.c @@ -45,8 +45,8 @@ static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeign } static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) { - siphash24_compress(&p->prog_id, sizeof(p->prog_id), h); - siphash24_compress(&p->attach_type, 
sizeof(p->attach_type), h); + siphash24_compress_typesafe(p->prog_id, h); + siphash24_compress_typesafe(p->attach_type, h); } DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops, @@ -81,6 +81,7 @@ static int bpf_foreign_prepare( Unit *u, enum bpf_attach_type attach_type, const char *bpffs_path) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; _cleanup_free_ BPFForeignKey *key = NULL; uint32_t prog_id; @@ -101,6 +102,11 @@ static int bpf_foreign_prepare( return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), "bpf-foreign: Path in BPF filesystem is expected."); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Failed to get control group runtime object."); + r = bpf_program_new_from_bpffs_path(bpffs_path, &prog); if (r < 0) return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m"); @@ -114,7 +120,7 @@ static int bpf_foreign_prepare( return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path); - r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog); + r = hashmap_ensure_put(&crt->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog); if (r == -EEXIST) { log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m"); return 0; @@ -131,6 +137,7 @@ static int bpf_foreign_prepare( int bpf_foreign_install(Unit *u) { _cleanup_free_ char *cgroup_path = NULL; CGroupContext *cc; + CGroupRuntime *crt; int r, ret = 0; assert(u); @@ -139,7 +146,11 @@ int bpf_foreign_install(Unit *u) { if (!cc) return 0; - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path); if (r < 0) return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m"); @@ -149,6 
+160,6 @@ int bpf_foreign_install(Unit *u) { ret = r; } - r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI); + r = attach_programs(u, cgroup_path, crt->bpf_foreign_by_key, BPF_F_ALLOW_MULTI); return ret < 0 ? ret : r; } diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c deleted file mode 100644 index 216fc34..0000000 --- a/src/core/bpf-lsm.c +++ /dev/null @@ -1,320 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "alloc-util.h" -#include "bpf-lsm.h" -#include "cgroup-util.h" -#include "fd-util.h" -#include "fileio.h" -#include "filesystems.h" -#include "log.h" -#include "lsm-util.h" -#include "manager.h" -#include "mkdir.h" -#include "nulstr-util.h" -#include "stat-util.h" -#include "strv.h" - -#if BPF_FRAMEWORK -/* libbpf, clang and llc compile time dependencies are satisfied */ -#include "bpf-dlopen.h" -#include "bpf-link.h" -#include "bpf-util.h" -#include "bpf/restrict_fs/restrict-fs-skel.h" - -#define CGROUP_HASH_SIZE_MAX 2048 - -static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) { - /* restrict_fs_bpf__destroy handles object == NULL case */ - (void) restrict_fs_bpf__destroy(obj); - - return NULL; -} - -DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free); - -static bool bpf_can_link_lsm_program(struct bpf_program *prog) { - _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; - - assert(prog); - - link = sym_bpf_program__attach_lsm(prog); - - /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory - * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence - * BPF_LSM_MAC attach type) is not supported. 
*/ - return sym_libbpf_get_error(link) == 0; -} - -static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) { - _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; - _cleanup_close_ int inner_map_fd = -EBADF; - int r; - - assert(ret_obj); - - obj = restrict_fs_bpf__open(); - if (!obj) - return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m"); - - /* TODO Maybe choose a number based on runtime information? */ - r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX); - assert(r <= 0); - if (r < 0) - return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m", - sym_bpf_map__name(obj->maps.cgroup_hash)); - - /* Dummy map to satisfy the verifier */ - inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL); - if (inner_map_fd < 0) - return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m"); - - r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd); - assert(r <= 0); - if (r < 0) - return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m"); - - r = restrict_fs_bpf__load(obj); - assert(r <= 0); - if (r < 0) - return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m"); - - *ret_obj = TAKE_PTR(obj); - - return 0; -} - -bool lsm_bpf_supported(bool initialize) { - _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; - static int supported = -1; - int r; - - if (supported >= 0) - return supported; - if (!initialize) - return false; - - if (!cgroup_bpf_supported()) - return (supported = false); - - r = lsm_supported("bpf"); - if (r < 0) { - log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m"); - return (supported = false); - } - if (r == 0) { - log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), - "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported"); - return (supported = false); - } - - r = prepare_restrict_fs_bpf(&obj); - if (r < 0) - 
return (supported = false); - - if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) { - log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), - "bpf-lsm: Failed to link program; assuming BPF LSM is not available"); - return (supported = false); - } - - return (supported = true); -} - -int lsm_bpf_setup(Manager *m) { - _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; - _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; - int r; - - assert(m); - - r = prepare_restrict_fs_bpf(&obj); - if (r < 0) - return r; - - link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems); - r = sym_libbpf_get_error(link); - if (r != 0) - return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m", - sym_bpf_program__name(obj->progs.restrict_filesystems)); - - log_info("bpf-lsm: LSM BPF program attached"); - - obj->links.restrict_filesystems = TAKE_PTR(link); - m->restrict_fs = TAKE_PTR(obj); - - return 0; -} - -int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) { - uint32_t dummy_value = 1, zero = 0; - const char *fs; - const statfs_f_type_t *magic; - int r; - - assert(filesystems); - assert(outer_map_fd >= 0); - - int inner_map_fd = compat_bpf_map_create( - BPF_MAP_TYPE_HASH, - NULL, - sizeof(uint32_t), - sizeof(uint32_t), - 128U, /* Should be enough for all filesystem types */ - NULL); - if (inner_map_fd < 0) - return log_error_errno(errno, "bpf-lsm: Failed to create inner BPF map: %m"); - - if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0) - return log_error_errno(errno, "bpf-lsm: Error populating BPF map: %m"); - - uint32_t allow = allow_list; - - /* Use key 0 to store whether this is an allow list or a deny list */ - if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0) - return log_error_errno(errno, "bpf-lsm: Error initializing map: %m"); - - SET_FOREACH(fs, filesystems) { - r = fs_type_from_string(fs, &magic); - 
if (r < 0) { - log_warning("bpf-lsm: Invalid filesystem name '%s', ignoring.", fs); - continue; - } - - log_debug("bpf-lsm: Restricting filesystem access to '%s'", fs); - - for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) { - if (magic[i] == 0) - break; - - if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) { - r = log_error_errno(errno, "bpf-lsm: Failed to update BPF map: %m"); - - if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0) - log_debug_errno(errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m"); - - return r; - } - } - } - - return 0; -} - -int lsm_bpf_cleanup(const Unit *u) { - assert(u); - assert(u->manager); - - /* If we never successfully detected support, there is nothing to clean up. */ - if (!lsm_bpf_supported(/* initialize = */ false)) - return 0; - - if (!u->manager->restrict_fs) - return 0; - - if (u->cgroup_id == 0) - return 0; - - int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash); - if (fd < 0) - return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m"); - - if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT) - return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m"); - - return 0; -} - -int lsm_bpf_map_restrict_fs_fd(Unit *unit) { - assert(unit); - assert(unit->manager); - - if (!unit->manager->restrict_fs) - return -ENOMEDIUM; - - return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash); -} - -void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { - restrict_fs_bpf__destroy(prog); -} -#else /* ! 
BPF_FRAMEWORK */ -bool lsm_bpf_supported(bool initialize) { - return false; -} - -int lsm_bpf_setup(Manager *m) { - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m"); -} - -int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) { - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m"); -} - -int lsm_bpf_cleanup(const Unit *u) { - return 0; -} - -int lsm_bpf_map_restrict_fs_fd(Unit *unit) { - return -ENOMEDIUM; -} - -void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { - return; -} -#endif - -int lsm_bpf_parse_filesystem( - const char *name, - Set **filesystems, - FilesystemParseFlags flags, - const char *unit, - const char *filename, - unsigned line) { - int r; - - assert(name); - assert(filesystems); - - if (name[0] == '@') { - const FilesystemSet *set; - - set = filesystem_set_find(name); - if (!set) { - log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0, - "bpf-lsm: Unknown filesystem group, ignoring: %s", name); - return 0; - } - - NULSTR_FOREACH(i, set->value) { - /* Call ourselves again, for the group to parse. Note that we downgrade logging here - * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table - * are our own problem, not a problem in user configuration data and we shouldn't - * pretend otherwise by complaining about them. */ - r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line); - if (r < 0) - return r; - } - } else { - /* If we previously wanted to forbid access to a filesystem and now - * we want to allow it, then remove it from the list. */ - if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) { - r = set_put_strdup(filesystems, name); - if (r == -ENOMEM) - return flags & FILESYSTEM_PARSE_LOG ? 
log_oom() : -ENOMEM; - if (r < 0 && r != -EEXIST) /* When already in set, ignore */ - return r; - } else - free(set_remove(*filesystems, name)); - } - - return 0; -} diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h deleted file mode 100644 index a6eda19..0000000 --- a/src/core/bpf-lsm.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ -#pragma once - -#include "hashmap.h" - -typedef enum FilesystemParseFlags { - FILESYSTEM_PARSE_INVERT = 1 << 0, - FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1, - FILESYSTEM_PARSE_LOG = 1 << 2, -} FilesystemParseFlags; - -typedef struct Unit Unit; -typedef struct Manager Manager; - -typedef struct restrict_fs_bpf restrict_fs_bpf; - -bool lsm_bpf_supported(bool initialize); -int lsm_bpf_setup(Manager *m); -int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list); -int lsm_bpf_cleanup(const Unit *u); -int lsm_bpf_map_restrict_fs_fd(Unit *u); -void lsm_bpf_destroy(struct restrict_fs_bpf *prog); -int lsm_bpf_parse_filesystem(const char *name, - Set **filesystems, - FilesystemParseFlags flags, - const char *unit, - const char *filename, - unsigned line); diff --git a/src/core/bpf-restrict-fs.c b/src/core/bpf-restrict-fs.c new file mode 100644 index 0000000..d36bfb5 --- /dev/null +++ b/src/core/bpf-restrict-fs.c @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-restrict-fs.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" +#include "log.h" +#include "lsm-util.h" +#include "manager.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "stat-util.h" +#include "strv.h" + +#if BPF_FRAMEWORK +/* libbpf, clang and llc compile time dependencies are satisfied */ +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include 
"bpf/restrict_fs/restrict-fs-skel.h" + +#define CGROUP_HASH_SIZE_MAX 2048 + +static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) { + /* restrict_fs_bpf__destroy handles object == NULL case */ + (void) restrict_fs_bpf__destroy(obj); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free); + +static bool bpf_can_link_lsm_program(struct bpf_program *prog) { + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + + assert(prog); + + link = sym_bpf_program__attach_lsm(prog); + + /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory + * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence + * BPF_LSM_MAC attach type) is not supported. */ + return bpf_get_error_translated(link) == 0; +} + +static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_close_ int inner_map_fd = -EBADF; + int r; + + assert(ret_obj); + + obj = restrict_fs_bpf__open(); + if (!obj) + return log_error_errno(errno, "bpf-restrict-fs: Failed to open BPF object: %m"); + + /* TODO Maybe choose a number based on runtime information? 
*/ + r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-restrict-fs: Failed to resize BPF map '%s': %m", + sym_bpf_map__name(obj->maps.cgroup_hash)); + + /* Dummy map to satisfy the verifier */ + inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL); + if (inner_map_fd < 0) + return log_error_errno(errno, "bpf-restrict-fs: Failed to create BPF map: %m"); + + r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-restrict-fs: Failed to set inner map fd: %m"); + + r = restrict_fs_bpf__load(obj); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-restrict-fs: Failed to load BPF object: %m"); + + *ret_obj = TAKE_PTR(obj); + + return 0; +} + +bool bpf_restrict_fs_supported(bool initialize) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + if (!initialize) + return false; + + if (!cgroup_bpf_supported()) + return (supported = false); + + r = lsm_supported("bpf"); + if (r < 0) { + log_warning_errno(r, "bpf-restrict-fs: Can't determine whether the BPF LSM module is used: %m"); + return (supported = false); + } + if (r == 0) { + log_info("bpf-restrict-fs: BPF LSM hook not enabled in the kernel, BPF LSM not supported."); + return (supported = false); + } + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return (supported = false); + + if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) { + log_warning("bpf-restrict-fs: Failed to link program; assuming BPF LSM is not available."); + return (supported = false); + } + + return (supported = true); +} + +int bpf_restrict_fs_setup(Manager *m) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + 
int r; + + assert(m); + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return r; + + link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems); + r = bpf_get_error_translated(link); + if (r != 0) + return log_error_errno(r, "bpf-restrict-fs: Failed to link '%s' LSM BPF program: %m", + sym_bpf_program__name(obj->progs.restrict_filesystems)); + + log_info("bpf-restrict-fs: LSM BPF program attached"); + + obj->links.restrict_filesystems = TAKE_PTR(link); + m->restrict_fs = TAKE_PTR(obj); + + return 0; +} + +int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) { + uint32_t dummy_value = 1, zero = 0; + const char *fs; + const statfs_f_type_t *magic; + int r; + + assert(filesystems); + assert(outer_map_fd >= 0); + + int inner_map_fd = compat_bpf_map_create( + BPF_MAP_TYPE_HASH, + NULL, + sizeof(uint32_t), + sizeof(uint32_t), + 128U, /* Should be enough for all filesystem types */ + NULL); + if (inner_map_fd < 0) + return log_error_errno(errno, "bpf-restrict-fs: Failed to create inner BPF map: %m"); + + if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0) + return log_error_errno(errno, "bpf-restrict-fs: Error populating BPF map: %m"); + + uint32_t allow = allow_list; + + /* Use key 0 to store whether this is an allow list or a deny list */ + if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0) + return log_error_errno(errno, "bpf-restrict-fs: Error initializing map: %m"); + + SET_FOREACH(fs, filesystems) { + r = fs_type_from_string(fs, &magic); + if (r < 0) { + log_warning("bpf-restrict-fs: Invalid filesystem name '%s', ignoring.", fs); + continue; + } + + log_debug("bpf-restrict-fs: Restricting filesystem access to '%s'", fs); + + for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) { + if (magic[i] == 0) + break; + + if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) { + r = log_error_errno(errno, "bpf-restrict-fs: Failed to 
update BPF map: %m"); + + if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0) + log_debug_errno(errno, "bpf-restrict-fs: Failed to delete cgroup entry from BPF map: %m"); + + return r; + } + } + } + + return 0; +} + +int bpf_restrict_fs_cleanup(Unit *u) { + CGroupRuntime *crt; + + assert(u); + assert(u->manager); + + /* If we never successfully detected support, there is nothing to clean up. */ + if (!bpf_restrict_fs_supported(/* initialize = */ false)) + return 0; + + if (!u->manager->restrict_fs) + return 0; + + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + if (crt->cgroup_id == 0) + return 0; + + int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash); + if (fd < 0) + return log_unit_error_errno(u, errno, "bpf-restrict-fs: Failed to get BPF map fd: %m"); + + if (sym_bpf_map_delete_elem(fd, &crt->cgroup_id) != 0 && errno != ENOENT) + return log_unit_debug_errno(u, errno, "bpf-restrict-fs: Failed to delete cgroup entry from LSM BPF map: %m"); + + return 0; +} + +int bpf_restrict_fs_map_fd(Unit *unit) { + assert(unit); + assert(unit->manager); + + if (!unit->manager->restrict_fs) + return -ENOMEDIUM; + + return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash); +} + +void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog) { + restrict_fs_bpf__destroy(prog); +} +#else /* ! 
BPF_FRAMEWORK */ +bool bpf_restrict_fs_supported(bool initialize) { + return false; +} + +int bpf_restrict_fs_setup(Manager *m) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-restrict-fs: BPF framework is not supported."); +} + +int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-restrict-fs: BPF framework is not supported."); +} + +int bpf_restrict_fs_cleanup(Unit *u) { + return 0; +} + +int bpf_restrict_fs_map_fd(Unit *unit) { + return -ENOMEDIUM; +} + +void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog) { + return; +} +#endif + +int bpf_restrict_fs_parse_filesystem( + const char *name, + Set **filesystems, + FilesystemParseFlags flags, + const char *unit, + const char *filename, + unsigned line) { + int r; + + assert(name); + assert(filesystems); + + if (name[0] == '@') { + const FilesystemSet *set; + + set = filesystem_set_find(name); + if (!set) { + log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "bpf-restrict-fs: Unknown filesystem group, ignoring: %s", name); + return 0; + } + + NULSTR_FOREACH(i, set->value) { + /* Call ourselves again, for the group to parse. Note that we downgrade logging here + * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table + * are our own problem, not a problem in user configuration data and we shouldn't + * pretend otherwise by complaining about them. */ + r = bpf_restrict_fs_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line); + if (r < 0) + return r; + } + } else { + /* If we previously wanted to forbid access to a filesystem and now + * we want to allow it, then remove it from the list. */ + if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) { + r = set_put_strdup(filesystems, name); + if (r == -ENOMEM) + return flags & FILESYSTEM_PARSE_LOG ? 
log_oom() : -ENOMEM; + if (r < 0 && r != -EEXIST) /* When already in set, ignore */ + return r; + } else + free(set_remove(*filesystems, name)); + } + + return 0; +} diff --git a/src/core/bpf-restrict-fs.h b/src/core/bpf-restrict-fs.h new file mode 100644 index 0000000..8da12de --- /dev/null +++ b/src/core/bpf-restrict-fs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" + +typedef enum FilesystemParseFlags { + FILESYSTEM_PARSE_INVERT = 1 << 0, + FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1, + FILESYSTEM_PARSE_LOG = 1 << 2, +} FilesystemParseFlags; + +typedef struct Unit Unit; +typedef struct Manager Manager; + +typedef struct restrict_fs_bpf restrict_fs_bpf; + +bool bpf_restrict_fs_supported(bool initialize); +int bpf_restrict_fs_setup(Manager *m); +int bpf_restrict_fs_update(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list); +int bpf_restrict_fs_cleanup(Unit *u); +int bpf_restrict_fs_map_fd(Unit *u); +void bpf_restrict_fs_destroy(struct restrict_fs_bpf *prog); +int bpf_restrict_fs_parse_filesystem(const char *name, Set **filesystems, FilesystemParseFlags flags, const char *unit, const char *filename, unsigned line); diff --git a/src/core/bpf-restrict-ifaces.c b/src/core/bpf-restrict-ifaces.c new file mode 100644 index 0000000..64d8d1a --- /dev/null +++ b/src/core/bpf-restrict-ifaces.c @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "bpf-restrict-ifaces.h" +#include "netlink-util.h" + +#if BPF_FRAMEWORK +/* libbpf, clang and llc compile time dependencies are satisfied */ + +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/restrict_ifaces/restrict-ifaces-skel.h" + +static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) { + restrict_ifaces_bpf__destroy(obj); + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free); + 
+static int prepare_restrict_ifaces_bpf( + Unit* u, + bool is_allow_list, + const Set *restrict_network_interfaces, + struct restrict_ifaces_bpf **ret_object) { + + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + char *iface; + int r, map_fd; + + assert(ret_object); + + obj = restrict_ifaces_bpf__open(); + if (!obj) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m"); + + r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u)); + if (r != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "restrict-interfaces: Failed to resize BPF map '%s': %m", + sym_bpf_map__name(obj->maps.sd_restrictif)); + + obj->rodata->is_allow_list = is_allow_list; + + r = restrict_ifaces_bpf__load(obj); + if (r != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m"); + + map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif); + + SET_FOREACH(iface, restrict_network_interfaces) { + uint8_t dummy = 0; + int ifindex; + + ifindex = rtnl_resolve_interface(&rtnl, iface); + if (ifindex < 0) { + log_unit_warning_errno(u, ifindex, + "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m", + iface); + continue; + } + + if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY)) + return log_unit_full_errno(u, u ? 
LOG_ERR : LOG_WARNING, errno, + "restrict-interfaces: Failed to update BPF map '%s' fd: %m", + sym_bpf_map__name(obj->maps.sd_restrictif)); + } + + *ret_object = TAKE_PTR(obj); + return 0; +} + +int bpf_restrict_ifaces_supported(void) { + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + + if (!cgroup_bpf_supported()) + return (supported = false); + + if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) { + log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported"); + return (supported = false); + } + + r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj); + if (r < 0) { + log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m"); + return (supported = false); + } + + return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i)); +} + +static int restrict_ifaces_install_impl(Unit *u) { + _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL; + _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; + _cleanup_free_ char *cgroup_path = NULL; + _cleanup_close_ int cgroup_fd = -EBADF; + CGroupContext *cc; + CGroupRuntime *crt; + int r; + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m"); + + if (!cc->restrict_network_interfaces) + return 0; + + r = prepare_restrict_ifaces_bpf(u, + cc->restrict_network_interfaces_is_allow_list, + cc->restrict_network_interfaces, + &obj); + if (r < 0) + return r; + + cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0); + if (cgroup_fd < 0) + return -errno; + + ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, 
cgroup_fd); + r = bpf_get_error_translated(ingress_link); + if (r != 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m"); + + egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd); + r = bpf_get_error_translated(egress_link); + if (r != 0) + return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m"); + + crt->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link); + crt->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link); + + return 0; +} + +int bpf_restrict_ifaces_install(Unit *u) { + CGroupRuntime *crt; + int r; + + assert(u); + + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = restrict_ifaces_install_impl(u); + fdset_close(crt->initial_restrict_ifaces_link_fds); + return r; +} + +int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds) { + CGroupRuntime *crt; + int r; + + assert(u); + + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", crt->restrict_ifaces_ingress_bpf_link); + if (r < 0) + return r; + + return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", crt->restrict_ifaces_egress_bpf_link); +} + +int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd) { + int r; + + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return -EINVAL; + + if (!crt->initial_restrict_ifaces_link_fds) { + crt->initial_restrict_ifaces_link_fds = fdset_new(); + if (!crt->initial_restrict_ifaces_link_fds) + return log_oom(); + } + + r = fdset_put(crt->initial_restrict_ifaces_link_fds, fd); + if (r < 0) + return log_unit_error_errno(u, r, + "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd); + + return 0; +} + +#else /* ! 
BPF_FRAMEWORK */ +int bpf_restrict_ifaces_supported(void) { + return 0; +} + +int bpf_restrict_ifaces_install(Unit *u) { + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m"); +} + +int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds) { + return 0; +} + +int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd) { + return 0; +} +#endif diff --git a/src/core/bpf-restrict-ifaces.h b/src/core/bpf-restrict-ifaces.h new file mode 100644 index 0000000..28f7427 --- /dev/null +++ b/src/core/bpf-restrict-ifaces.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "fdset.h" +#include "unit.h" + +typedef struct Unit Unit; + +int bpf_restrict_ifaces_supported(void); +int bpf_restrict_ifaces_install(Unit *u); + +int bpf_restrict_ifaces_serialize(Unit *u, FILE *f, FDSet *fds); + +/* Add BPF link fd created before daemon-reload or daemon-reexec. + * FDs will be closed at the end of restrict_network_interfaces_install. 
*/ +int bpf_restrict_ifaces_add_initial_link_fd(Unit *u, int fd); diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c index 88ab487..2a1a027 100644 --- a/src/core/bpf-socket-bind.c +++ b/src/core/bpf-socket-bind.c @@ -148,13 +148,18 @@ int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { assert(u); - if (!u->initial_socket_bind_link_fds) { - u->initial_socket_bind_link_fds = fdset_new(); - if (!u->initial_socket_bind_link_fds) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "Failed to get control group runtime object."); + + if (!crt->initial_socket_bind_link_fds) { + crt->initial_socket_bind_link_fds = fdset_new(); + if (!crt->initial_socket_bind_link_fds) return log_oom(); } - r = fdset_put(u->initial_socket_bind_link_fds, fd); + r = fdset_put(crt->initial_socket_bind_link_fds, fd); if (r < 0) return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd); @@ -167,6 +172,7 @@ static int socket_bind_install_impl(Unit *u) { _cleanup_free_ char *cgroup_path = NULL; _cleanup_close_ int cgroup_fd = -EBADF; CGroupContext *cc; + CGroupRuntime *crt; int r; assert(u); @@ -175,7 +181,11 @@ static int socket_bind_install_impl(Unit *u) { if (!cc) return 0; - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_path); if (r < 0) return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m"); @@ -191,46 +201,53 @@ static int socket_bind_install_impl(Unit *u) { return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path); ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd); - r = sym_libbpf_get_error(ipv4); + r = bpf_get_error_translated(ipv4); if (r != 0) return log_unit_error_errno(u, r, 
"bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", sym_bpf_program__name(obj->progs.sd_bind4)); ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd); - r = sym_libbpf_get_error(ipv6); + r = bpf_get_error_translated(ipv6); if (r != 0) return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", sym_bpf_program__name(obj->progs.sd_bind6)); - u->ipv4_socket_bind_link = TAKE_PTR(ipv4); - u->ipv6_socket_bind_link = TAKE_PTR(ipv6); + crt->ipv4_socket_bind_link = TAKE_PTR(ipv4); + crt->ipv6_socket_bind_link = TAKE_PTR(ipv6); return 0; } int bpf_socket_bind_install(Unit *u) { + CGroupRuntime *crt; int r; assert(u); - r = socket_bind_install_impl(u); - if (r == -ENOMEM) - return r; + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; - fdset_close(u->initial_socket_bind_link_fds); + r = socket_bind_install_impl(u); + fdset_close(crt->initial_socket_bind_link_fds); return r; } -int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { +int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds) { + CGroupRuntime *crt; int r; assert(u); - r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link); + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", crt->ipv4_socket_bind_link); if (r < 0) return r; - return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link); + return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", crt->ipv6_socket_bind_link); } #else /* ! 
BPF_FRAMEWORK */ @@ -247,7 +264,7 @@ int bpf_socket_bind_install(Unit *u) { "bpf-socket-bind: Failed to install; BPF framework is not supported"); } -int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { +int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds) { return 0; } #endif diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h index 7d426df..28b25f6 100644 --- a/src/core/bpf-socket-bind.h +++ b/src/core/bpf-socket-bind.h @@ -12,4 +12,4 @@ int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd); int bpf_socket_bind_install(Unit *u); -int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds); +int bpf_socket_bind_serialize(Unit *u, FILE *f, FDSet *fds); diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c index 6fe229e..b337ba9 100644 --- a/src/core/bpf-util.c +++ b/src/core/bpf-util.c @@ -20,8 +20,7 @@ bool cgroup_bpf_supported(void) { } if (r == 0) { - log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), - "Not running with unified cgroup hierarchy, disabling cgroup BPF features."); + log_info("Not running with unified cgroup hierarchy, disabling cgroup BPF features."); return (supported = false); } diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 61ac4df..34fd2a2 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -10,6 +10,7 @@ #include "bpf-devices.h" #include "bpf-firewall.h" #include "bpf-foreign.h" +#include "bpf-restrict-ifaces.h" #include "bpf-socket-bind.h" #include "btrfs-util.h" #include "bus-error.h" @@ -32,7 +33,8 @@ #include "percent-util.h" #include "process-util.h" #include "procfs-util.h" -#include "restrict-ifaces.h" +#include "set.h" +#include "serialize.h" #include "special.h" #include "stdio-util.h" #include "string-table.h" @@ -115,10 +117,16 @@ bool unit_has_host_root_cgroup(Unit *u) { static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) { int r; - r = cg_set_attribute(controller, u->cgroup_path, attribute, value); + assert(u); + + 
CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; + + r = cg_set_attribute(controller, crt->cgroup_path, attribute, value); if (r < 0) log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m", - strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value); + strna(attribute), empty_to_root(crt->cgroup_path), (int) strcspn(value, NEWLINE), value); return r; } @@ -172,6 +180,8 @@ void cgroup_context_init(CGroupContext *c) { .memory_limit = CGROUP_LIMIT_MAX, + .memory_zswap_writeback = true, + .io_weight = CGROUP_WEIGHT_INVALID, .startup_io_weight = CGROUP_WEIGHT_INVALID, @@ -189,6 +199,319 @@ void cgroup_context_init(CGroupContext *c) { }; } +int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w) { + _cleanup_free_ CGroupIODeviceWeight *n = NULL; + + assert(c); + assert(w); + + n = new(CGroupIODeviceWeight, 1); + if (!n) + return -ENOMEM; + + *n = (CGroupIODeviceWeight) { + .path = strdup(w->path), + .weight = w->weight, + }; + if (!n->path) + return -ENOMEM; + + LIST_PREPEND(device_weights, c->io_device_weights, TAKE_PTR(n)); + return 0; +} + +int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l) { + _cleanup_free_ CGroupIODeviceLimit *n = NULL; + + assert(c); + assert(l); + + n = new0(CGroupIODeviceLimit, 1); + if (!n) + return -ENOMEM; + + n->path = strdup(l->path); + if (!n->path) + return -ENOMEM; + + for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) + n->limits[type] = l->limits[type]; + + LIST_PREPEND(device_limits, c->io_device_limits, TAKE_PTR(n)); + return 0; +} + +int cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l) { + _cleanup_free_ CGroupIODeviceLatency *n = NULL; + + assert(c); + assert(l); + + n = new(CGroupIODeviceLatency, 1); + if (!n) + return -ENOMEM; + + *n = 
(CGroupIODeviceLatency) { + .path = strdup(l->path), + .target_usec = l->target_usec, + }; + if (!n->path) + return -ENOMEM; + + LIST_PREPEND(device_latencies, c->io_device_latencies, TAKE_PTR(n)); + return 0; +} + +int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w) { + _cleanup_free_ CGroupBlockIODeviceWeight *n = NULL; + + assert(c); + assert(w); + + n = new(CGroupBlockIODeviceWeight, 1); + if (!n) + return -ENOMEM; + + *n = (CGroupBlockIODeviceWeight) { + .path = strdup(w->path), + .weight = w->weight, + }; + if (!n->path) + return -ENOMEM; + + LIST_PREPEND(device_weights, c->blockio_device_weights, TAKE_PTR(n)); + return 0; +} + +/* Prepends a deep copy of bandwidth entry 'b' to c->blockio_device_bandwidths. Returns 0 on success, -ENOMEM on allocation failure. */ +int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b) { + _cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL; + + assert(c); + assert(b); + + n = new(CGroupBlockIODeviceBandwidth, 1); + if (!n) + return -ENOMEM; + + /* Duplicate the device path as the sibling *_dup() helpers do; without it the copied entry names no device. */ + *n = (CGroupBlockIODeviceBandwidth) { + .path = strdup(b->path), + .rbps = b->rbps, + .wbps = b->wbps, + }; + if (!n->path) + return -ENOMEM; + + LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, TAKE_PTR(n)); + return 0; +} + +int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a) { + _cleanup_free_ CGroupDeviceAllow *n = NULL; + + assert(c); + assert(a); + + n = new(CGroupDeviceAllow, 1); + if (!n) + return -ENOMEM; + + *n = (CGroupDeviceAllow) { + .path = strdup(a->path), + .permissions = a->permissions, + }; + if (!n->path) + return -ENOMEM; + + LIST_PREPEND(device_allow, c->device_allow, TAKE_PTR(n)); + return 0; +} + +/* Prepends a copy of socket bind item 'i' to the list whose head pointer is '*h'. The head is taken by reference because LIST_PREPEND() assigns to it: with a by-value head the context's list stayed unchanged and the new item leaked. Returns 0 on success, -ENOMEM on allocation failure. */ +static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, const CGroupSocketBindItem *i, CGroupSocketBindItem **h) { + _cleanup_free_ CGroupSocketBindItem *n = NULL; + + assert(c); + assert(i); + assert(h); + + n = new(CGroupSocketBindItem, 1); + if (!n) + return -ENOMEM; + + *n = (CGroupSocketBindItem) { + .address_family = i->address_family, + .ip_protocol = i->ip_protocol, + .nr_ports = i->nr_ports, + .port_min
= i->port_min, + }; + + LIST_PREPEND(socket_bind_items, *h, TAKE_PTR(n)); + return 0; +} + +int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i) { + return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_allow); +} + +int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i) { + return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_deny); +} + +int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src) { + struct in_addr_prefix *i; + char *iface; + int r; + + assert(src); + assert(dst); + + dst->cpu_accounting = src->cpu_accounting; + dst->io_accounting = src->io_accounting; + dst->blockio_accounting = src->blockio_accounting; + dst->memory_accounting = src->memory_accounting; + dst->tasks_accounting = src->tasks_accounting; + dst->ip_accounting = src->ip_accounting; + + dst->memory_oom_group = src->memory_oom_group; + + dst->cpu_weight = src->cpu_weight; + dst->startup_cpu_weight = src->startup_cpu_weight; + dst->cpu_quota_per_sec_usec = src->cpu_quota_per_sec_usec; + dst->cpu_quota_period_usec = src->cpu_quota_period_usec; + + dst->cpuset_cpus = src->cpuset_cpus; + dst->startup_cpuset_cpus = src->startup_cpuset_cpus; + dst->cpuset_mems = src->cpuset_mems; + dst->startup_cpuset_mems = src->startup_cpuset_mems; + + dst->io_weight = src->io_weight; + dst->startup_io_weight = src->startup_io_weight; + + LIST_FOREACH_BACKWARDS(device_weights, w, LIST_FIND_TAIL(device_weights, src->io_device_weights)) { + r = cgroup_context_add_io_device_weight_dup(dst, w); + if (r < 0) + return r; + } + + LIST_FOREACH_BACKWARDS(device_limits, l, LIST_FIND_TAIL(device_limits, src->io_device_limits)) { + r = cgroup_context_add_io_device_limit_dup(dst, l); + if (r < 0) + return r; + } + + LIST_FOREACH_BACKWARDS(device_latencies, l, LIST_FIND_TAIL(device_latencies, src->io_device_latencies)) { + r = cgroup_context_add_io_device_latency_dup(dst, l); + if (r < 0) + 
return r; + } + + dst->default_memory_min = src->default_memory_min; + dst->default_memory_low = src->default_memory_low; + dst->default_startup_memory_low = src->default_startup_memory_low; + dst->memory_min = src->memory_min; + dst->memory_low = src->memory_low; + dst->startup_memory_low = src->startup_memory_low; + dst->memory_high = src->memory_high; + dst->startup_memory_high = src->startup_memory_high; + dst->memory_max = src->memory_max; + dst->startup_memory_max = src->startup_memory_max; + dst->memory_swap_max = src->memory_swap_max; + dst->startup_memory_swap_max = src->startup_memory_swap_max; + dst->memory_zswap_max = src->memory_zswap_max; + dst->startup_memory_zswap_max = src->startup_memory_zswap_max; + + dst->default_memory_min_set = src->default_memory_min_set; + dst->default_memory_low_set = src->default_memory_low_set; + dst->default_startup_memory_low_set = src->default_startup_memory_low_set; + dst->memory_min_set = src->memory_min_set; + dst->memory_low_set = src->memory_low_set; + dst->startup_memory_low_set = src->startup_memory_low_set; + dst->startup_memory_high_set = src->startup_memory_high_set; + dst->startup_memory_max_set = src->startup_memory_max_set; + dst->startup_memory_swap_max_set = src->startup_memory_swap_max_set; + dst->startup_memory_zswap_max_set = src->startup_memory_zswap_max_set; + dst->memory_zswap_writeback = src->memory_zswap_writeback; + + SET_FOREACH(i, src->ip_address_allow) { + r = in_addr_prefix_add(&dst->ip_address_allow, i); + if (r < 0) + return r; + } + + SET_FOREACH(i, src->ip_address_deny) { + r = in_addr_prefix_add(&dst->ip_address_deny, i); + if (r < 0) + return r; + } + + dst->ip_address_allow_reduced = src->ip_address_allow_reduced; + dst->ip_address_deny_reduced = src->ip_address_deny_reduced; + + if (!strv_isempty(src->ip_filters_ingress)) { + dst->ip_filters_ingress = strv_copy(src->ip_filters_ingress); + if (!dst->ip_filters_ingress) + return -ENOMEM; + } + + if 
(!strv_isempty(src->ip_filters_egress)) { + dst->ip_filters_egress = strv_copy(src->ip_filters_egress); + if (!dst->ip_filters_egress) + return -ENOMEM; + } + + LIST_FOREACH_BACKWARDS(programs, l, LIST_FIND_TAIL(programs, src->bpf_foreign_programs)) { + r = cgroup_context_add_bpf_foreign_program_dup(dst, l); + if (r < 0) + return r; + } + + SET_FOREACH(iface, src->restrict_network_interfaces) { + r = set_put_strdup(&dst->restrict_network_interfaces, iface); + if (r < 0) + return r; + } + dst->restrict_network_interfaces_is_allow_list = src->restrict_network_interfaces_is_allow_list; + + dst->cpu_shares = src->cpu_shares; + dst->startup_cpu_shares = src->startup_cpu_shares; + + dst->blockio_weight = src->blockio_weight; + dst->startup_blockio_weight = src->startup_blockio_weight; + + LIST_FOREACH_BACKWARDS(device_weights, l, LIST_FIND_TAIL(device_weights, src->blockio_device_weights)) { + r = cgroup_context_add_block_io_device_weight_dup(dst, l); + if (r < 0) + return r; + } + + LIST_FOREACH_BACKWARDS(device_bandwidths, l, LIST_FIND_TAIL(device_bandwidths, src->blockio_device_bandwidths)) { + r = cgroup_context_add_block_io_device_bandwidth_dup(dst, l); + if (r < 0) + return r; + } + + dst->memory_limit = src->memory_limit; + + dst->device_policy = src->device_policy; + LIST_FOREACH_BACKWARDS(device_allow, l, LIST_FIND_TAIL(device_allow, src->device_allow)) { + r = cgroup_context_add_device_allow_dup(dst, l); + if (r < 0) + return r; + } + + LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_allow)) { + r = cgroup_context_add_socket_bind_item_allow_dup(dst, l); + if (r < 0) + return r; + + } + + LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_deny)) { + r = cgroup_context_add_socket_bind_item_deny_dup(dst, l); + if (r < 0) + return r; + } + + dst->tasks_max = src->tasks_max; + + return 0; +} + void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) { 
assert(c); assert(a); @@ -306,10 +629,11 @@ void cgroup_context_done(CGroupContext *c) { static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { assert(u); - if (!u->cgroup_realized) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -EOWNERDEAD; - return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret); + return cg_get_attribute_as_uint64("memory", crt->cgroup_path, file, ret); } static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) { @@ -425,11 +749,12 @@ static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_ #define FORMAT_CGROUP_DIFF_MAX 128 -static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) { +static char *format_cgroup_memory_limit_comparison(Unit *u, const char *property_name, char *buf, size_t l) { uint64_t kval, sval; int r; assert(u); + assert(property_name); assert(buf); assert(l > 0); @@ -499,18 +824,9 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL; CGroupContext *c; struct in_addr_prefix *iaai; - - char cda[FORMAT_CGROUP_DIFF_MAX]; - char cdb[FORMAT_CGROUP_DIFF_MAX]; - char cdc[FORMAT_CGROUP_DIFF_MAX]; - char cdd[FORMAT_CGROUP_DIFF_MAX]; - char cde[FORMAT_CGROUP_DIFF_MAX]; - char cdf[FORMAT_CGROUP_DIFF_MAX]; - char cdg[FORMAT_CGROUP_DIFF_MAX]; - char cdh[FORMAT_CGROUP_DIFF_MAX]; - char cdi[FORMAT_CGROUP_DIFF_MAX]; - char cdj[FORMAT_CGROUP_DIFF_MAX]; - char cdk[FORMAT_CGROUP_DIFF_MAX]; + char cda[FORMAT_CGROUP_DIFF_MAX], cdb[FORMAT_CGROUP_DIFF_MAX], cdc[FORMAT_CGROUP_DIFF_MAX], cdd[FORMAT_CGROUP_DIFF_MAX], + cde[FORMAT_CGROUP_DIFF_MAX], cdf[FORMAT_CGROUP_DIFF_MAX], cdg[FORMAT_CGROUP_DIFF_MAX], 
cdh[FORMAT_CGROUP_DIFF_MAX], + cdi[FORMAT_CGROUP_DIFF_MAX], cdj[FORMAT_CGROUP_DIFF_MAX], cdk[FORMAT_CGROUP_DIFF_MAX]; assert(u); assert(f); @@ -564,6 +880,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sStartupMemorySwapMax: %" PRIu64 "%s\n" "%sMemoryZSwapMax: %" PRIu64 "%s\n" "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n" + "%sMemoryZSwapWriteback: %s\n" "%sMemoryLimit: %" PRIu64 "\n" "%sTasksMax: %" PRIu64 "\n" "%sDevicePolicy: %s\n" @@ -597,17 +914,18 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, c->startup_blockio_weight, prefix, c->default_memory_min, prefix, c->default_memory_low, - prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"), - prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"), - prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"), - prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"), - prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"), - prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"), - prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"), - prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"), - prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"), - prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"), - prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"), + prefix, c->memory_min, format_cgroup_memory_limit_comparison(u, "MemoryMin", cda, sizeof(cda)), + prefix, c->memory_low, 
format_cgroup_memory_limit_comparison(u, "MemoryLow", cdb, sizeof(cdb)), + prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(u, "StartupMemoryLow", cdc, sizeof(cdc)), + prefix, c->memory_high, format_cgroup_memory_limit_comparison(u, "MemoryHigh", cdd, sizeof(cdd)), + prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(u, "StartupMemoryHigh", cde, sizeof(cde)), + prefix, c->memory_max, format_cgroup_memory_limit_comparison(u, "MemoryMax", cdf, sizeof(cdf)), + prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryMax", cdg, sizeof(cdg)), + prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(u, "MemorySwapMax", cdh, sizeof(cdh)), + prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(u, "StartupMemorySwapMax", cdi, sizeof(cdi)), + prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(u, "MemoryZSwapMax", cdj, sizeof(cdj)), + prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(u, "StartupMemoryZSwapMax", cdk, sizeof(cdk)), + prefix, yes_no(c->memory_zswap_writeback), prefix, c->memory_limit, prefix, cgroup_tasks_max_resolve(&c->tasks_max), prefix, cgroup_device_policy_to_string(c->device_policy), @@ -811,7 +1129,7 @@ int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_typ assert(bpffs_path); if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path)) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m"); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized."); d = strdup(bpffs_path); if (!d) @@ -867,12 +1185,13 @@ static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, assert(u); assert(name); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return; - r = cg_set_xattr(u->cgroup_path, name, data, size, 0); + r = cg_set_xattr(crt->cgroup_path, name, data, size, 0); if (r 
< 0) - log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path)); + log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path)); } static void unit_remove_xattr_graceful(Unit *u, const char *name) { @@ -881,12 +1200,13 @@ static void unit_remove_xattr_graceful(Unit *u, const char *name) { assert(u); assert(name); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return; - r = cg_remove_xattr(u->cgroup_path, name); + r = cg_remove_xattr(crt->cgroup_path, name); if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) - log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path)); + log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(crt->cgroup_path)); } static void cgroup_oomd_xattr_apply(Unit *u) { @@ -1013,9 +1333,13 @@ static void cgroup_survive_xattr_apply(Unit *u) { assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + if (u->survive_final_kill_signal) { r = cg_set_xattr( - u->cgroup_path, + crt->cgroup_path, "user.survive_final_kill_signal", "1", 1, @@ -1023,7 +1347,7 @@ static void cgroup_survive_xattr_apply(Unit *u) { /* user xattr support was added in kernel v5.7 */ if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) r = cg_set_xattr( - u->cgroup_path, + crt->cgroup_path, "trusted.survive_final_kill_signal", "1", 1, @@ -1033,7 +1357,7 @@ static void cgroup_survive_xattr_apply(Unit *u) { r, "Failed to set 'survive_final_kill_signal' xattr on control " "group %s, ignoring: %m", - empty_to_root(u->cgroup_path)); + empty_to_root(crt->cgroup_path)); } else { unit_remove_xattr_graceful(u, "user.survive_final_kill_signal"); unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal"); @@ -1170,6 +1494,12 @@ usec_t 
cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) { usec_t new_period; + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return USEC_INFINITY; + if (quota == USEC_INFINITY) /* Always use default period for infinity quota. */ return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC; @@ -1182,10 +1512,10 @@ static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t qu new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC); if (new_period != period) { - log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, + log_unit_full(u, crt->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, "Clamping CPU interval for cpu.max: period is now %s", FORMAT_TIMESPAN(new_period, 1)); - u->warned_clamping_cpu_quota_period = true; + crt->warned_clamping_cpu_quota_period = true; } return new_period; @@ -1205,17 +1535,25 @@ static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) { bool is_idle; const char *idle_val; + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + is_idle = weight == CGROUP_WEIGHT_IDLE; idle_val = one_zero(is_idle); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val); + r = cg_set_attribute("cpu", crt->cgroup_path, "cpu.idle", idle_val); if (r < 0 && (r != -ENOENT || is_idle)) log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m", - "cpu.idle", empty_to_root(u->cgroup_path), idle_val); + "cpu.idle", empty_to_root(crt->cgroup_path), idle_val); } static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) { char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1]; + assert(u); + period = cgroup_cpu_adjust_period_and_log(u, period, quota); if (quota != USEC_INFINITY) xsprintf(buf, USEC_FMT " " USEC_FMT "\n", @@ -1331,6 
+1669,12 @@ static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t i uint64_t bfq_weight; int r; + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; + /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight" * See also: https://github.com/systemd/systemd/pull/13335 and * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */ @@ -1343,7 +1687,7 @@ static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t i else xsprintf(buf, "%" PRIu64 "\n", bfq_weight); - r = cg_set_attribute(controller, u->cgroup_path, p, buf); + r = cg_set_attribute(controller, crt->cgroup_path, p, buf); /* FIXME: drop this when kernels prior * 795fe54c2a82 ("bfq: Add per-device weight") v5.4 @@ -1367,13 +1711,19 @@ static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_ dev_t dev; int r, r1, r2; + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + if (lookup_block_device(dev_path, &dev) < 0) return; r1 = set_bfq_weight(u, "io", dev, io_weight); xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight); - r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf); + r2 = cg_set_attribute("io", crt->cgroup_path, "io.weight", buf); /* Look at the configured device, when both fail, prefer io.weight errno. */ r = r2 == -EOPNOTSUPP ? 
r1 : r2; @@ -1381,7 +1731,7 @@ static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_ if (r < 0) log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m", - empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf); + empty_to_root(crt->cgroup_path), (int) strcspn(buf, NEWLINE), buf); } static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) { @@ -1498,7 +1848,8 @@ void unit_modify_nft_set(Unit *u, bool add) { if (cg_all_unified() <= 0) return; - if (u->cgroup_id == 0) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || crt->cgroup_id == 0) return; if (!u->manager->fw_ctx) { @@ -1515,15 +1866,15 @@ void unit_modify_nft_set(Unit *u, bool add) { if (nft_set->source != NFT_SET_SOURCE_CGROUP) continue; - uint64_t element = u->cgroup_id; + uint64_t element = crt->cgroup_id; r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element)); if (r < 0) log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m", - add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id); + add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id); else log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64, - add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id); + add? 
"Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, crt->cgroup_id); } } @@ -1536,18 +1887,20 @@ static void cgroup_apply_socket_bind(Unit *u) { static void cgroup_apply_restrict_network_interfaces(Unit *u) { assert(u); - (void) restrict_network_interfaces_install(u); + (void) bpf_restrict_ifaces_install(u); } static int cgroup_apply_devices(Unit *u) { _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; - const char *path; CGroupContext *c; CGroupDevicePolicy policy; int r; assert_se(c = unit_get_cgroup_context(u)); - assert_se(path = u->cgroup_path); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; policy = c->device_policy; @@ -1561,9 +1914,9 @@ static int cgroup_apply_devices(Unit *u) { * EINVAL here. */ if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO) - r = cg_set_attribute("devices", path, "devices.deny", "a"); + r = cg_set_attribute("devices", crt->cgroup_path, "devices.deny", "a"); else - r = cg_set_attribute("devices", path, "devices.allow", "a"); + r = cg_set_attribute("devices", crt->cgroup_path, "devices.allow", "a"); if (r < 0) log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? 
LOG_DEBUG : LOG_WARNING, r, "Failed to reset devices.allow/devices.deny: %m"); @@ -1571,10 +1924,14 @@ static int cgroup_apply_devices(Unit *u) { bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED || (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow); - if (allow_list_static) - (void) bpf_devices_allow_list_static(prog, path); - bool any = allow_list_static; + bool any = false; + if (allow_list_static) { + r = bpf_devices_allow_list_static(prog, crt->cgroup_path); + if (r > 0) + any = true; + } + LIST_FOREACH(device_allow, a, c->device_allow) { const char *val; @@ -1582,22 +1939,22 @@ static int cgroup_apply_devices(Unit *u) { continue; if (path_startswith(a->path, "/dev/")) - r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions); + r = bpf_devices_allow_list_device(prog, crt->cgroup_path, a->path, a->permissions); else if ((val = startswith(a->path, "block-"))) - r = bpf_devices_allow_list_major(prog, path, val, 'b', a->permissions); + r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'b', a->permissions); else if ((val = startswith(a->path, "char-"))) - r = bpf_devices_allow_list_major(prog, path, val, 'c', a->permissions); + r = bpf_devices_allow_list_major(prog, crt->cgroup_path, val, 'c', a->permissions); else { log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path); continue; } - if (r >= 0) + if (r > 0) any = true; } if (prog && !any) { - log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter."); + log_unit_warning(u, "No devices matched by device filter."); /* The kernel verifier would reject a program we would build with the normal intro and outro but no allow-listing rules (outro would contain an unreachable instruction for successful @@ -1605,7 +1962,7 @@ static int cgroup_apply_devices(Unit *u) { policy = CGROUP_DEVICE_POLICY_STRICT; } - r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed); + r = 
bpf_devices_apply_policy(&prog, policy, any, crt->cgroup_path, &crt->bpf_device_control_installed); if (r < 0) { static bool warned = false; @@ -1652,9 +2009,9 @@ static void cgroup_context_apply( CGroupMask apply_mask, ManagerState state) { + bool is_host_root, is_local_root; const char *path; CGroupContext *c; - bool is_host_root, is_local_root; int r; assert(u); @@ -1669,7 +2026,12 @@ static void cgroup_context_apply( is_host_root = unit_has_host_root_cgroup(u); assert_se(c = unit_get_cgroup_context(u)); - assert_se(path = u->cgroup_path); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + + path = crt->cgroup_path; if (is_local_root) /* Make sure we don't try to display messages with an empty path. */ path = "/"; @@ -1879,6 +2241,7 @@ static void cgroup_context_apply( cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max); (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group)); + (void) set_attribute_and_warn(u, "memory", "memory.zswap.writeback", one_zero(c->memory_zswap_writeback)); } else { char buf[DECIMAL_STR_MAX(uint64_t) + 1]; @@ -2137,20 +2500,24 @@ CGroupMask unit_get_members_mask(Unit *u) { /* Returns the mask of controllers all of the unit's children require, merged */ - if (u->cgroup_members_mask_valid) - return u->cgroup_members_mask; /* Use cached value if possible */ - - u->cgroup_members_mask = 0; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_members_mask_valid) + return crt->cgroup_members_mask; /* Use cached value if possible */ + CGroupMask m = 0; if (u->type == UNIT_SLICE) { Unit *member; UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF) - u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */ + m |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */ + } + + if (crt) { + crt->cgroup_members_mask = m; + 
crt->cgroup_members_mask_valid = true; } - u->cgroup_members_mask_valid = true; - return u->cgroup_members_mask; + return m; } CGroupMask unit_get_siblings_mask(Unit *u) { @@ -2236,8 +2603,12 @@ void unit_invalidate_cgroup_members_masks(Unit *u) { assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + /* Recurse invalidate the member masks cache all the way up the tree */ - u->cgroup_members_mask_valid = false; + crt->cgroup_members_mask_valid = false; slice = UNIT_GET_SLICE(u); if (slice) @@ -2249,11 +2620,12 @@ const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */ while (u) { - - if (u->cgroup_path && - u->cgroup_realized && - FLAGS_SET(u->cgroup_realized_mask, mask)) - return u->cgroup_path; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && + crt->cgroup_path && + crt->cgroup_realized && + FLAGS_SET(crt->cgroup_realized_mask, mask)) + return crt->cgroup_path; u = UNIT_GET_SLICE(u); } @@ -2303,27 +2675,34 @@ int unit_default_cgroup_path(const Unit *u, char **ret) { int unit_set_cgroup_path(Unit *u, const char *path) { _cleanup_free_ char *p = NULL; + CGroupRuntime *crt; int r; assert(u); - if (streq_ptr(u->cgroup_path, path)) + crt = unit_get_cgroup_runtime(u); + + if (crt && streq_ptr(crt->cgroup_path, path)) return 0; + unit_release_cgroup(u); + + crt = unit_setup_cgroup_runtime(u); + if (!crt) + return -ENOMEM; + if (path) { p = strdup(path); if (!p) return -ENOMEM; - } - if (p) { r = hashmap_put(u->manager->cgroup_unit, p, u); if (r < 0) return r; } - unit_release_cgroup(u); - u->cgroup_path = TAKE_PTR(p); + assert(!crt->cgroup_path); + crt->cgroup_path = TAKE_PTR(p); return 1; } @@ -2337,10 +2716,11 @@ int unit_watch_cgroup(Unit *u) { /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if * cgroupv2 is available. 
*/ - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; - if (u->cgroup_control_inotify_wd >= 0) + if (crt->cgroup_control_inotify_wd >= 0) return 0; /* Only applies to the unified hierarchy */ @@ -2358,30 +2738,29 @@ int unit_watch_cgroup(Unit *u) { if (r < 0) return log_oom(); - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events); + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.events", &events); if (r < 0) return log_oom(); - u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); - if (u->cgroup_control_inotify_wd < 0) { + crt->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (crt->cgroup_control_inotify_wd < 0) { if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this * is not an error */ return 0; - return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path)); + return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path)); } - r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u); + r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd), u); if (r < 0) - return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path)); + return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path)); return 0; } int unit_watch_cgroup_memory(Unit *u) { _cleanup_free_ char *events = NULL; - CGroupContext *c; int r; assert(u); @@ -2389,10 +2768,11 @@ int unit_watch_cgroup_memory(Unit *u) { /* 
Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if * cgroupv2 is available. */ - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; - c = unit_get_cgroup_context(u); + CGroupContext *c = unit_get_cgroup_context(u); if (!c) return 0; @@ -2407,7 +2787,7 @@ int unit_watch_cgroup_memory(Unit *u) { if (u->type == UNIT_SLICE) return 0; - if (u->cgroup_memory_inotify_wd >= 0) + if (crt->cgroup_memory_inotify_wd >= 0) return 0; /* Only applies to the unified hierarchy */ @@ -2421,23 +2801,23 @@ int unit_watch_cgroup_memory(Unit *u) { if (r < 0) return log_oom(); - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events); + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "memory.events", &events); if (r < 0) return log_oom(); - u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); - if (u->cgroup_memory_inotify_wd < 0) { + crt->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY); + if (crt->cgroup_memory_inotify_wd < 0) { if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this * is not an error */ return 0; - return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path)); + return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(crt->cgroup_path)); } - r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u); + r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd), u); if (r < 0) - return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path)); + return log_unit_error_errno(u, r, 
"Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(crt->cgroup_path)); return 0; } @@ -2448,12 +2828,15 @@ int unit_pick_cgroup_path(Unit *u) { assert(u); - if (u->cgroup_path) - return 0; - if (!UNIT_HAS_CGROUP_CONTEXT(u)) return -EINVAL; + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + return -ENOMEM; + if (crt->cgroup_path) + return 0; + r = unit_default_cgroup_path(u, &path); if (r < 0) return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m"); @@ -2483,30 +2866,35 @@ static int unit_update_cgroup( if (!UNIT_HAS_CGROUP_CONTEXT(u)) return 0; + if (u->freezer_state != FREEZER_RUNNING) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Cannot realize cgroup for frozen unit."); + /* Figure out our cgroup path */ r = unit_pick_cgroup_path(u); if (r < 0) return r; + CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); + /* First, create our own group */ - r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path); + r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, crt->cgroup_path); if (r < 0) - return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path)); + return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(crt->cgroup_path)); created = r; if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { uint64_t cgroup_id = 0; - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path); + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &cgroup_full_path); if (r == 0) { r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id); if (r < 0) log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? 
LOG_DEBUG : LOG_WARNING, r, "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path); } else - log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path)); - u->cgroup_id = cgroup_id; + crt->cgroup_id = cgroup_id; } /* Start watching it */ @@ -2515,23 +2903,23 @@ static int unit_update_cgroup( /* For v2 we preserve enabled controllers in delegated units, adjust others, * for v1 we figure out which controller hierarchies need migration. */ - if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { + if (created || !crt->cgroup_realized || !unit_cgroup_delegate(u)) { CGroupMask result_mask = 0; /* Enable all controllers we need */ - r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask); + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, crt->cgroup_path, &result_mask); if (r < 0) - log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path)); /* Remember what's actually enabled now */ - u->cgroup_enabled_mask = result_mask; + crt->cgroup_enabled_mask = result_mask; - migrate_mask = u->cgroup_realized_mask ^ target_mask; + migrate_mask = crt->cgroup_realized_mask ^ target_mask; } /* Keep track that this is now realized */ - u->cgroup_realized = true; - u->cgroup_realized_mask = target_mask; + crt->cgroup_realized = true; + crt->cgroup_realized_mask = target_mask; /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling). * @@ -2541,14 +2929,14 @@ static int unit_update_cgroup( * delegated units. 
*/ if (cg_all_unified() == 0) { - r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u); + r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, crt->cgroup_path, migrate_callback, u); if (r < 0) - log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(crt->cgroup_path)); is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); - r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice); + r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, crt->cgroup_path, !is_root_slice); if (r < 0) - log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(crt->cgroup_path)); } /* Set attributes */ @@ -2578,11 +2966,12 @@ static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suf if (!u->manager->system_bus) return -EIO; - if (!u->cgroup_path) - return -EINVAL; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; /* Determine this unit's cgroup path relative to our cgroup root */ - pp = path_startswith(u->cgroup_path, u->manager->cgroup_root); + pp = path_startswith(crt->cgroup_path, u->manager->cgroup_root); if (!pp) return -EINVAL; @@ -2626,10 +3015,12 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { if (r < 0) return r; + CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); + if (isempty(suffix_path)) - p = u->cgroup_path; + p = crt->cgroup_path; else { - joined = path_join(u->cgroup_path, suffix_path); + joined = path_join(crt->cgroup_path, suffix_path); if (!joined) return -ENOMEM; @@ -2701,7 
+3092,7 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { continue; /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */ - if (delegated_mask & u->cgroup_realized_mask & bit) { + if (delegated_mask & crt->cgroup_realized_mask & bit) { r = cg_attach(cgroup_controller_to_string(c), p, pid->pid); if (r >= 0) continue; /* Success! */ @@ -2734,6 +3125,10 @@ static bool unit_has_mask_realized( assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return false; + /* Returns true if this unit is fully realized. We check four things: * * 1. Whether the cgroup was created at all @@ -2749,10 +3144,10 @@ static bool unit_has_mask_realized( * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they * simply don't matter. */ - return u->cgroup_realized && - ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && - ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && - u->cgroup_invalidated_mask == 0; + return crt->cgroup_realized && + ((crt->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && + ((crt->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && + crt->cgroup_invalidated_mask == 0; } static bool unit_has_mask_disables_realized( @@ -2762,14 +3157,18 @@ static bool unit_has_mask_disables_realized( assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return true; + /* Returns true if all controllers which should be disabled are indeed disabled. * * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is * already removed. 
*/ - return !u->cgroup_realized || - (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && - FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); + return !crt->cgroup_realized || + (FLAGS_SET(crt->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) && + FLAGS_SET(crt->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2)); } static bool unit_has_mask_enables_realized( @@ -2779,14 +3178,18 @@ static bool unit_has_mask_enables_realized( assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return false; + /* Returns true if all controllers which should be enabled are indeed enabled. * * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything * we want to add is already added. */ - return u->cgroup_realized && - ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && - ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); + return crt->cgroup_realized && + ((crt->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (crt->cgroup_realized_mask & CGROUP_MASK_V1) && + ((crt->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (crt->cgroup_enabled_mask & CGROUP_MASK_V2); } void unit_add_to_cgroup_realize_queue(Unit *u) { @@ -2835,8 +3238,10 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { if (unit_has_mask_enables_realized(u, target_mask, enable_mask)) return 0; - new_target_mask = u->cgroup_realized_mask | target_mask; - new_enable_mask = u->cgroup_enabled_mask | enable_mask; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + + new_target_mask = (crt ? crt->cgroup_realized_mask : 0) | target_mask; + new_enable_mask = (crt ? 
crt->cgroup_enabled_mask : 0) | enable_mask; return unit_update_cgroup(u, new_target_mask, new_enable_mask, state); } @@ -2855,9 +3260,13 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask; int r; + CGroupRuntime *rt = unit_get_cgroup_runtime(m); + if (!rt) + continue; + /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't * holding any controllers open anyway. */ - if (!m->cgroup_realized) + if (!rt->cgroup_realized) continue; /* We must disable those below us first in order to release the controller. */ @@ -2871,8 +3280,8 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { if (unit_has_mask_disables_realized(m, target_mask, enable_mask)) continue; - new_target_mask = m->cgroup_realized_mask & target_mask; - new_enable_mask = m->cgroup_enabled_mask & enable_mask; + new_target_mask = rt->cgroup_realized_mask & target_mask; + new_enable_mask = rt->cgroup_enabled_mask & enable_mask; r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state); if (r < 0) @@ -2959,8 +3368,10 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { if (r < 0) return r; + CGroupRuntime *crt = ASSERT_PTR(unit_get_cgroup_runtime(u)); + /* Now, reset the invalidation mask */ - u->cgroup_invalidated_mask = 0; + crt->cgroup_invalidated_mask = 0; return 0; } @@ -3011,11 +3422,13 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) { * masks. */ do { - Unit *m; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); /* Children of u likely changed when we're called */ - u->cgroup_members_mask_valid = false; + if (crt) + crt->cgroup_members_mask_valid = false; + Unit *m; UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) { /* No point in doing cgroup application for units without active processes. 
*/ @@ -3024,7 +3437,8 @@ void unit_add_family_to_cgroup_realize_queue(Unit *u) { /* We only enqueue siblings if they were realized once at least, in the main * hierarchy. */ - if (!m->cgroup_realized) + crt = unit_get_cgroup_runtime(m); + if (!crt || !crt->cgroup_realized) continue; /* If the unit doesn't need any new controllers and has current ones @@ -3075,26 +3489,50 @@ void unit_release_cgroup(Unit *u) { /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call * when we close down everything for reexecution, where we really want to leave the cgroup in place. */ - if (u->cgroup_path) { - (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); - u->cgroup_path = mfree(u->cgroup_path); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + + if (crt->cgroup_path) { + (void) hashmap_remove(u->manager->cgroup_unit, crt->cgroup_path); + crt->cgroup_path = mfree(crt->cgroup_path); } - if (u->cgroup_control_inotify_wd >= 0) { - if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0) - log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id); + if (crt->cgroup_control_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_control_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", crt->cgroup_control_inotify_wd, u->id); - (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd)); - u->cgroup_control_inotify_wd = -1; + (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(crt->cgroup_control_inotify_wd)); + crt->cgroup_control_inotify_wd = -1; } - if (u->cgroup_memory_inotify_wd >= 0) { - if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0) - log_unit_debug_errno(u, errno, 
"Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id); + if (crt->cgroup_memory_inotify_wd >= 0) { + if (inotify_rm_watch(u->manager->cgroup_inotify_fd, crt->cgroup_memory_inotify_wd) < 0) + log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", crt->cgroup_memory_inotify_wd, u->id); - (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd)); - u->cgroup_memory_inotify_wd = -1; + (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(crt->cgroup_memory_inotify_wd)); + crt->cgroup_memory_inotify_wd = -1; } + + *(CGroupRuntime**) ((uint8_t*) u + UNIT_VTABLE(u)->cgroup_runtime_offset) = cgroup_runtime_free(crt); +} + +int unit_cgroup_is_empty(Unit *u) { + int r; + + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return -ENXIO; + if (!crt->cgroup_path) + return -EOWNERDEAD; + + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty, ignoring: %m", empty_to_root(crt->cgroup_path)); + + return r; } bool unit_maybe_release_cgroup(Unit *u) { @@ -3102,17 +3540,16 @@ bool unit_maybe_release_cgroup(Unit *u) { assert(u); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return true; - /* Don't release the cgroup if there are still processes under it. If we get notified later when all the - * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed) - * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned - * up later. 
*/ - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); - if (r < 0) - log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m"); - else if (r == 1) { + /* Don't release the cgroup if there are still processes under it. If we get notified later when all + * the processes exit (e.g. the processes were in D-state and exited after the unit was marked as + * failed) we need the cgroup paths to continue to be tracked by the manager so they can be looked up + * and cleaned up later. */ + r = unit_cgroup_is_empty(u); + if (r == 1) { unit_release_cgroup(u); return true; } @@ -3127,28 +3564,32 @@ void unit_prune_cgroup(Unit *u) { assert(u); /* Removes the cgroup, if empty and possible, and stops watching it. */ - - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return; - (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */ + /* Cache the last CPU and memory usage values before we destroy the cgroup */ + (void) unit_get_cpu_usage(u, /* ret = */ NULL); + + for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) + (void) unit_get_memory_accounting(u, metric, /* ret = */ NULL); #if BPF_FRAMEWORK - (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */ + (void) bpf_restrict_fs_cleanup(u); /* Remove cgroup from the global LSM BPF map */ #endif unit_modify_nft_set(u, /* add = */ false); is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE); - r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice); + r = cg_trim_everywhere(u->manager->cgroup_supported, crt->cgroup_path, !is_root_slice); if (r < 0) /* One reason we could have failed here is, that the cgroup still contains a process. * However, if the cgroup becomes removable at a later time, it might be removed when * the containing slice is stopped. 
So even if we failed now, this unit shouldn't assume * that the cgroup is still realized the next time it is started. Do not return early * on error, continue cleanup. */ - log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(crt->cgroup_path)); if (is_root_slice) return; @@ -3156,11 +3597,15 @@ void unit_prune_cgroup(Unit *u) { if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */ return; - u->cgroup_realized = false; - u->cgroup_realized_mask = 0; - u->cgroup_enabled_mask = 0; + crt = unit_get_cgroup_runtime(u); /* The above might have destroyed the runtime object, let's see if it's still there */ + if (!crt) + return; + + crt->cgroup_realized = false; + crt->cgroup_realized_mask = 0; + crt->cgroup_enabled_mask = 0; - u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed); + crt->bpf_device_control_installed = bpf_program_free(crt->bpf_device_control_installed); } int unit_search_main_pid(Unit *u, PidRef *ret) { @@ -3171,17 +3616,20 @@ int unit_search_main_pid(Unit *u, PidRef *ret) { assert(u); assert(ret); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENXIO; - r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f); + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, &f); if (r < 0) return r; for (;;) { _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL; - r = cg_read_pidref(f, &npidref); + /* cg_read_pidref() will return an error on unmapped PIDs. + * We can't reasonably deal with units that contain those. 
*/ + r = cg_read_pidref(f, &npidref, CGROUP_DONT_SKIP_UNMAPPED); if (r < 0) return r; if (r == 0) @@ -3223,7 +3671,7 @@ static int unit_watch_pids_in_path(Unit *u, const char *path) { for (;;) { _cleanup_(pidref_done) PidRef pid = PIDREF_NULL; - r = cg_read_pidref(f, &pid); + r = cg_read_pidref(f, &pid, /* flags = */ 0); if (r == 0) break; if (r < 0) { @@ -3270,7 +3718,8 @@ int unit_synthesize_cgroup_empty_event(Unit *u) { * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can * get as notification source as soon as we stopped having any useful PIDs to watch for. */ - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENOENT; r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); @@ -3296,7 +3745,8 @@ int unit_watch_all_pids(Unit *u) { * get reliable cgroup empty notifications: we try to use * SIGCHLD as replacement. */ - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENOENT; r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); @@ -3305,7 +3755,7 @@ int unit_watch_all_pids(Unit *u) { if (r > 0) /* On unified we can use proper notifications */ return 0; - return unit_watch_pids_in_path(u, u->cgroup_path); + return unit_watch_pids_in_path(u, crt->cgroup_path); } static int on_cgroup_empty_event(sd_event_source *s, void *userdata) { @@ -3370,15 +3820,8 @@ void unit_add_to_cgroup_empty_queue(Unit *u) { return; /* Let's verify that the cgroup is really empty */ - if (!u->cgroup_path) - return; - - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); - if (r < 0) { - log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path)); - return; - } - if (r == 0) + r = unit_cgroup_is_empty(u); + if (r <= 0) return; LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u); @@ -3406,7 +3849,10 @@ int 
unit_check_oomd_kill(Unit *u) { uint64_t n = 0; int r; - if (!u->cgroup_path) + assert(u); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; r = cg_all_unified(); @@ -3415,7 +3861,7 @@ int unit_check_oomd_kill(Unit *u) { else if (r == 0) return 0; - r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value); + r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_ooms", &value); if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) return r; @@ -3425,15 +3871,15 @@ int unit_check_oomd_kill(Unit *u) { return r; } - increased = n > u->managed_oom_kill_last; - u->managed_oom_kill_last = n; + increased = n > crt->managed_oom_kill_last; + crt->managed_oom_kill_last = n; if (!increased) return 0; n = 0; value = mfree(value); - r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value); + r = cg_get_xattr_malloc(crt->cgroup_path, "user.oomd_kill", &value); if (r >= 0 && !isempty(value)) (void) safe_atou64(value, &n); @@ -3460,10 +3906,16 @@ int unit_check_oom(Unit *u) { uint64_t c; int r; - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; - r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill); + r = cg_get_keyed_attribute( + "memory", + crt->cgroup_path, + "memory.events", + STRV_MAKE("oom_kill"), + &oom_kill); if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */ c = 0; else if (r < 0) @@ -3474,8 +3926,8 @@ int unit_check_oom(Unit *u) { return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m"); } - increased = c > u->oom_kill_last; - u->oom_kill_last = c; + increased = c > crt->oom_kill_last; + crt->oom_kill_last = c; if (!increased) return 0; @@ -3525,7 +3977,9 @@ static void unit_add_to_cgroup_oom_queue(Unit *u) { if (u->in_cgroup_oom_queue) return; - if (!u->cgroup_path) + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || 
!crt->cgroup_path) return; LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u); @@ -3541,7 +3995,7 @@ static void unit_add_to_cgroup_oom_queue(Unit *u) { return; } - r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8); + r = sd_event_source_set_priority(s, EVENT_PRIORITY_CGROUP_OOM); if (r < 0) { log_error_errno(r, "Failed to set priority of cgroup oom event source: %m"); return; @@ -3562,11 +4016,16 @@ static int unit_check_cgroup_events(Unit *u) { assert(u); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; - r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", - STRV_MAKE("populated", "frozen"), values); + r = cg_get_keyed_attribute_graceful( + SYSTEMD_CGROUP_CONTROLLER, + crt->cgroup_path, + "cgroup.events", + STRV_MAKE("populated", "frozen"), + values); if (r < 0) return r; @@ -3580,8 +4039,10 @@ static int unit_check_cgroup_events(Unit *u) { unit_add_to_cgroup_empty_queue(u); } - /* Disregard freezer state changes due to operations not initiated by us */ - if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) { + /* Disregard freezer state changes due to operations not initiated by us. 
+ * See: https://github.com/systemd/systemd/pull/13512/files#r416469963 and + * https://github.com/systemd/systemd/pull/13512#issuecomment-573007207 */ + if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)) { if (streq(values[1], "0")) unit_thawed(u); else @@ -3670,7 +4131,7 @@ static int cg_bpf_mask_supported(CGroupMask *ret) { mask |= CGROUP_MASK_BPF_SOCKET_BIND; /* BPF-based cgroup_skb/{egress|ingress} hooks */ - r = restrict_network_interfaces_supported(); + r = bpf_restrict_ifaces_supported(); if (r < 0) return r; if (r > 0) @@ -3747,7 +4208,7 @@ int manager_setup_cgroup(Manager *m) { /* Schedule cgroup empty checks early, but after having processed service notification messages or * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */ - r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5); + r = sd_event_source_set_priority(m->cgroup_empty_event_source, EVENT_PRIORITY_CGROUP_EMPTY); if (r < 0) return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m"); @@ -3776,7 +4237,7 @@ int manager_setup_cgroup(Manager *m) { /* Process cgroup empty notifications early. Note that when this event is dispatched it'll * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see * handling of cgroup agent notifications, for the classic cgroup hierarchy support. 
*/ - r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9); + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, EVENT_PRIORITY_CGROUP_INOTIFY); if (r < 0) return log_error_errno(r, "Failed to set priority of inotify event source: %m"); @@ -3885,7 +4346,7 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { } } -Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) { +Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid) { _cleanup_free_ char *cgroup = NULL; assert(m); @@ -3896,7 +4357,7 @@ Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) { return manager_get_unit_by_cgroup(m, cgroup); } -Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) { +Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid) { Unit *u, **array; assert(m); @@ -3915,7 +4376,7 @@ Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) { return NULL; } -Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) { +Unit *manager_get_unit_by_pidref(Manager *m, const PidRef *pid) { Unit *u; assert(m); @@ -3994,7 +4455,8 @@ int unit_get_memory_available(Unit *u, uint64_t *ret) { if (!unit_context) return -ENODATA; - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) continue; (void) unit_get_memory_current(u, &current); @@ -4026,21 +4488,22 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) { if (!UNIT_CGROUP_BOOL(u, memory_accounting)) return -ENODATA; - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ if (unit_has_host_root_cgroup(u)) return procfs_memory_get_used(ret); - if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) + if ((crt->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) return -ENODATA; r = cg_all_unified(); if (r < 0) 
return r; - return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret); + return cg_get_attribute_as_uint64("memory", crt->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret); } int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) { @@ -4063,7 +4526,10 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin if (!UNIT_CGROUP_BOOL(u, memory_accounting)) return -ENODATA; - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return -ENODATA; + if (!crt->cgroup_path) /* If the cgroup is already gone, we try to find the last cached value. */ goto finish; @@ -4071,7 +4537,7 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin if (unit_has_host_root_cgroup(u)) return -ENODATA; - if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY)) + if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_MEMORY)) return -ENODATA; r = cg_all_unified(); @@ -4080,14 +4546,14 @@ int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uin if (r == 0) return -ENODATA; - r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes); + r = cg_get_attribute_as_uint64("memory", crt->cgroup_path, attributes_table[metric], &bytes); if (r < 0 && r != -ENODATA) return r; updated = r >= 0; finish: if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) { - uint64_t *last = &u->memory_accounting_last[metric]; + uint64_t *last = &crt->memory_accounting_last[metric]; if (updated) *last = bytes; @@ -4112,17 +4578,18 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret) { if (!UNIT_CGROUP_BOOL(u, tasks_accounting)) return -ENODATA; - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ if 
(unit_has_host_root_cgroup(u)) return procfs_tasks_get_current(ret); - if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) + if ((crt->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) return -ENODATA; - return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret); + return cg_get_attribute_as_uint64("pids", crt->cgroup_path, "pids.current", ret); } static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { @@ -4132,7 +4599,8 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { assert(u); assert(ret); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ @@ -4140,7 +4608,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { return procfs_cpu_get_usage(ret); /* Requisite controllers for CPU accounting are not enabled */ - if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0) + if ((get_cpu_accounting_mask() & ~crt->cgroup_realized_mask) != 0) return -ENODATA; r = cg_all_unified(); @@ -4150,7 +4618,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { _cleanup_free_ char *val = NULL; uint64_t us; - r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val); + r = cg_get_keyed_attribute("cpu", crt->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val); if (IN_SET(r, -ENOENT, -ENXIO)) return -ENODATA; if (r < 0) @@ -4162,7 +4630,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { ns = us * NSEC_PER_USEC; } else - return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret); + return cg_get_attribute_as_uint64("cpuacct", crt->cgroup_path, "cpuacct.usage", ret); *ret = ns; return 0; @@ -4178,27 +4646,31 @@ int unit_get_cpu_usage(Unit *u, nsec_t *ret) { * started. If the cgroup has been removed already, returns the last cached value. 
To cache the value, simply * call this function with a NULL return value. */ + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -ENODATA; + if (!UNIT_CGROUP_BOOL(u, cpu_accounting)) return -ENODATA; r = unit_get_cpu_usage_raw(u, &ns); - if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) { + if (r == -ENODATA && crt->cpu_usage_last != NSEC_INFINITY) { /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our * cached value. */ if (ret) - *ret = u->cpu_usage_last; + *ret = crt->cpu_usage_last; return 0; } if (r < 0) return r; - if (ns > u->cpu_usage_base) - ns -= u->cpu_usage_base; + if (ns > crt->cpu_usage_base) + ns -= crt->cpu_usage_base; else ns = 0; - u->cpu_usage_last = ns; + crt->cpu_usage_last = ns; if (ret) *ret = ns; @@ -4221,9 +4693,13 @@ int unit_get_ip_accounting( if (!UNIT_CGROUP_BOOL(u, ip_accounting)) return -ENODATA; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -ENODATA; + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? - u->ip_accounting_ingress_map_fd : - u->ip_accounting_egress_map_fd; + crt->ip_accounting_ingress_map_fd : + crt->ip_accounting_egress_map_fd; if (fd < 0) return -ENODATA; @@ -4238,11 +4714,62 @@ int unit_get_ip_accounting( * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the * ip_accounting_extra[] field, and add them in here transparently. 
*/ - *ret = value + u->ip_accounting_extra[metric]; + *ret = value + crt->ip_accounting_extra[metric]; return r; } +static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) { + CGroupContext *cc; + + assert(u); + assert(UNIT_HAS_CGROUP_CONTEXT(u)); + + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + switch (type) { + case CGROUP_LIMIT_MEMORY_MAX: + case CGROUP_LIMIT_MEMORY_HIGH: + return physical_memory(); + case CGROUP_LIMIT_TASKS_MAX: + return system_tasks_max(); + default: + assert_not_reached(); + } + + cc = ASSERT_PTR(unit_get_cgroup_context(u)); + switch (type) { + /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured + * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented. */ + case CGROUP_LIMIT_MEMORY_MAX: + return cc->memory_max; + case CGROUP_LIMIT_MEMORY_HIGH: + return cc->memory_high; + case CGROUP_LIMIT_TASKS_MAX: + return cgroup_tasks_max_resolve(&cc->tasks_max); + default: + assert_not_reached(); + } +} + +int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) { + uint64_t infimum; + + assert(u); + assert(ret); + assert(type >= 0); + assert(type < _CGROUP_LIMIT_TYPE_MAX); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + infimum = unit_get_effective_limit_one(u, type); + for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice)) + infimum = MIN(infimum, unit_get_effective_limit_one(slice, type)); + + *ret = infimum; + return 0; +} + static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) { static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { [CGROUP_IO_READ_BYTES] = "rbytes=", @@ -4257,7 +4784,8 @@ static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_AC assert(u); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENODATA; if (unit_has_host_root_cgroup(u)) @@ -4266,13 +4794,13 
@@ static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_AC r = cg_all_unified(); if (r < 0) return r; - if (r == 0) /* TODO: support cgroupv1 */ + if (r == 0) return -ENODATA; - if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO)) + if (!FLAGS_SET(crt->cgroup_realized_mask, CGROUP_MASK_IO)) return -ENODATA; - r = cg_get_path("io", u->cgroup_path, "io.stat", &path); + r = cg_get_path("io", crt->cgroup_path, "io.stat", &path); if (r < 0) return r; @@ -4340,26 +4868,30 @@ int unit_get_io_accounting( if (!UNIT_CGROUP_BOOL(u, io_accounting)) return -ENODATA; - if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -ENODATA; + + if (allow_cache && crt->io_accounting_last[metric] != UINT64_MAX) goto done; r = unit_get_io_accounting_raw(u, raw); - if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX) + if (r == -ENODATA && crt->io_accounting_last[metric] != UINT64_MAX) goto done; if (r < 0) return r; for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) { /* Saturated subtraction */ - if (raw[i] > u->io_accounting_base[i]) - u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i]; + if (raw[i] > crt->io_accounting_base[i]) + crt->io_accounting_last[i] = raw[i] - crt->io_accounting_base[i]; else - u->io_accounting_last[i] = 0; + crt->io_accounting_last[i] = 0; } done: if (ret) - *ret = u->io_accounting_last[metric]; + *ret = crt->io_accounting_last[metric]; return 0; } @@ -4369,11 +4901,15 @@ int unit_reset_cpu_accounting(Unit *u) { assert(u); - u->cpu_usage_last = NSEC_INFINITY; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return 0; + + crt->cpu_usage_last = NSEC_INFINITY; - r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base); + r = unit_get_cpu_usage_raw(u, &crt->cpu_usage_base); if (r < 0) { - u->cpu_usage_base = 0; + crt->cpu_usage_base = 0; return r; } @@ 
-4383,7 +4919,11 @@ int unit_reset_cpu_accounting(Unit *u) { void unit_reset_memory_accounting_last(Unit *u) { assert(u); - FOREACH_ARRAY(i, u->memory_accounting_last, ELEMENTSOF(u->memory_accounting_last)) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + + FOREACH_ELEMENT(i, crt->memory_accounting_last) *i = UINT64_MAX; } @@ -4392,13 +4932,17 @@ int unit_reset_ip_accounting(Unit *u) { assert(u); - if (u->ip_accounting_ingress_map_fd >= 0) - RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd)); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return 0; + + if (crt->ip_accounting_ingress_map_fd >= 0) + RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_ingress_map_fd)); - if (u->ip_accounting_egress_map_fd >= 0) - RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd)); + if (crt->ip_accounting_egress_map_fd >= 0) + RET_GATHER(r, bpf_firewall_reset_accounting(crt->ip_accounting_egress_map_fd)); - zero(u->ip_accounting_extra); + zero(crt->ip_accounting_extra); return r; } @@ -4406,7 +4950,11 @@ int unit_reset_ip_accounting(Unit *u) { void unit_reset_io_accounting_last(Unit *u) { assert(u); - FOREACH_ARRAY(i, u->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return; + + FOREACH_ARRAY(i, crt->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX) *i = UINT64_MAX; } @@ -4415,11 +4963,15 @@ int unit_reset_io_accounting(Unit *u) { assert(u); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return 0; + unit_reset_io_accounting_last(u); - r = unit_get_io_accounting_raw(u, u->io_accounting_base); + r = unit_get_io_accounting_raw(u, crt->io_accounting_base); if (r < 0) { - zero(u->io_accounting_base); + zero(crt->io_accounting_base); return r; } @@ -4445,6 +4997,10 @@ void unit_invalidate_cgroup(Unit *u, 
CGroupMask m) { if (!UNIT_HAS_CGROUP_CONTEXT(u)) return; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + if (m == 0) return; @@ -4455,10 +5011,10 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) { if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; - if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */ + if (FLAGS_SET(crt->cgroup_invalidated_mask, m)) /* NOP? */ return; - u->cgroup_invalidated_mask |= m; + crt->cgroup_invalidated_mask |= m; unit_add_to_cgroup_realize_queue(u); } @@ -4468,10 +5024,14 @@ void unit_invalidate_cgroup_bpf(Unit *u) { if (!UNIT_HAS_CGROUP_CONTEXT(u)) return; - if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */ + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return; + + if (crt->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */ return; - u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; + crt->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; unit_add_to_cgroup_realize_queue(u); /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access @@ -4523,66 +5083,102 @@ void manager_invalidate_startup_units(Manager *m) { unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET); } +static int unit_cgroup_freezer_kernel_state(Unit *u, FreezerState *ret) { + _cleanup_free_ char *val = NULL; + FreezerState s; + int r; + + assert(u); + assert(ret); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; + + r = cg_get_keyed_attribute( + SYSTEMD_CGROUP_CONTROLLER, + crt->cgroup_path, + "cgroup.events", + STRV_MAKE("frozen"), + &val); + if (IN_SET(r, -ENOENT, -ENXIO)) + return -ENODATA; + if (r < 0) + return r; + + if (streq(val, "0")) + s = FREEZER_RUNNING; + else if (streq(val, "1")) + s = FREEZER_FROZEN; + else { + log_unit_debug(u, "Unexpected cgroup frozen state: %s", val); + s 
= _FREEZER_STATE_INVALID; + } + + *ret = s; + return 0; +} + int unit_cgroup_freezer_action(Unit *u, FreezerAction action) { _cleanup_free_ char *path = NULL; - FreezerState target, kernel = _FREEZER_STATE_INVALID; - int r, ret; + FreezerState target, current, next; + int r; assert(u); - assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE, + FREEZER_THAW, FREEZER_PARENT_THAW)); if (!cg_freezer_supported()) return 0; - /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */ - if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE)) - return action == FREEZER_FREEZE ? -EPERM : 0; - - if (!u->cgroup_realized) - return -EBUSY; - - if (action == FREEZER_THAW) { - Unit *slice = UNIT_GET_SLICE(u); + unit_next_freezer_state(u, action, &next, &target); - if (slice) { - r = unit_cgroup_freezer_action(slice, FREEZER_THAW); - if (r < 0) - return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id); - } + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_realized) { + /* No realized cgroup = nothing to freeze */ + u->freezer_state = freezer_state_finish(next); + return 0; } - target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING; - - r = unit_freezer_state_kernel(u, &kernel); + r = unit_cgroup_freezer_kernel_state(u, &current); /* actual state, per the "frozen" key of cgroup.events */ if (r < 0) - log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m"); + return r; - if (target == kernel) { - u->freezer_state = target; - if (action == FREEZER_FREEZE) - return 0; - ret = 0; - } else - ret = 1; + if (current == target) + next = freezer_state_finish(next); + else if (IN_SET(next, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT, FREEZER_RUNNING)) { + /* We're transitioning into a finished state, which implies that the cgroup's + * current state already matches the target and thus we'd return 0. But, reality + * shows otherwise.
This indicates that our freezer_state tracking has diverged + * from the real state of the cgroup, which can happen if someone meddles with the + * cgroup from underneath us. This really shouldn't happen during normal operation, + * though. So, let's warn about it and fix up the state to be valid */ + + log_unit_warning(u, "Unit wants to transition to %s freezer state but cgroup is unexpectedly %s, fixing up.", + freezer_state_to_string(next), freezer_state_to_string(current) ?: "(invalid)"); + + if (next == FREEZER_FROZEN) + next = FREEZER_FREEZING; + else if (next == FREEZER_FROZEN_BY_PARENT) + next = FREEZER_FREEZING_BY_PARENT; + else if (next == FREEZER_RUNNING) + next = FREEZER_THAWING; + } - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path); + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, "cgroup.freeze", &path); if (r < 0) return r; - log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing"); - - if (target != kernel) { - if (action == FREEZER_FREEZE) - u->freezer_state = FREEZER_FREEZING; - else - u->freezer_state = FREEZER_THAWING; - } + log_unit_debug(u, "Unit freezer state was %s, now %s.", + freezer_state_to_string(u->freezer_state), + freezer_state_to_string(next)); - r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER); + r = write_string_file(path, one_zero(target == FREEZER_FROZEN), WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; - return ret; + u->freezer_state = next; + return target != current; } int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { @@ -4592,10 +5188,11 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { assert(u); assert(cpus); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENODATA; - if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0) + if ((crt->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0) return -ENODATA; 
r = cg_all_unified(); @@ -4604,7 +5201,7 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { if (r == 0) return -ENODATA; - r = cg_get_attribute("cpuset", u->cgroup_path, name, &v); + r = cg_get_attribute("cpuset", crt->cgroup_path, name, &v); if (r == -ENOENT) return -ENODATA; if (r < 0) @@ -4613,6 +5210,422 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); } +CGroupRuntime *cgroup_runtime_new(void) { + _cleanup_(cgroup_runtime_freep) CGroupRuntime *crt = NULL; + + crt = new(CGroupRuntime, 1); + if (!crt) + return NULL; + + *crt = (CGroupRuntime) { + .cpu_usage_last = NSEC_INFINITY, + + .cgroup_control_inotify_wd = -1, + .cgroup_memory_inotify_wd = -1, + + .ip_accounting_ingress_map_fd = -EBADF, + .ip_accounting_egress_map_fd = -EBADF, + + .ipv4_allow_map_fd = -EBADF, + .ipv6_allow_map_fd = -EBADF, + .ipv4_deny_map_fd = -EBADF, + .ipv6_deny_map_fd = -EBADF, + + .cgroup_invalidated_mask = _CGROUP_MASK_ALL, + }; + + FOREACH_ELEMENT(i, crt->memory_accounting_last) + *i = UINT64_MAX; /* UINT64_MAX marks "no cached value yet" */ + FOREACH_ELEMENT(i, crt->io_accounting_base) + *i = 0; /* subtracted from the raw io.stat counters in unit_get_io_accounting(); must not be a sentinel */ + FOREACH_ELEMENT(i, crt->io_accounting_last) + *i = UINT64_MAX; /* UINT64_MAX marks "no cached value yet" */ + FOREACH_ELEMENT(i, crt->ip_accounting_extra) + *i = 0; /* added to the BPF map counters in unit_get_ip_accounting(); UINT64_MAX would wrap the sum */ + + return TAKE_PTR(crt); +} + +CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt) { + if (!crt) + return NULL; + + fdset_free(crt->initial_socket_bind_link_fds); +#if BPF_FRAMEWORK + bpf_link_free(crt->ipv4_socket_bind_link); + bpf_link_free(crt->ipv6_socket_bind_link); +#endif + hashmap_free(crt->bpf_foreign_by_key); + + bpf_program_free(crt->bpf_device_control_installed); + +#if BPF_FRAMEWORK + bpf_link_free(crt->restrict_ifaces_ingress_bpf_link); + bpf_link_free(crt->restrict_ifaces_egress_bpf_link); +#endif + fdset_free(crt->initial_restrict_ifaces_link_fds); + + safe_close(crt->ipv4_allow_map_fd); + safe_close(crt->ipv6_allow_map_fd); + safe_close(crt->ipv4_deny_map_fd); +
safe_close(crt->ipv6_deny_map_fd); + + bpf_program_free(crt->ip_bpf_ingress); + bpf_program_free(crt->ip_bpf_ingress_installed); + bpf_program_free(crt->ip_bpf_egress); + bpf_program_free(crt->ip_bpf_egress_installed); + + set_free(crt->ip_bpf_custom_ingress); + set_free(crt->ip_bpf_custom_ingress_installed); + set_free(crt->ip_bpf_custom_egress); + set_free(crt->ip_bpf_custom_egress_installed); + + free(crt->cgroup_path); + + return mfree(crt); +} + +static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", + [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", + [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", + [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric); + +static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric); + +static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last", + [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last", + [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last", + [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric); + +static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = { + [CGROUP_MEMORY_PEAK] = 
"memory-accounting-peak", + [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric); + +static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) { + _cleanup_free_ char *s = NULL; + int r; + + assert(f); + assert(key); + + if (mask == 0) + return 0; + + r = cg_mask_to_string(mask, &s); + if (r < 0) + return log_error_errno(r, "Failed to format cgroup mask: %m"); + + return serialize_item(f, key, s); +} + +int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + assert(f); + assert(fds); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, crt->cpu_usage_base); + if (crt->cpu_usage_last != NSEC_INFINITY) + (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, crt->cpu_usage_last); + + if (crt->managed_oom_kill_last > 0) + (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, crt->managed_oom_kill_last); + + if (crt->oom_kill_last > 0) + (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, crt->oom_kill_last); + + for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) { + uint64_t v; + + r = unit_get_memory_accounting(u, metric, &v); + if (r >= 0) + (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v); + } + + for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t v; + + r = unit_get_ip_accounting(u, m, &v); + if (r >= 0) + (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v); + } + + for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) { + (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, crt->io_accounting_base[im]); + + if 
(crt->io_accounting_last[im] != UINT64_MAX) + (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, crt->io_accounting_last[im]); + } + + if (crt->cgroup_path) + (void) serialize_item(f, "cgroup", crt->cgroup_path); + if (crt->cgroup_id != 0) + (void) serialize_item_format(f, "cgroup-id", "%" PRIu64, crt->cgroup_id); + + (void) serialize_bool(f, "cgroup-realized", crt->cgroup_realized); + (void) serialize_cgroup_mask(f, "cgroup-realized-mask", crt->cgroup_realized_mask); + (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", crt->cgroup_enabled_mask); + (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", crt->cgroup_invalidated_mask); + + (void) bpf_socket_bind_serialize(u, f, fds); + + (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", crt->ip_bpf_ingress_installed); + (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", crt->ip_bpf_egress_installed); + (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", crt->bpf_device_control_installed); + (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", crt->ip_bpf_custom_ingress_installed); + (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", crt->ip_bpf_custom_egress_installed); + + (void) bpf_restrict_ifaces_serialize(u, f, fds); + + return 0; +} + +#define MATCH_DESERIALIZE(u, key, l, v, parse_func, target) \ + ({ \ + bool _deserialize_matched = streq(l, key); \ + if (_deserialize_matched) { \ + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \ + if (!crt) \ + log_oom_debug(); \ + else { \ + int _deserialize_r = parse_func(v); \ + if (_deserialize_r < 0) \ + log_unit_debug_errno(u, _deserialize_r, \ + "Failed to parse \"%s=%s\", ignoring.", l, v); \ + else \ + crt->target = _deserialize_r; \ + } \ + } \ + _deserialize_matched; \ + }) + +#define MATCH_DESERIALIZE_IMMEDIATE(u, key, l, v, parse_func, target) \ + ({ \ + bool 
_deserialize_matched = streq(l, key); \ + if (_deserialize_matched) { \ + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \ + if (!crt) \ + log_oom_debug(); \ + else { \ + int _deserialize_r = parse_func(v, &crt->target); \ + if (_deserialize_r < 0) \ + log_unit_debug_errno(u, _deserialize_r, \ + "Failed to parse \"%s=%s\", ignoring", l, v); \ + } \ + } \ + _deserialize_matched; \ + }) + +#define MATCH_DESERIALIZE_METRIC(u, key, l, v, parse_func, target) \ + ({ \ + bool _deserialize_matched = streq(l, key); \ + if (_deserialize_matched) { \ + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); \ + if (!crt) \ + log_oom_debug(); \ + else { \ + int _deserialize_r = parse_func(v); \ + if (_deserialize_r < 0) \ + log_unit_debug_errno(u, _deserialize_r, \ + "Failed to parse \"%s=%s\", ignoring.", l, v); \ + else \ + crt->target = _deserialize_r; \ + } \ + } \ + _deserialize_matched; \ + }) + +int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds) { + int r; + + assert(u); + assert(value); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return 0; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-base", key, value, safe_atou64, cpu_usage_base) || + MATCH_DESERIALIZE_IMMEDIATE(u, "cpuacct-usage-base", key, value, safe_atou64, cpu_usage_base)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cpu-usage-last", key, value, safe_atou64, cpu_usage_last)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "managed-oom-kill-last", key, value, safe_atou64, managed_oom_kill_last)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "oom-kill-last", key, value, safe_atou64, oom_kill_last)) + return 1; + + if (streq(key, "cgroup")) { + r = unit_set_cgroup_path(u, value); + if (r < 0) + log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", value); + + (void) unit_watch_cgroup(u); + (void) unit_watch_cgroup_memory(u); + return 1; + } + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-id", key, value, safe_atou64, cgroup_id)) + return 1; + + 
if (MATCH_DESERIALIZE(u, "cgroup-realized", key, value, parse_boolean, cgroup_realized)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-realized-mask", key, value, cg_mask_from_string, cgroup_realized_mask)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-enabled-mask", key, value, cg_mask_from_string, cgroup_enabled_mask)) + return 1; + + if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask)) + return 1; + + if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) { + int fd; + + fd = deserialize_fd(fds, value); + if (fd >= 0) + (void) bpf_socket_bind_add_initial_link_fd(u, fd); + + return 1; + } + + if (STR_IN_SET(key, + "ip-bpf-ingress-installed", "ip-bpf-egress-installed", + "bpf-device-control-installed", + "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) { + + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else { + if (streq(key, "ip-bpf-ingress-installed")) + (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed); + + if (streq(key, "ip-bpf-egress-installed")) + (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed); + + if (streq(key, "bpf-device-control-installed")) + (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed); + + if (streq(key, "ip-bpf-custom-ingress-installed")) + (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed); + + if (streq(key, "ip-bpf-custom-egress-installed")) + (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_egress_installed); + } + + return 1; + } + + if (streq(key, "restrict-ifaces-bpf-fd")) { + int fd; + + fd = deserialize_fd(fds, value); + if (fd >= 0) + (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd); + return 1; + } + + CGroupMemoryAccountingMetric mm = 
memory_accounting_metric_field_last_from_string(key); + if (mm >= 0) { + uint64_t c; + + r = safe_atou64(value, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", value); + else { + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else + crt->memory_accounting_last[mm] = c; + } + + return 1; + } + + CGroupIPAccountingMetric ipm = ip_accounting_metric_field_from_string(key); + if (ipm >= 0) { + uint64_t c; + + r = safe_atou64(value, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", value); + else { + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else + crt->ip_accounting_extra[ipm] = c; + } + + return 1; + } + + CGroupIOAccountingMetric iom = io_accounting_metric_field_base_from_string(key); + if (iom >= 0) { + uint64_t c; + + r = safe_atou64(value, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", value); + else { + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else + crt->io_accounting_base[iom] = c; + } + + return 1; + } + + iom = io_accounting_metric_field_last_from_string(key); + if (iom >= 0) { + uint64_t c; + + r = safe_atou64(value, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", value); + else { + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else + crt->io_accounting_last[iom] = c; + } + return 1; + } + + return 0; +} + static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { [CGROUP_DEVICE_POLICY_AUTO] = "auto", [CGROUP_DEVICE_POLICY_CLOSED] = "closed", @@ -4621,17 +5634,10 @@ static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); -static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { - 
[FREEZER_FREEZE] = "freeze", - [FREEZER_THAW] = "thaw", -}; - -DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); - static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = { - [CGROUP_PRESSURE_WATCH_OFF] = "off", + [CGROUP_PRESSURE_WATCH_OFF] = "off", [CGROUP_PRESSURE_WATCH_AUTO] = "auto", - [CGROUP_PRESSURE_WATCH_ON] = "on", + [CGROUP_PRESSURE_WATCH_ON] = "on", [CGROUP_PRESSURE_WATCH_SKIP] = "skip", }; @@ -4663,3 +5669,11 @@ static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_AC }; DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric); + +static const char *const cgroup_effective_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = { + [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax", + [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh", + [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_effective_limit_type, CGroupLimitType); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index f1b674b..72fe275 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -3,7 +3,10 @@ #include -#include "bpf-lsm.h" +#include "sd-event.h" + +#include "bpf-program.h" +#include "bpf-restrict-fs.h" #include "cgroup-util.h" #include "cpu-set-util.h" #include "firewall-util.h" @@ -35,6 +38,7 @@ typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram; typedef struct CGroupSocketBindItem CGroupSocketBindItem; +typedef struct CGroupRuntime CGroupRuntime; typedef enum CGroupDevicePolicy { /* When devices listed, will allow those, plus built-in ones, if none are listed will allow @@ -53,7 +57,9 @@ typedef enum CGroupDevicePolicy { typedef enum FreezerAction { FREEZER_FREEZE, + FREEZER_PARENT_FREEZE, FREEZER_THAW, + FREEZER_PARENT_THAW, _FREEZER_ACTION_MAX, _FREEZER_ACTION_INVALID = -EINVAL, @@ -129,6 +135,9 @@ typedef 
enum CGroupPressureWatch { _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, } CGroupPressureWatch; +/* The user-supplied cgroup-related configuration options. This remains mostly immutable while the service + * manager is running (except for an occasional SetProperty() configuration change), outside of reload + * cycles. When adding members make sure to update cgroup_context_copy() accordingly. */ struct CGroupContext { bool cpu_accounting; bool io_accounting; @@ -188,6 +197,8 @@ struct CGroupContext { bool startup_memory_swap_max_set:1; bool startup_memory_zswap_max_set:1; + bool memory_zswap_writeback; + Set *ip_address_allow; Set *ip_address_deny; /* These two flags indicate that redundant entries have been removed from @@ -276,6 +287,95 @@ typedef enum CGroupMemoryAccountingMetric { _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL, } CGroupMemoryAccountingMetric; +/* Used for limits whose value sets have infimum */ +typedef enum CGroupLimitType { + CGROUP_LIMIT_MEMORY_MAX, + CGROUP_LIMIT_MEMORY_HIGH, + CGROUP_LIMIT_TASKS_MAX, + _CGROUP_LIMIT_TYPE_MAX, + _CGROUP_LIMIT_INVALID = -EINVAL, +} CGroupLimitType; + +/* The dynamic, regularly updated information about a unit that has a realized cgroup. 
This is only allocated when a unit is first realized */ +typedef struct CGroupRuntime { + /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ + nsec_t cpu_usage_base; + nsec_t cpu_usage_last; /* the most recently read value */ + + /* Most recently read value of memory accounting metrics */ + uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1]; + + /* The current counter of OOM kills initiated by systemd-oomd */ + uint64_t managed_oom_kill_last; + + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + uint64_t oom_kill_last; + + /* Where the io.stat data was at the time the unit was started */ + uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */ + + /* Counterparts in the cgroup filesystem */ + char *cgroup_path; + uint64_t cgroup_id; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? 
(only relevant on cgroup v2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */ + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + + /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ + int cgroup_control_inotify_wd; + int cgroup_memory_inotify_wd; + + /* Device Controller BPF program */ + BPFProgram *bpf_device_control_installed; + + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; + BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; + + Set *ip_bpf_custom_ingress; + Set *ip_bpf_custom_ingress_installed; + Set *ip_bpf_custom_egress; + Set *ip_bpf_custom_egress_installed; + + /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd, + * attached to unit cgroup by provided program fd and attach type. */ + Hashmap *bpf_foreign_by_key; + + FDSet *initial_socket_bind_link_fds; +#if BPF_FRAMEWORK + /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and + * responsible for allowing or denying a unit to bind(2) to a socket + * address. 
*/ + struct bpf_link *ipv4_socket_bind_link; + struct bpf_link *ipv6_socket_bind_link; +#endif + + FDSet *initial_restrict_ifaces_link_fds; +#if BPF_FRAMEWORK + struct bpf_link *restrict_ifaces_ingress_bpf_link; + struct bpf_link *restrict_ifaces_egress_bpf_link; +#endif + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; + + /* Reset cgroup accounting next time we fork something off */ + bool reset_accounting:1; + + /* Whether we warned about clamping the CPU quota period */ + bool warned_clamping_cpu_quota_period:1; +} CGroupRuntime; + typedef struct Unit Unit; typedef struct Manager Manager; typedef enum ManagerState ManagerState; @@ -285,6 +385,7 @@ uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state); usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period); void cgroup_context_init(CGroupContext *c); +int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src); void cgroup_context_done(CGroupContext *c); void cgroup_context_dump(Unit *u, FILE* f, const char *prefix); void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f); @@ -309,6 +410,17 @@ static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) { int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path); +static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext *c, const CGroupBPFForeignProgram *p) { + return cgroup_context_add_bpf_foreign_program(c, p->attach_type, p->bpffs_path); +} +int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l); +int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w); +int 
cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l); +int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w); +int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b); +int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a); +int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i); +int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i); void unit_modify_nft_set(Unit *u, bool add); @@ -336,6 +448,7 @@ int unit_watch_cgroup(Unit *u); int unit_watch_cgroup_memory(Unit *u); void unit_add_to_cgroup_realize_queue(Unit *u); +int unit_cgroup_is_empty(Unit *u); void unit_release_cgroup(Unit *u); /* Releases the cgroup only if it is recursively empty. * Returns true if the cgroup was released, false otherwise. */ @@ -353,9 +466,9 @@ void manager_shutdown_cgroup(Manager *m, bool delete); unsigned manager_dispatch_cgroup_realize_queue(Manager *m); Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); -Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid); -Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid); -Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid); +Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid); +Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid); +Unit* manager_get_unit_by_pidref(Manager *m, const PidRef *pid); Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); uint64_t unit_get_ancestor_memory_min(Unit *u); @@ -374,6 +487,7 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret); int unit_get_cpu_usage(Unit *u, nsec_t *ret); int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret); int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); +int 
unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret); int unit_reset_cpu_accounting(Unit *u); void unit_reset_memory_accounting_last(Unit *u); @@ -413,6 +527,13 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action); const char* freezer_action_to_string(FreezerAction a) _const_; FreezerAction freezer_action_from_string(const char *s) _pure_; +CGroupRuntime *cgroup_runtime_new(void); +CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt); +DEFINE_TRIVIAL_CLEANUP_FUNC(CGroupRuntime*, cgroup_runtime_free); + +int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds); +int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds); + const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_; CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_; @@ -425,5 +546,8 @@ CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_; CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_; +const char* cgroup_effective_limit_type_to_string(CGroupLimitType m) _const_; +CGroupLimitType cgroup_effective_limit_type_from_string(const char *s) _pure_; + const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_; CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_; diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c index cd91381..3e6168d 100644 --- a/src/core/core-varlink.c +++ b/src/core/core-varlink.c @@ -69,6 +69,10 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J if (!c) return -EINVAL; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt) + return -EINVAL; + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) /* systemd-oomd should always treat inactive units as though they didn't enable any action since they * 
should not have a valid cgroup */ @@ -83,19 +87,24 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J return json_build(ret_v, JSON_BUILD_OBJECT( JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), - JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(crt->cgroup_path)), JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); } int manager_varlink_send_managed_oom_update(Unit *u) { _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL; + CGroupRuntime *crt; CGroupContext *c; int r; assert(u); - if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path) + if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager) + return 0; + + crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return 0; if (MANAGER_IS_SYSTEM(u->manager)) { @@ -119,10 +128,10 @@ int manager_varlink_send_managed_oom_update(Unit *u) { if (r < 0) return r; - for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) { + FOREACH_ELEMENT(i, managed_oom_mode_properties) { _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; - r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e); + r = build_managed_oom_json_array_element(u, *i, &e); if (r < 0) return r; @@ -173,16 +182,16 @@ static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) { if (!c) continue; - for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) { + FOREACH_ELEMENT(i, managed_oom_mode_properties) { _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; /* For the initial varlink call we only care about units that enabled (i.e. mode is not * set to "auto") oomd properties. 
*/ - if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && - !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) + if (!(streq(*i, "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && + !(streq(*i, "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) continue; - r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e); + r = build_managed_oom_json_array_element(u, *i, &e); if (r < 0) return r; @@ -359,7 +368,7 @@ static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)), JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")), JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic")))))); - } +} static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) { assert(p); @@ -491,6 +500,43 @@ static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { m->managed_oom_varlink = varlink_unref(link); } +static int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + assert(ret); + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_debug_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_add_interface_many( + s, + &vl_interface_io_systemd_UserDatabase, + &vl_interface_io_systemd_ManagedOOM); + if (r < 0) + return log_debug_errno(r, "Failed to add interfaces to varlink server: %m"); + + r = varlink_server_bind_method_many( + s, + "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, + "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships, + 
"io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups); + if (r < 0) + return log_debug_errno(r, "Failed to register varlink methods: %m"); + + r = varlink_server_bind_disconnect(s, vl_disconnect); + if (r < 0) + return log_debug_errno(r, "Failed to register varlink disconnect handler: %m"); + + *ret = TAKE_PTR(s); + return 0; +} + static int manager_varlink_init_system(Manager *m) { _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; int r; @@ -527,7 +573,7 @@ static int manager_varlink_init_system(Manager *m) { } } - r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + r = varlink_server_attach_event(s, m->event, EVENT_PRIORITY_IPC); if (r < 0) return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); @@ -585,7 +631,7 @@ static int manager_varlink_init_user(Manager *m) { if (r < 0) return r; - r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); + r = varlink_attach_event(link, m->event, EVENT_PRIORITY_IPC); if (r < 0) return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); @@ -597,43 +643,6 @@ static int manager_varlink_init_user(Manager *m) { return 1; } -int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) { - _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; - int r; - - assert(m); - assert(ret); - - r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); - if (r < 0) - return log_debug_errno(r, "Failed to allocate varlink server object: %m"); - - varlink_server_set_userdata(s, m); - - r = varlink_server_add_interface_many( - s, - &vl_interface_io_systemd_UserDatabase, - &vl_interface_io_systemd_ManagedOOM); - if (r < 0) - return log_error_errno(r, "Failed to add interfaces to varlink server: %m"); - - r = varlink_server_bind_method_many( - s, - "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, - "io.systemd.UserDatabase.GetGroupRecord", 
vl_method_get_group_record, - "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships, - "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups); - if (r < 0) - return log_debug_errno(r, "Failed to register varlink methods: %m"); - - r = varlink_server_bind_disconnect(s, vl_disconnect); - if (r < 0) - return log_debug_errno(r, "Failed to register varlink disconnect handler: %m"); - - *ret = TAKE_PTR(s); - return 0; -} - int manager_varlink_init(Manager *m) { return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m); } diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h index 7f810d1..20507a4 100644 --- a/src/core/core-varlink.h +++ b/src/core/core-varlink.h @@ -6,10 +6,6 @@ int manager_varlink_init(Manager *m); void manager_varlink_done(Manager *m); -/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events. - * Used for manager serialize/deserialize. */ -int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s); - /* The manager is expected to send an update to systemd-oomd if one of the following occurs: * - The value of ManagedOOM*= properties change * - A unit with ManagedOOM*= properties changes unit active state */ diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c index f5c31b6..4a3fc01 100644 --- a/src/core/crash-handler.c +++ b/src/core/crash-handler.c @@ -27,7 +27,13 @@ _noreturn_ void freeze_or_exit_or_reboot(void) { _exit(EXIT_EXCEPTION); } - if (arg_crash_reboot) { + if (arg_crash_action == CRASH_POWEROFF) { + log_notice("Shutting down..."); + (void) reboot(RB_POWER_OFF); + log_struct_errno(LOG_EMERG, errno, + LOG_MESSAGE("Failed to power off: %m"), + "MESSAGE_ID=" SD_MESSAGE_CRASH_FAILED_STR); + } else if (arg_crash_action == CRASH_REBOOT) { log_notice("Rebooting in 10s..."); (void) sleep(10); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 8a9570f..49e84b4 100644 --- 
a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -487,6 +487,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("StartupMemorySwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_swap_max), 0), SD_BUS_PROPERTY("MemoryZSwapMax", "t", NULL, offsetof(CGroupContext, memory_zswap_max), 0), SD_BUS_PROPERTY("StartupMemoryZSwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_zswap_max), 0), + SD_BUS_PROPERTY("MemoryZSwapWriteback", "b", bus_property_get_bool, offsetof(CGroupContext, memory_zswap_writeback), 0), SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0), SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0), SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), @@ -1279,6 +1280,9 @@ int bus_cgroup_set_property( if (streq(name, "MemoryLimitScale")) return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error); + if (streq(name, "MemoryZSwapWriteback")) + return bus_cgroup_set_boolean(u, name, &c->memory_zswap_writeback, CGROUP_MASK_MEMORY, message, flags, error); + if (streq(name, "TasksAccounting")) return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error); @@ -1300,17 +1304,18 @@ int bus_cgroup_set_property( if (!UNIT_WRITE_FLAGS_NOOP(flags)) { c->cpu_quota_per_sec_usec = u64; - u->warned_clamping_cpu_quota_period = false; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt) + crt->warned_clamping_cpu_quota_period = false; unit_invalidate_cgroup(u, CGROUP_MASK_CPU); if (c->cpu_quota_per_sec_usec == USEC_INFINITY) unit_write_setting(u, flags, "CPUQuota", "CPUQuota="); else - /* config_parse_cpu_quota() requires an integer, so truncating division is used on - * purpose here. 
*/ unit_write_settingf(u, flags, "CPUQuota", - "CPUQuota=%0.f%%", - (double) (c->cpu_quota_per_sec_usec / 10000)); + "CPUQuota=" USEC_FMT ".%02" PRI_USEC "%%", + c->cpu_quota_per_sec_usec / 10000, + (c->cpu_quota_per_sec_usec % 10000) / 100); } return 1; @@ -1324,7 +1329,9 @@ int bus_cgroup_set_property( if (!UNIT_WRITE_FLAGS_NOOP(flags)) { c->cpu_quota_period_usec = u64; - u->warned_clamping_cpu_quota_period = false; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt) + crt->warned_clamping_cpu_quota_period = false; unit_invalidate_cgroup(u, CGROUP_MASK_CPU); if (c->cpu_quota_period_usec == USEC_INFINITY) unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec="); @@ -2188,7 +2195,7 @@ int bus_cgroup_set_property( c->restrict_network_interfaces_is_allow_list = is_allow_list; STRV_FOREACH(s, l) { - if (!ifname_valid(*s)) { + if (!ifname_valid_full(*s, IFNAME_VALID_ALTERNATIVE)) { log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s); continue; } diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 2d05ba7..21c260b 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -67,6 +67,7 @@ static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_policy, "i", ExecContext, static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_priority, "i", ExecContext, exec_context_get_cpu_sched_priority); static BUS_DEFINE_PROPERTY_GET(property_get_coredump_filter, "t", ExecContext, exec_context_get_coredump_filter); static BUS_DEFINE_PROPERTY_GET(property_get_timer_slack_nsec, "t", ExecContext, exec_context_get_timer_slack_nsec); +static BUS_DEFINE_PROPERTY_GET(property_get_set_login_environment, "b", ExecContext, exec_context_get_set_login_environment); static int property_get_environment_files( sd_bus *bus, @@ -1038,7 +1039,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, 
group), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("SetLoginEnvironment", "b", bus_property_get_tristate, offsetof(ExecContext, set_login_environment), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SetLoginEnvironment", "b", property_get_set_login_environment, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1305,18 +1306,24 @@ int bus_set_transient_exec_command( sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { - bool is_ex_prop = endswith(name, "Ex"); - unsigned n = 0; + + const char *ex_prop = endswith(ASSERT_PTR(name), "Ex"); + size_t n = 0; int r; + assert(u); + assert(exec_command); + assert(message); + assert(error); + /* Drop Ex from the written setting. E.g. ExecStart=, not ExecStartEx=. */ - const char *written_name = is_ex_prop ? strndupa(name, strlen(name) - 2) : name; + const char *written_name = ex_prop ? strndupa_safe(name, ex_prop - name) : name; - r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)"); + r = sd_bus_message_enter_container(message, 'a', ex_prop ? "(sasas)" : "(sasb)"); if (r < 0) return r; - while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) { + while ((r = sd_bus_message_enter_container(message, 'r', ex_prop ? 
"sasas" : "sasb")) > 0) { _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL; const char *path; int b; @@ -1338,7 +1345,7 @@ int bus_set_transient_exec_command( return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "\"%s\" argv cannot be empty", name); - r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b); + r = ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b); if (r < 0) return r; @@ -1347,29 +1354,28 @@ int bus_set_transient_exec_command( return r; if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - ExecCommand *c; + _cleanup_(exec_command_freep) ExecCommand *c = NULL; - c = new0(ExecCommand, 1); + c = new(ExecCommand, 1); if (!c) return -ENOMEM; - c->path = strdup(path); - if (!c->path) { - free(c); - return -ENOMEM; - } + *c = (ExecCommand) { + .argv = TAKE_PTR(argv), + }; - c->argv = TAKE_PTR(argv); + r = path_simplify_alloc(path, &c->path); + if (r < 0) + return r; - if (is_ex_prop) { + if (ex_prop) { r = exec_command_flags_from_strv(ex_opts, &c->flags); if (r < 0) return r; - } else - c->flags = b ? 
EXEC_COMMAND_IGNORE_FAILURE : 0; + } else if (b) + c->flags |= EXEC_COMMAND_IGNORE_FAILURE; - path_simplify(c->path); - exec_command_append_list(exec_command, c); + exec_command_append_list(exec_command, TAKE_PTR(c)); } n++; @@ -1738,6 +1744,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "PrivateMounts")) return bus_set_transient_tristate(u, name, &c->private_mounts, message, flags, error); + if (streq(name, "MountAPIVFS")) + return bus_set_transient_tristate(u, name, &c->mount_apivfs, message, flags, error); + if (streq(name, "PrivateNetwork")) return bus_set_transient_bool(u, name, &c->private_network, message, flags, error); @@ -1897,7 +1906,7 @@ int bus_exec_context_set_transient_property( c->restrict_filesystems_allow_list = allow_list; STRV_FOREACH(s, l) { - r = lsm_bpf_parse_filesystem( + r = bpf_restrict_fs_parse_filesystem( *s, &c->restrict_filesystems, FILESYSTEM_PARSE_LOG| @@ -1948,7 +1957,7 @@ int bus_exec_context_set_transient_property( r = strv_extend_strv(&c->supplementary_groups, l, true); if (r < 0) - return -ENOMEM; + return r; joined = strv_join(c->supplementary_groups, " "); if (!joined) @@ -2705,51 +2714,51 @@ int bus_exec_context_set_transient_property( return 1; - } else if (streq(name, "MountAPIVFS")) { - bool b; - - r = bus_set_transient_bool(u, name, &b, message, flags, error); - if (r < 0) - return r; - - if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - c->mount_apivfs = b; - c->mount_apivfs_set = true; - } - - return 1; - } else if (streq(name, "WorkingDirectory")) { + _cleanup_free_ char *simplified = NULL; + bool missing_ok = false, is_home = false; const char *s; - bool missing_ok; r = sd_bus_message_read(message, "s", &s); if (r < 0) return r; - if (s[0] == '-') { - missing_ok = true; - s++; - } else - missing_ok = false; + if (!isempty(s)) { + if (s[0] == '-') { + missing_ok = true; + s++; + } - if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s)) - return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, 
"WorkingDirectory= expects an absolute path or '~'"); + if (streq(s, "~")) + is_home = true; + else { + if (!path_is_absolute(s)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "WorkingDirectory= expects an absolute path or '~'"); - if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - if (streq(s, "~")) { - c->working_directory = mfree(c->working_directory); - c->working_directory_home = true; - } else { - r = free_and_strdup(&c->working_directory, empty_to_null(s)); + r = path_simplify_alloc(s, &simplified); if (r < 0) return r; - c->working_directory_home = false; + if (!path_is_normalized(simplified)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "WorkingDirectory= expects a normalized path or '~'"); + + if (path_below_api_vfs(simplified)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, + "WorkingDirectory= may not be below /proc/, /sys/ or /dev/"); } + } + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + free_and_replace(c->working_directory, simplified); + c->working_directory_home = is_home; c->working_directory_missing_ok = missing_ok; - unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s); + + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, + "WorkingDirectory=%s%s", + c->working_directory_missing_ok ? "-" : "", + c->working_directory_home ? 
"~" : strempty(c->working_directory)); } return 1; @@ -3173,7 +3182,7 @@ int bus_exec_context_set_transient_property( r = strv_extend_strv(dirs, l, true); if (r < 0) - return -ENOMEM; + return r; unit_write_settingf(u, flags, name, "%s=%s", name, joined); } @@ -3200,7 +3209,7 @@ int bus_exec_context_set_transient_property( _cleanup_free_ char *joined = NULL; r = strv_extend_strv(&c->exec_search_path, l, true); if (r < 0) - return -ENOMEM; + return r; joined = strv_join(c->exec_search_path, ":"); if (!joined) return log_oom(); diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h index 5926bdb..4b7cb86 100644 --- a/src/core/dbus-execute.h +++ b/src/core/dbus-execute.h @@ -9,6 +9,7 @@ #define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoffTimestamp", (offset) + offsetof(ExecStatus, handoff_timestamp), flags), \ SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c index c88d8c2..693efbb 100644 --- a/src/core/dbus-job.c +++ b/src/core/dbus-job.c @@ -54,7 +54,7 @@ int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) { /* And for everybody else consult polkit */ - r = bus_verify_manage_units_async(j->unit->manager, message, error); + r = bus_verify_manage_units_async(j->manager, message, error); if (r < 0) return r; if (r == 0) @@ -87,22 +87,23 @@ int 
bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_ if (r < 0) return r; - for (int i = 0; i < n; i ++) { + FOREACH_ARRAY(i, list, n) { _cleanup_free_ char *unit_path = NULL, *job_path = NULL; + Job *job = *i; - job_path = job_dbus_path(list[i]); + job_path = job_dbus_path(job); if (!job_path) return -ENOMEM; - unit_path = unit_dbus_path(list[i]->unit); + unit_path = unit_dbus_path(job->unit); if (!unit_path) return -ENOMEM; r = sd_bus_message_append(reply, "(usssoo)", - list[i]->id, - list[i]->unit->id, - job_type_to_string(list[i]->type), - job_state_to_string(list[i]->state), + job->id, + job->unit->id, + job_type_to_string(job->type), + job_state_to_string(job->state), job_path, unit_path); if (r < 0) @@ -262,7 +263,7 @@ void bus_job_send_pending_change_signal(Job *j, bool including_new) { if (!j->sent_dbus_new_signal && !including_new) return; - if (MANAGER_IS_RELOADING(j->unit->manager)) + if (MANAGER_IS_RELOADING(j->manager)) return; bus_job_send_change_signal(j); @@ -331,12 +332,12 @@ static int bus_job_allocate_bus_track(Job *j) { if (j->bus_track) return 0; - return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j); + return sd_bus_track_new(j->manager->api_bus, &j->bus_track, bus_job_track_handler, j); } int bus_job_coldplug_bus_track(Job *j) { - int r; _cleanup_strv_free_ char **deserialized_clients = NULL; + int r; assert(j); @@ -361,7 +362,7 @@ int bus_job_track_sender(Job *j, sd_bus_message *m) { assert(j); assert(m); - if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) { + if (sd_bus_message_get_bus(m) != j->manager->api_bus) { j->ref_by_private_bus = true; return 0; } diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 745f5cc..2515f54 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -11,6 +11,7 @@ #include "bus-common-errors.h" #include "bus-get-properties.h" #include "bus-log-control-api.h" +#include "bus-util.h" #include "chase.h" #include 
"confidential-virt.h" #include "data-fd-util.h" @@ -39,6 +40,7 @@ #include "string-util.h" #include "strv.h" #include "syslog-util.h" +#include "taint.h" #include "user-util.h" #include "version.h" #include "virt.h" @@ -125,13 +127,10 @@ static int property_get_tainted( void *userdata, sd_bus_error *error) { - _cleanup_free_ char *s = NULL; - Manager *m = ASSERT_PTR(userdata); - assert(bus); assert(reply); - s = manager_taint_string(m); + _cleanup_free_ char *s = taint_string(); if (!s) return log_oom(); @@ -464,18 +463,13 @@ static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char * its sleeve: if the name is specified empty we use the client's unit. */ if (isempty(name)) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - pid_t pid; - - r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); - if (r < 0) - return r; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - r = sd_bus_creds_get_pid(creds, &pid); + r = bus_query_sender_pidref(message, &pidref); if (r < 0) return r; - u = manager_get_unit_by_pid(m, pid); + u = manager_get_unit_by_pidref(m, &pidref); if (!u) return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit."); } else { @@ -542,7 +536,7 @@ static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) { Manager *m = ASSERT_PTR(userdata); - pid_t pid; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; Unit *u; int r; @@ -552,27 +546,20 @@ static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bu /* Anyone can call this method */ - r = sd_bus_message_read(message, "u", &pid); + r = sd_bus_message_read(message, "u", &pidref.pid); if (r < 0) return r; - if (pid < 0) - return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid); - - if (pid == 0) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - - 
r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); - if (r < 0) - return r; - - r = sd_bus_creds_get_pid(creds, &pid); + if (pidref.pid < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pidref.pid); + if (pidref.pid == 0) { + r = bus_query_sender_pidref(message, &pidref); if (r < 0) return r; } - u = manager_get_unit_by_pid(m, pid); + u = manager_get_unit_by_pidref(m, &pidref); if (!u) - return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid); + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pidref.pid); return reply_unit_path(u, message, error); } @@ -581,41 +568,27 @@ static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userd _cleanup_free_ char *path = NULL; Manager *m = ASSERT_PTR(userdata); sd_id128_t id; - const void *a; Unit *u; - size_t sz; int r; assert(message); /* Anyone can call this method */ - r = sd_bus_message_read_array(message, 'y', &a, &sz); - if (r < 0) - return r; - if (sz == 0) - id = SD_ID128_NULL; - else if (sz == 16) - memcpy(&id, a, sz); - else + if (bus_message_read_id128(message, &id) < 0) return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID"); if (sd_id128_is_null(id)) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - pid_t pid; - - r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); - if (r < 0) - return r; + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - r = sd_bus_creds_get_pid(creds, &pid); + r = bus_query_sender_pidref(message, &pidref); if (r < 0) return r; - u = manager_get_unit_by_pid(m, pid); + u = manager_get_unit_by_pidref(m, &pidref); if (!u) return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, - "Client " PID_FMT " not member of any unit.", pid); + "Client " PID_FMT " not member of any unit.", pidref.pid); } else { u = hashmap_get(m->units_by_invocation_id, 
&id); if (!u) @@ -797,6 +770,7 @@ static int method_generic_unit_operation( assert(message); assert(m); + assert(handler); /* Read the first argument from the command and pass the operation to the specified per-unit * method. */ @@ -860,11 +834,13 @@ static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_err } static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { - return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0); + /* Only active units can be frozen, which must be properly loaded already */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, GENERIC_UNIT_VALIDATE_LOADED); } static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { - return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0); + /* Same as freeze above */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, GENERIC_UNIT_VALIDATE_LOADED); } static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { @@ -972,9 +948,10 @@ static int method_list_units_by_names(sd_bus_message *message, void *userdata, s } static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { - /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the - * unit being loaded (because even improperly loaded units might still have processes around */ - return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0); + /* Don't load a unit actively (since it won't have any processes if it's not loaded), but don't + * insist on the unit being loaded either (because even improperly loaded units might still have + * processes around). 
*/ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, /* flags = */ 0); } static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { @@ -1430,11 +1407,11 @@ static int dump_impl( * operations, and can cause PID1 to stall. So it seems similar enough in terms of security * considerations and impact, and thus use the same access check for dumps which, given the * large amount of data to fetch, can stall PID1 for quite some time. */ - r = mac_selinux_access_check(message, "reload", error); + r = mac_selinux_access_check(message, "reload", /* error = */ NULL); if (r < 0) goto ratelimited; - r = bus_verify_bypass_dump_ratelimit_async(m, message, error); + r = bus_verify_bypass_dump_ratelimit_async(m, message, /* error = */ NULL); if (r < 0) goto ratelimited; if (r == 0) @@ -1469,7 +1446,7 @@ static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *er static int reply_dump_by_fd(sd_bus_message *message, char *dump) { _cleanup_close_ int fd = -EBADF; - fd = acquire_data_fd(dump, strlen(dump), 0); + fd = acquire_data_fd(dump); if (fd < 0) return fd; @@ -1621,10 +1598,10 @@ static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error * return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ /* Write a log message noting the unit or process who requested the Reload() */ - log_caller(message, m, "Reloading"); + log_caller(message, m, "Reload"); /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. 
*/ - if (!ratelimit_below(&m->reload_ratelimit)) { + if (!ratelimit_below(&m->reload_reexec_ratelimit)) { log_warning("Reloading request rejected due to rate limit."); return sd_bus_error_setf(error, SD_BUS_ERROR_LIMITS_EXCEEDED, @@ -1667,7 +1644,15 @@ static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_erro return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ /* Write a log message noting the unit or process who requested the Reexecute() */ - log_caller(message, m, "Reexecuting"); + log_caller(message, m, "Reexecution"); + + /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */ + if (!ratelimit_below(&m->reload_reexec_ratelimit)) { + log_warning("Reexecution request rejected due to rate limit."); + return sd_bus_error_setf(error, + SD_BUS_ERROR_LIMITS_EXCEEDED, + "Reexecute() request rejected due to rate limit."); + } /* We don't send a reply back here, the client should * just wait for us disconnecting. */ @@ -2329,85 +2314,53 @@ static int send_unit_files_changed(sd_bus *bus, void *userdata) { return sd_bus_send(bus, message, NULL); } -/* Create an error reply, using the error information from changes[] - * if possible, and fall back to generating an error from error code c. - * The error message only describes the first error. 
- */ +static void manager_unit_files_changed(Manager *m, const InstallChange *changes, size_t n_changes) { + int r; + + assert(m); + assert(changes || n_changes == 0); + + if (!install_changes_have_modification(changes, n_changes)) + return; + + /* See comments for this variable in manager.h */ + m->unit_file_state_outdated = true; + + r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL); + if (r < 0) + log_debug_errno(r, "Failed to send UnitFilesChanged signal, ignoring: %m"); +} + static int install_error( sd_bus_error *error, int c, InstallChange *changes, size_t n_changes) { - CLEANUP_ARRAY(changes, n_changes, install_changes_free); + int r; - for (size_t i = 0; i < n_changes; i++) + /* Create an error reply, using the error information from changes[] if possible, and fall back to + * generating an error from error code c. The error message only describes the first error. */ - /* When making changes here, make sure to also change install_changes_dump() in install.c. */ + assert(changes || n_changes == 0); - switch (changes[i].type) { - case 0 ... 
_INSTALL_CHANGE_TYPE_MAX: /* not errors */ - break; + CLEANUP_ARRAY(changes, n_changes, install_changes_free); - case -EEXIST: - if (changes[i].source) - return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, - "File %s already exists and is a symlink to %s.", - changes[i].path, changes[i].source); - return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, - "File %s already exists.", - changes[i].path); - - case -ERFKILL: - return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, - "Unit file %s is masked.", changes[i].path); - - case -EADDRNOTAVAIL: - return sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED, - "Unit %s is transient or generated.", changes[i].path); - - case -ETXTBSY: - return sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH, - "File %s is under the systemd unit hierarchy already.", changes[i].path); - - case -EBADSLT: - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - "Invalid specifier in %s.", changes[i].path); - - case -EIDRM: - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - "Destination unit %s is a non-template unit.", changes[i].path); - - case -EUCLEAN: - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - "\"%s\" is not a valid unit name.", - changes[i].path); - - case -ELOOP: - return sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED, - "Refusing to operate on alias name or linked unit file: %s", - changes[i].path); - - case -EXDEV: - if (changes[i].source) - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - "Cannot alias %s as %s.", - changes[i].source, changes[i].path); - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - "Invalid unit reference %s.", changes[i].path); - - case -ENOENT: - return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, - "Unit file %s does not exist.", changes[i].path); + FOREACH_ARRAY(i, changes, n_changes) { + _cleanup_free_ char *err_message = NULL; + const char *bus_error; - case -EUNATCH: - return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, - 
"Cannot resolve specifiers in %s.", changes[i].path); + if (i->type >= 0) + continue; - default: - assert(changes[i].type < 0); /* other errors */ - return sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path); - } + r = install_change_dump_error(i, &err_message, &bus_error); + if (r == -ENOMEM) + return r; + if (r < 0) + return sd_bus_error_set_errnof(error, r, "File %s: %m", i->path); + + return sd_bus_error_set(error, bus_error, err_message); + } return c < 0 ? c : -EINVAL; } @@ -2426,12 +2379,6 @@ static int reply_install_changes_and_free( CLEANUP_ARRAY(changes, n_changes, install_changes_free); - if (install_changes_have_modification(changes, n_changes)) { - r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL); - if (r < 0) - log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m"); - } - r = sd_bus_message_new_method_return(message, &reply); if (r < 0) return r; @@ -2446,18 +2393,17 @@ static int reply_install_changes_and_free( if (r < 0) return r; - for (size_t i = 0; i < n_changes; i++) { - - if (changes[i].type < 0) { + FOREACH_ARRAY(i, changes, n_changes) { + if (i->type < 0) { bad = true; continue; } r = sd_bus_message_append( reply, "(sss)", - install_change_type_to_string(changes[i].type), - changes[i].path, - changes[i].source); + install_change_type_to_string(i->type), + i->path, + i->source); if (r < 0) return r; @@ -2521,7 +2467,7 @@ static int method_enable_unit_files_generic( return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2594,7 +2540,7 @@ static int method_preset_unit_files_with_mode(sd_bus_message *message, void *use return 1; /* No 
authorization for now, but the async polkit stuff will call us again when it has it */ r = unit_file_preset(m->runtime_scope, flags, NULL, l, preset_mode, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2648,7 +2594,7 @@ static int method_disable_unit_files_generic( return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2691,7 +2637,7 @@ static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ r = unit_file_revert(m->runtime_scope, NULL, l, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2722,6 +2668,7 @@ static int method_set_default_target(sd_bus_message *message, void *userdata, sd return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ r = unit_file_set_default(m->runtime_scope, force ? 
UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes); + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2764,7 +2711,7 @@ static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ r = unit_file_preset_all(m->runtime_scope, flags, NULL, preset_mode, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2804,7 +2751,7 @@ static int method_add_dependency_unit_files(sd_bus_message *message, void *userd return -EINVAL; r = unit_file_add_dependency(m->runtime_scope, flags, NULL, l, target, dep, &changes, &n_changes); - m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */ + manager_unit_files_changed(m, changes, n_changes); if (r < 0) return install_error(error, r, changes, n_changes); @@ -2933,6 +2880,175 @@ static int method_dump_unit_descriptor_store(sd_bus_message *message, void *user return method_generic_unit_operation(message, userdata, error, bus_service_method_dump_file_descriptor_store, 0); } +static int aux_scope_from_message(Manager *m, sd_bus_message *message, Unit **ret_scope, sd_bus_error *error) { + _cleanup_(pidref_done) PidRef sender_pidref = PIDREF_NULL; + _cleanup_free_ PidRef *pidrefs = NULL; + const char *name; + Unit *from, *scope; + PidRef *main_pid; + CGroupContext *cc; + size_t n_pids = 0; + uint64_t flags; + int r; + + assert(ret_scope); + + r = bus_query_sender_pidref(message, &sender_pidref); + if (r < 0) + return r; + + from = manager_get_unit_by_pidref(m, &sender_pidref); + if (!from) + return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of 
any unit."); + + if (!IN_SET(from->type, UNIT_SERVICE, UNIT_SCOPE)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Starting auxiliary scope is supported only for service and scope units, refusing."); + + if (!unit_name_is_valid(from->id, UNIT_NAME_PLAIN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Auxiliary scope can be started only for non-template service units and scope units, refusing."); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + if (!unit_name_is_valid(name, UNIT_NAME_PLAIN)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Invalid name \"%s\" for auxiliary scope.", name); + + if (unit_name_to_type(name) != UNIT_SCOPE) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, + "Name \"%s\" of auxiliary scope doesn't have .scope suffix.", name); + + main_pid = unit_main_pid(from); + + r = sd_bus_message_enter_container(message, 'a', "h"); + if (r < 0) + return r; + + for (;;) { + _cleanup_(pidref_done) PidRef p = PIDREF_NULL; + Unit *unit; + int fd; + + r = sd_bus_message_read(message, "h", &fd); + if (r < 0) + return r; + if (r == 0) + break; + + r = pidref_set_pidfd(&p, fd); + if (r < 0) { + log_unit_warning_errno(from, r, "Failed to create process reference from PIDFD, ignoring: %m"); + continue; + } + + unit = manager_get_unit_by_pidref(m, &p); + if (!unit) { + log_unit_warning(from, "Failed to get unit from PIDFD, ignoring."); + continue; + } + + if (!streq(unit->id, from->id)) { + log_unit_warning(from, "PID " PID_FMT " is not running in the same service as the calling process, ignoring.", p.pid); + continue; + } + + if (pidref_equal(main_pid, &p)) { + log_unit_warning(from, "Main PID cannot be migrated into auxiliary scope, ignoring."); + continue; + } + + if (!GREEDY_REALLOC(pidrefs, n_pids+1)) + return -ENOMEM; + + pidrefs[n_pids++] = TAKE_PIDREF(p); + } + + if (n_pids == 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "No processes can be 
migrated to auxiliary scope."); + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "t", &flags); + if (r < 0) + return r; + + if (flags != 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero."); + + r = manager_load_unit(m, name, NULL, error, &scope); + if (r < 0) + return r; + + if (!unit_is_pristine(scope)) + return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS, + "Unit %s was already loaded or has a fragment file.", name); + + r = unit_set_slice(scope, UNIT_GET_SLICE(from)); + if (r < 0) + return r; + + cc = unit_get_cgroup_context(scope); + + r = cgroup_context_copy(cc, unit_get_cgroup_context(from)); + if (r < 0) + return r; + + r = unit_make_transient(scope); + if (r < 0) + return r; + + r = bus_unit_set_properties(scope, message, UNIT_RUNTIME, true, error); + if (r < 0) + return r; + + FOREACH_ARRAY(p, pidrefs, n_pids) { + r = unit_pid_attachable(scope, p, error); + if (r < 0) + return r; + + r = unit_watch_pidref(scope, p, /* exclusive= */ false); + if (r < 0 && r != -EEXIST) + return r; + } + + /* Now load the missing bits of the unit we just created */ + unit_add_to_load_queue(scope); + manager_dispatch_load_queue(m); + + *ret_scope = TAKE_PTR(scope); + + return 1; +} + +static int method_start_aux_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = ASSERT_PTR(userdata); + Unit *u = NULL; /* avoid false maybe-uninitialized warning */ + int r; + + assert(message); + + r = mac_selinux_access_check(message, "start", error); + if (r < 0) + return r; + + r = bus_verify_manage_units_async(m, message, error); + if (r < 0) + return r; + if (r == 0) + return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ + + r = aux_scope_from_message(m, message, &u, error); + if (r < 0) + return r; + + return bus_unit_queue_job(message, u, JOB_START, JOB_REPLACE, 0, error); +} + const sd_bus_vtable 
bus_manager_vtable[] = { SD_BUS_VTABLE_START(0), @@ -2948,6 +3064,7 @@ const sd_bus_vtable bus_manager_vtable[] = { BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST), BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST), BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), + BUS_PROPERTY_DUAL_TIMESTAMP("ShutdownStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SHUTDOWN_START]), SD_BUS_VTABLE_PROPERTY_CONST), BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST), BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST), BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST), @@ -3045,6 +3162,7 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("SoftRebootsCount", "u", bus_property_get_unsigned, offsetof(Manager, soft_reboots_count), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_METHOD_WITH_ARGS("GetUnit", SD_BUS_ARGS("s", name), @@ -3491,6 +3609,11 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_RESULT("a(suuutuusu)", entries), method_dump_unit_descriptor_store, SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_METHOD_WITH_ARGS("StartAuxiliaryScope", + SD_BUS_ARGS("s", name, "ah", pidfds, "t", flags, "a(sv)", properties), + SD_BUS_RESULT("o", job), + method_start_aux_scope, + SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_SIGNAL_WITH_ARGS("UnitNew", SD_BUS_ARGS("s", id, "o", unit), diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c index 7dbbdd0..f6a9ea9 100644 --- a/src/core/dbus-mount.c +++ b/src/core/dbus-mount.c @@ -6,6 +6,7 @@ #include "dbus-kill.h" #include "dbus-mount.h" #include "dbus-util.h" +#include "fstab-util.h" #include "mount.h" #include "string-util.h" #include "unit.h" @@ -62,7 +63,7 @@ const sd_bus_vtable bus_mount_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), - SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), @@ -88,6 +89,7 @@ static int bus_mount_set_transient_property( sd_bus_error *error) { Unit *u = UNIT(m); + int r; assert(m); assert(name); @@ -98,8 +100,31 @@ static int bus_mount_set_transient_property( if (streq(name, "Where")) return bus_set_transient_path(u, name, &m->where, message, flags, error); - if (streq(name, "What")) - return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error); + if (streq(name, "What")) { + _cleanup_free_ char *path = NULL; + const char *v; + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + if (!isempty(v)) { + path 
= fstab_node_to_udev_node(v); + if (!path) + return -ENOMEM; + + /* path_is_valid is not used - see the comment for config_parse_mount_node */ + if (strlen(path) >= PATH_MAX) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Resolved What=%s too long", path); + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + free_and_replace(m->parameters_fragment.what, path); + unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "What=%s", strempty(m->parameters_fragment.what)); + } + + return 1; + } if (streq(name, "Options")) return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error); diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c index 78196a1..165aa65 100644 --- a/src/core/dbus-scope.c +++ b/src/core/dbus-scope.c @@ -3,6 +3,7 @@ #include "alloc-util.h" #include "bus-common-errors.h" #include "bus-get-properties.h" +#include "bus-util.h" #include "dbus-cgroup.h" #include "dbus-kill.h" #include "dbus-manager.h" @@ -84,7 +85,7 @@ static int bus_scope_set_transient_property( return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error); if (streq(name, "PIDs")) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_(pidref_done) PidRef sender_pidref = PIDREF_NULL; unsigned n = 0; r = sd_bus_message_enter_container(message, 'a', "u"); @@ -94,7 +95,7 @@ static int bus_scope_set_transient_property( for (;;) { _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; uint32_t upid; - pid_t pid; + PidRef *p; r = sd_bus_message_read(message, "u", &upid); if (r < 0) @@ -103,28 +104,27 @@ static int bus_scope_set_transient_property( break; if (upid == 0) { - if (!creds) { - r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (!pidref_is_set(&sender_pidref)) { + r = bus_query_sender_pidref(message, &sender_pidref); if (r < 0) return r; } - r = sd_bus_creds_get_pid(creds, &pid); + p = &sender_pidref; + } else { + r = pidref_set_pid(&pidref, upid); if (r < 0) return r; - } 
else - pid = (uid_t) upid; - r = pidref_set_pid(&pidref, pid); - if (r < 0) - return r; + p = &pidref; + } - r = unit_pid_attachable(u, &pidref, error); + r = unit_pid_attachable(u, p, error); if (r < 0) return r; if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - r = unit_watch_pidref(u, &pidref, /* exclusive= */ false); + r = unit_watch_pidref(u, p, /* exclusive= */ false); if (r < 0 && r != -EEXIST) return r; } diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index cc478f4..ff970df 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -166,9 +166,7 @@ static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_ r = bus_verify_manage_units_async_full( u, is_image ? "mount-image" : "bind-mount", - CAP_SYS_ADMIN, N_("Authentication is required to mount on '$(unit)'."), - true, message, error); if (r < 0) diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c index e77e9e5..03c5b4a 100644 --- a/src/core/dbus-socket.c +++ b/src/core/dbus-socket.c @@ -86,6 +86,7 @@ const sd_bus_vtable bus_socket_vtable[] = { SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PassFileDescriptorsToExec", "b", bus_property_get_bool, offsetof(Socket, pass_fds_to_exec), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST), @@ -190,6 +191,9 @@ static int 
bus_socket_set_transient_property( if (streq(name, "PassCredentials")) return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error); + if (streq(name, "PassFileDescriptorsToExec")) + return bus_set_transient_bool(u, name, &s->pass_fds_to_exec, message, flags, error); + if (streq(name, "PassSecurity")) return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 1a037b7..953cd51 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -7,6 +7,7 @@ #include "bus-common-errors.h" #include "bus-get-properties.h" #include "bus-polkit.h" +#include "bus-util.h" #include "cgroup-util.h" #include "condition.h" #include "dbus-job.h" @@ -177,7 +178,7 @@ static int property_get_dependencies( return sd_bus_message_close_container(reply); } -static int property_get_requires_mounts_for( +static int property_get_mounts_for( sd_bus *bus, const char *path, const char *interface, @@ -408,9 +409,7 @@ int bus_unit_method_start_generic( r = bus_verify_manage_units_async_full( u, verb, - CAP_SYS_ADMIN, polkit_message_for_job[job_type], - true, message, error); if (r < 0) @@ -491,9 +490,7 @@ int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_ r = bus_verify_manage_units_async_full( u, jtype, - CAP_SYS_ADMIN, polkit_message_for_job[type], - true, message, error); if (r < 0) @@ -549,9 +546,7 @@ int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error * r = bus_verify_manage_units_async_full( u, "kill", - CAP_KILL, N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."), - true, message, error); if (r < 0) @@ -579,9 +574,7 @@ int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus r = bus_verify_manage_units_async_full( u, "reset-failed", - CAP_SYS_ADMIN, N_("Authentication is required to reset the \"failed\" state of '$(unit)'."), - true, message, error); if (r < 0) @@ -611,9 +604,7 @@ 
int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_b r = bus_verify_manage_units_async_full( u, "set-property", - CAP_SYS_ADMIN, N_("Authentication is required to set properties on '$(unit)'."), - true, message, error); if (r < 0) @@ -641,9 +632,7 @@ int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *e r = bus_verify_manage_units_async_full( u, "ref", - CAP_SYS_ADMIN, - NULL, - false, + /* polkit_message= */ NULL, message, error); if (r < 0) @@ -712,9 +701,7 @@ int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error r = bus_verify_manage_units_async_full( u, "clean", - CAP_DAC_OVERRIDE, N_("Authentication is required to delete files and directories associated with '$(unit)'."), - true, message, error); if (r < 0) @@ -736,22 +723,13 @@ int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error } static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) { - const char* perm; - int (*method)(Unit*); Unit *u = ASSERT_PTR(userdata); - bool reply_no_delay = false; int r; assert(message); assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); - if (action == FREEZER_FREEZE) { - perm = "stop"; - method = unit_freeze; - } else { - perm = "start"; - method = unit_thaw; - } + const char *perm = action == FREEZER_FREEZE ? 
"stop" : "start"; r = mac_selinux_unit_access_check(u, message, perm, error); if (r < 0) @@ -760,9 +738,7 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda r = bus_verify_manage_units_async_full( u, perm, - CAP_SYS_ADMIN, N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."), - true, message, error); if (r < 0) @@ -770,19 +746,21 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda if (r == 0) return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */ - r = method(u); + r = unit_freezer_action(u, action); if (r == -EOPNOTSUPP) - return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id); + return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit does not support freeze/thaw"); if (r == -EBUSY) - return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job."); + return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job"); if (r == -EHOSTDOWN) - return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive."); + return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is not active"); if (r == -EALREADY) - return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id); + return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Previously requested freezer operation for unit is still in progress"); + if (r == -ECHILD) + return sd_bus_error_set(error, SD_BUS_ERROR_FAILED, "Unit is frozen by a parent slice"); if (r < 0) return r; - if (r == 0) - reply_no_delay = true; + + bool reply_now = r == 0; if (u->pending_freezer_invocation) { bus_unit_send_pending_freezer_message(u, true); @@ -791,7 +769,7 @@ static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userda u->pending_freezer_invocation = sd_bus_message_ref(message); - if (reply_no_delay) 
{ + if (reply_now) { r = bus_unit_send_pending_freezer_message(u, false); if (r < 0) return r; @@ -879,7 +857,8 @@ const sd_bus_vtable bus_unit_vtable[] = { SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_mounts_for, offsetof(Unit, mounts_for[UNIT_MOUNT_REQUIRES]), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("WantsMountsFor", "as", property_get_mounts_for, offsetof(Unit, mounts_for[UNIT_MOUNT_WANTS]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1235,12 +1214,32 @@ static int property_get_cgroup( * indicates the root cgroup, which we report as "/". c) all * other cases we report as-is. */ - if (u->cgroup_path) - t = empty_to_root(u->cgroup_path); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + + if (crt && crt->cgroup_path) + t = empty_to_root(crt->cgroup_path); return sd_bus_message_append(reply, "s", t); } +static int property_get_cgroup_id( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = ASSERT_PTR(userdata); + + assert(bus); + assert(reply); + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + return sd_bus_message_append(reply, "t", crt ? 
crt->cgroup_id : UINT64_C(0)); +} + static int append_process(sd_bus_message *reply, const char *p, PidRef *pid, Set *pids) { _cleanup_free_ char *buf = NULL, *cmdline = NULL; int r; @@ -1299,7 +1298,7 @@ static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) { * threaded domain cgroup contains the PIDs of all processes in the subtree and is not * readable in the subtree proper. */ - r = cg_read_pidref(f, &pidref); + r = cg_read_pidref(f, &pidref, /* flags = */ 0); if (IN_SET(r, 0, -EOPNOTSUPP)) break; if (r < 0) @@ -1369,8 +1368,10 @@ int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bu if (r < 0) return r; - if (u->cgroup_path) { - r = append_cgroup(reply, u->cgroup_path, pids); + CGroupRuntime *crt; + crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_path) { + r = append_cgroup(reply, crt->cgroup_path, pids); if (r < 0) return r; } @@ -1441,6 +1442,28 @@ static int property_get_io_counter( return sd_bus_message_append(reply, "t", value); } +static int property_get_effective_limit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t value = CGROUP_LIMIT_MAX; + Unit *u = ASSERT_PTR(userdata); + ssize_t type; + + assert(bus); + assert(reply); + assert(property); + + assert_se((type = cgroup_effective_limit_type_from_string(property)) >= 0); + (void) unit_get_effective_limit(u, type, &value); + return sd_bus_message_append(reply, "t", value); +} + int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; _cleanup_set_free_ Set *pids = NULL; @@ -1478,7 +1501,7 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing."); - r = 
sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID|SD_BUS_CREDS_PIDFD, &creds); if (r < 0) return r; @@ -1489,7 +1512,6 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd _cleanup_(pidref_freep) PidRef *pidref = NULL; uid_t process_uid, sender_uid; uint32_t upid; - pid_t pid; r = sd_bus_message_read(message, "u", &upid); if (r < 0) @@ -1498,13 +1520,14 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd break; if (upid == 0) { - r = sd_bus_creds_get_pid(creds, &pid); + _cleanup_(pidref_done) PidRef p = PIDREF_NULL; + r = bus_creds_get_pidref(creds, &p); if (r < 0) return r; - } else - pid = (uid_t) upid; - r = pidref_new_from_pid(pid, &pidref); + r = pidref_dup(&p, &pidref); + } else + r = pidref_new_from_pid(upid, &pidref); if (r < 0) return r; @@ -1530,9 +1553,9 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m"); if (process_uid != sender_uid) - return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid); + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pidref->pid); if (process_uid != u->ref_uid) - return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid); + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. 
Refusing.", pidref->pid); } r = set_ensure_consume(&pids, &pidref_hash_ops_free, TAKE_PTR(pidref)); @@ -1555,17 +1578,20 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0), - SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0), + SD_BUS_PROPERTY("ControlGroupId", "t", property_get_cgroup_id, 0, 0), SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), SD_BUS_PROPERTY("MemoryPeak", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemorySwapCurrent", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryMax", "t", property_get_effective_limit, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryHigh", "t", property_get_effective_limit, 0, 0), SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0), SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0), SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("EffectiveTasksMax", "t", property_get_effective_limit, 0, 0), SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), @@ -1576,16 +1602,16 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0), SD_BUS_METHOD_WITH_ARGS("GetProcesses", - SD_BUS_NO_ARGS, - SD_BUS_ARGS("a(sus)", processes), - bus_unit_method_get_processes, - SD_BUS_VTABLE_UNPRIVILEGED), + 
SD_BUS_NO_ARGS, + SD_BUS_ARGS("a(sus)", processes), + bus_unit_method_get_processes, + SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD_WITH_ARGS("AttachProcesses", - SD_BUS_ARGS("s", subcgroup, "au", pids), - SD_BUS_NO_RESULT, - bus_unit_method_attach_processes, - SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_ARGS("s", subcgroup, "au", pids), + SD_BUS_NO_RESULT, + bus_unit_method_attach_processes, + SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_VTABLE_END }; @@ -2210,7 +2236,7 @@ static int bus_unit_set_transient_property( return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error); if (streq(name, "JobTimeoutRebootArgument")) - return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error); + return bus_set_transient_reboot_parameter(u, name, &u->job_timeout_reboot_arg, message, flags, error); if (streq(name, "StartLimitIntervalUSec")) return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error); @@ -2234,7 +2260,7 @@ static int bus_unit_set_transient_property( return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error); if (streq(name, "RebootArgument")) - return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error); + return bus_set_transient_reboot_parameter(u, name, &u->reboot_arg, message, flags, error); if (streq(name, "CollectMode")) return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error); @@ -2261,7 +2287,9 @@ static int bus_unit_set_transient_property( u->documentation = strv_free(u->documentation); unit_write_settingf(u, flags, name, "%s=", name); } else { - strv_extend_strv(&u->documentation, l, false); + r = strv_extend_strv(&u->documentation, l, /* filter_duplicates= */ false); + if (r < 0) + return r; STRV_FOREACH(p, l) unit_write_settingf(u, flags, name, "%s=%s", name, *p); @@ -2308,7 +2336,7 @@ static int bus_unit_set_transient_property( return 1; - } else if (streq(name, 
"RequiresMountsFor")) { + } else if (STR_IN_SET(name, "RequiresMountsFor", "WantsMountsFor")) { _cleanup_strv_free_ char **l = NULL; r = sd_bus_message_read_strv(message, &l); @@ -2328,9 +2356,9 @@ static int bus_unit_set_transient_property( return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p); if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, *p, UNIT_DEPENDENCY_FILE, unit_mount_dependency_type_from_string(name)); if (r < 0) - return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p); + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add requested mount \"%s\": %m", *p); unit_write_settingf(u, flags, name, "%s=%s", name, *p); } diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c index d680a64..b871d89 100644 --- a/src/core/dbus-util.c +++ b/src/core/dbus-util.c @@ -6,6 +6,7 @@ #include "escape.h" #include "parse-util.h" #include "path-util.h" +#include "reboot-util.h" #include "unit-printf.h" #include "user-util.h" #include "unit.h" @@ -39,6 +40,7 @@ static bool valid_user_group_name_or_id_relaxed(const char *u) { BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed); BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute); +BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(reboot_parameter, reboot_parameter_is_valid); int bus_set_transient_string( Unit *u, @@ -151,9 +153,7 @@ int bus_set_transient_usec_internal( int bus_verify_manage_units_async_full( Unit *u, const char *verb, - int capability, const char *polkit_message, - bool interactive, sd_bus_message *call, sd_bus_error *error) { @@ -171,11 +171,8 @@ int bus_verify_manage_units_async_full( return bus_verify_polkit_async( call, - capability, "org.freedesktop.systemd1.manage-units", details, - interactive, - UID_INVALID, &u->manager->polkit_registry, 
error); } diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h index 9464b25..0fc3a94 100644 --- a/src/core/dbus-util.h +++ b/src/core/dbus-util.h @@ -239,6 +239,7 @@ int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_messag int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_reboot_parameter(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); @@ -249,7 +250,7 @@ static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, s static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { return bus_set_transient_usec_internal(u, name, p, true, message, flags, error); } -int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error); +int bus_verify_manage_units_async_full(Unit *u, const char *verb, const char *polkit_message, sd_bus_message *call, sd_bus_error *error); int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char 
*separator); diff --git a/src/core/dbus.c b/src/core/dbus.c index ba2cec4..1c6f6fc 100644 --- a/src/core/dbus.c +++ b/src/core/dbus.c @@ -232,6 +232,8 @@ static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_er return 0; path = sd_bus_message_get_path(message); + if (!path) + return 0; if (object_path_startswith("/org/freedesktop/systemd1", path)) { r = mac_selinux_access_check(message, verb, error); @@ -241,25 +243,20 @@ static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_er return 0; } - if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - pid_t pid; - - r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); - if (r < 0) - return 0; + if (streq(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - r = sd_bus_creds_get_pid(creds, &pid); + r = bus_query_sender_pidref(message, &pidref); if (r < 0) return 0; - u = manager_get_unit_by_pid(m, pid); + u = manager_get_unit_by_pidref(m, &pidref); } else { r = manager_get_job_from_dbus_path(m, path, &j); if (r >= 0) u = j->unit; else - manager_load_unit_from_dbus_path(m, path, NULL, &u); + (void) manager_load_unit_from_dbus_path(m, path, NULL, &u); } if (!u) return 0; @@ -280,24 +277,19 @@ static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_ assert(bus); assert(path); - if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) { - _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + if (streq(path, "/org/freedesktop/systemd1/unit/self")) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; sd_bus_message *message; - pid_t pid; message = sd_bus_get_current_message(bus); if (!message) return 0; - r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); - if (r < 0) - return r; - - r = sd_bus_creds_get_pid(creds, &pid); + r = bus_query_sender_pidref(message, &pidref); if (r < 0) return r; - u = 
manager_get_unit_by_pid(m, pid); + u = manager_get_unit_by_pidref(m, &pidref); if (!u) return 0; } else { @@ -739,7 +731,7 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void log_debug("Accepting direct incoming connection from " PID_FMT " (%s) [%s]", pid, strna(comm), strna(description)); } - r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC); if (r < 0) { log_warning_errno(r, "Failed to attach new connection bus to event loop: %m"); return 0; @@ -847,7 +839,7 @@ int bus_init_api(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to connect to API bus: %m"); - r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC); if (r < 0) return log_error_errno(r, "Failed to attach API bus to event loop: %m"); @@ -904,7 +896,7 @@ int bus_init_system(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to connect to system bus: %m"); - r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL); + r = sd_bus_attach_event(bus, m->event, EVENT_PRIORITY_IPC); if (r < 0) return log_error_errno(r, "Failed to attach system bus to event loop: %m"); @@ -1073,7 +1065,7 @@ void bus_done(Manager *m) { assert(!m->subscribed); m->deserialized_subscribed = strv_free(m->deserialized_subscribed); - bus_verify_polkit_async_registry_free(m->polkit_registry); + m->polkit_registry = hashmap_free(m->polkit_registry); } int bus_fdset_add_all(Manager *m, FDSet *fds) { @@ -1121,31 +1113,29 @@ int bus_foreach_bus( int (*send_message)(sd_bus *bus, void *userdata), void *userdata) { - sd_bus *b; - int r, ret = 0; + int r = 0; + + assert(m); + assert(send_message); /* Send to all direct buses, unconditionally */ + sd_bus *b; SET_FOREACH(b, m->private_buses) { /* Don't bother with enqueuing these messages to clients that haven't started yet */ if (sd_bus_is_ready(b) <= 0) continue; - r = send_message(b, 
userdata); - if (r < 0) - ret = r; + RET_GATHER(r, send_message(b, userdata)); } /* Send to API bus, but only if somebody is subscribed */ if (m->api_bus && (sd_bus_track_count(m->subscribed) > 0 || - sd_bus_track_count(subscribed2) > 0)) { - r = send_message(m->api_bus, userdata); - if (r < 0) - ret = r; - } + sd_bus_track_count(subscribed2) > 0)) + RET_GATHER(r, send_message(m->api_bus, userdata)); - return ret; + return r; } void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) { @@ -1189,22 +1179,46 @@ int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) { } int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { - return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error); + return bus_verify_polkit_async( + call, + "org.freedesktop.systemd1.manage-units", + /* details= */ NULL, + &m->polkit_registry, + error); } int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { - return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error); + return bus_verify_polkit_async( + call, + "org.freedesktop.systemd1.manage-unit-files", + /* details= */ NULL, + &m->polkit_registry, + error); } int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { - return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error); + return bus_verify_polkit_async( + call, + "org.freedesktop.systemd1.reload-daemon", + /* details= */ NULL, + &m->polkit_registry, error); } int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { - return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, 
&m->polkit_registry, error); + return bus_verify_polkit_async( + call, + "org.freedesktop.systemd1.set-environment", + /* details= */ NULL, + &m->polkit_registry, + error); } int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) { - return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error); + return bus_verify_polkit_async( + call, + "org.freedesktop.systemd1.bypass-dump-ratelimit", + /* details= */ NULL, + &m->polkit_registry, + error); } uint64_t manager_bus_n_queued_write(Manager *m) { diff --git a/src/core/device.c b/src/core/device.c index 6b2d7c3..d856767 100644 --- a/src/core/device.c +++ b/src/core/device.c @@ -119,10 +119,9 @@ static int device_set_sysfs(Device *d, const char *sysfs) { } static void device_init(Unit *u) { - Device *d = DEVICE(u); + Device *d = ASSERT_PTR(DEVICE(u)); - assert(d); - assert(UNIT(d)->load_state == UNIT_STUB); + assert(u->load_state == UNIT_STUB); /* In contrast to all other unit types we timeout jobs waiting * for devices by default. This is because they otherwise wait @@ -137,9 +136,7 @@ static void device_init(Unit *u) { } static void device_done(Unit *u) { - Device *d = DEVICE(u); - - assert(d); + Device *d = ASSERT_PTR(DEVICE(u)); device_unset_sysfs(d); d->deserialized_sysfs = mfree(d->deserialized_sysfs); @@ -258,9 +255,8 @@ static void device_update_found_by_name(Manager *m, const char *path, DeviceFoun } static int device_coldplug(Unit *u) { - Device *d = DEVICE(u); + Device *d = ASSERT_PTR(DEVICE(u)); - assert(d); assert(d->state == DEVICE_DEAD); /* First, let's put the deserialized state and found mask into effect, if we have it. 
*/ @@ -336,9 +332,7 @@ static int device_coldplug(Unit *u) { } static void device_catchup(Unit *u) { - Device *d = DEVICE(u); - - assert(d); + Device *d = ASSERT_PTR(DEVICE(u)); /* Second, let's update the state with the enumerated state */ device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK); @@ -405,11 +399,9 @@ static int device_found_from_string_many(const char *name, DeviceFound *ret) { } static int device_serialize(Unit *u, FILE *f, FDSet *fds) { + Device *d = ASSERT_PTR(DEVICE(u)); _cleanup_free_ char *s = NULL; - Device *d = DEVICE(u); - assert(d); - assert(u); assert(f); assert(fds); @@ -428,11 +420,9 @@ static int device_serialize(Unit *u, FILE *f, FDSet *fds) { } static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Device *d = DEVICE(u); + Device *d = ASSERT_PTR(DEVICE(u)); int r; - assert(d); - assert(u); assert(key); assert(value); assert(fds); @@ -472,10 +462,11 @@ static int device_deserialize_item(Unit *u, const char *key, const char *value, } static void device_dump(Unit *u, FILE *f, const char *prefix) { - Device *d = DEVICE(u); + Device *d = ASSERT_PTR(DEVICE(u)); _cleanup_free_ char *s = NULL; - assert(d); + assert(f); + assert(prefix); (void) device_found_to_string_many(d->found, &s); @@ -495,15 +486,15 @@ static void device_dump(Unit *u, FILE *f, const char *prefix) { } static UnitActiveState device_active_state(Unit *u) { - assert(u); + Device *d = ASSERT_PTR(DEVICE(u)); - return state_translation_table[DEVICE(u)->state]; + return state_translation_table[d->state]; } static const char *device_sub_state_to_string(Unit *u) { - assert(u); + Device *d = ASSERT_PTR(DEVICE(u)); - return device_state_to_string(DEVICE(u)->state); + return device_state_to_string(d->state); } static int device_update_description(Unit *u, sd_device *dev, const char *path) { @@ -538,12 +529,11 @@ static int device_update_description(Unit *u, sd_device *dev, const char *path) } static int device_add_udev_wants(Unit 
*u, sd_device *dev) { + Device *d = ASSERT_PTR(DEVICE(u)); _cleanup_strv_free_ char **added = NULL; const char *wants, *property; - Device *d = DEVICE(u); int r; - assert(d); assert(dev); property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS"; @@ -646,6 +636,8 @@ static void device_upgrade_mount_deps(Unit *u) { /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */ + assert(u); + HASHMAP_FOREACH_KEY(v, other, unit_get_dependencies(u, UNIT_REQUIRED_BY)) { if (other->type != UNIT_MOUNT) continue; @@ -706,16 +698,18 @@ static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool unit_add_to_load_queue(u); } - if (!DEVICE(u)->path) { - DEVICE(u)->path = strdup(path); - if (!DEVICE(u)->path) + Device *d = ASSERT_PTR(DEVICE(u)); + + if (!d->path) { + d->path = strdup(path); + if (!d->path) return log_oom(); } /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be * initialized. Hence initialize it if necessary. */ if (sysfs) { - r = device_set_sysfs(DEVICE(u), sysfs); + r = device_set_sysfs(d, sysfs); if (r < 0) return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs); @@ -730,11 +724,11 @@ static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool * by systemd before the device appears on its radar. In this case the device unit is partially * initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't * present. Fix this up now. 
*/ - if (dev && device_is_bound_by_mounts(DEVICE(u), dev)) + if (dev && device_is_bound_by_mounts(d, dev)) device_upgrade_mount_deps(u); if (units) { - r = set_ensure_put(units, NULL, DEVICE(u)); + r = set_ensure_put(units, NULL, d); if (r < 0) return log_unit_error_errno(u, r, "Failed to store unit: %m"); } @@ -950,10 +944,7 @@ static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set } static Unit *device_following(Unit *u) { - Device *d = DEVICE(u); - Device *first = NULL; - - assert(d); + Device *d = ASSERT_PTR(DEVICE(u)), *first = NULL; if (startswith(u->id, "sys-")) return NULL; @@ -973,16 +964,15 @@ static Unit *device_following(Unit *u) { return UNIT(first); } -static int device_following_set(Unit *u, Set **_set) { - Device *d = DEVICE(u); +static int device_following_set(Unit *u, Set **ret) { + Device *d = ASSERT_PTR(DEVICE(u)); _cleanup_set_free_ Set *set = NULL; int r; - assert(d); - assert(_set); + assert(ret); if (LIST_JUST_US(same_sysfs, d)) { - *_set = NULL; + *ret = NULL; return 0; } @@ -1002,7 +992,7 @@ static int device_following_set(Unit *u, Set **_set) { return r; } - *_set = TAKE_PTR(set); + *ret = TAKE_PTR(set); return 1; } @@ -1061,6 +1051,9 @@ static void device_enumerate(Manager *m) { _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL; Device *d; + if (device_is_processed(dev) <= 0) + continue; + if (device_setup_units(m, dev, &ready_units, &not_ready_units) < 0) continue; diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 2bf9094..11de2ba 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -20,7 +20,7 @@ #include "stdio-util.h" #include "string-util.h" #include "strv.h" -#include "uid-alloc-range.h" +#include "uid-classification.h" #include "user-util.h" /* Takes a value generated randomly or by hashing and turns it into a UID in the right range */ @@ -143,7 +143,6 @@ static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) } static int
make_uid_symlinks(uid_t uid, const char *name, bool b) { - char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1]; const char *path2; int r = 0, k; @@ -293,8 +292,8 @@ static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) { } /* Some superficial check whether this UID/GID might already be taken by some static user */ - if (getpwuid(candidate) || - getgrgid((gid_t) candidate) || + if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0 || + getgrgid_malloc((gid_t) candidate, /* ret= */ NULL) >= 0 || search_ipc(candidate, (gid_t) candidate) != 0) { (void) unlink(lock_path); continue; @@ -419,30 +418,26 @@ static int dynamic_user_realize( /* First, let's parse this as numeric UID */ r = parse_uid(d->name, &num); if (r < 0) { - struct passwd *p; - struct group *g; + _cleanup_free_ struct passwd *p = NULL; + _cleanup_free_ struct group *g = NULL; if (is_user) { /* OK, this is not a numeric UID. Let's see if there's a user by this name */ - p = getpwnam(d->name); - if (p) { + if (getpwnam_malloc(d->name, &p) >= 0) { num = p->pw_uid; gid = p->pw_gid; } else { /* if the user does not exist but the group with the same name exists, refuse operation */ - g = getgrnam(d->name); - if (g) + if (getgrnam_malloc(d->name, /* ret= */ NULL) >= 0) return -EILSEQ; } } else { /* Let's see if there's a group by this name */ - g = getgrnam(d->name); - if (g) + if (getgrnam_malloc(d->name, &g) >= 0) num = (uid_t) g->gr_gid; else { /* if the group does not exist but the user with the same name exists, refuse operation */ - p = getpwnam(d->name); - if (p) + if (getpwnam_malloc(d->name, /* ret= */ NULL) >= 0) return -EILSEQ; } } @@ -484,13 +479,12 @@ static int dynamic_user_realize( uid_lock_fd = new_uid_lock_fd; } } else if (is_user && !uid_is_dynamic(num)) { - struct passwd *p; + _cleanup_free_ struct passwd *p = NULL; /* Statically allocated user may have different uid and gid. So, let's obtain the gid. 
*/ - errno = 0; - p = getpwuid(num); - if (!p) - return errno_or_else(ESRCH); + r = getpwuid_malloc(num, &p); + if (r < 0) + return r; gid = p->pw_gid; } @@ -658,7 +652,7 @@ void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, Dyn /* Parse the serialization again, after a daemon reload */ - r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL); + r = extract_many_words(&value, NULL, 0, &name, &s0, &s1); if (r != 3 || !isempty(value)) { log_debug("Unable to parse dynamic user line."); return; @@ -761,7 +755,6 @@ int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) { int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret) { _cleanup_(dynamic_creds_unrefp) DynamicCreds *creds = NULL; - bool acquired = false; int r; assert(m); @@ -784,20 +777,14 @@ int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicC r = dynamic_user_acquire(m, user, &creds->user); if (r < 0) return r; - - acquired = true; } - if (creds->user && (!group || streq_ptr(user, group))) - creds->group = dynamic_user_ref(creds->user); - else if (group) { + if (group && !streq_ptr(user, group)) { r = dynamic_user_acquire(m, group, &creds->group); - if (r < 0) { - if (acquired) - creds->user = dynamic_user_unref(creds->user); + if (r < 0) return r; - } - } + } else + creds->group = ASSERT_PTR(dynamic_user_ref(creds->user)); *ret = TAKE_PTR(creds); diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c index e2cd931..dbda6e5 100644 --- a/src/core/emergency-action.c +++ b/src/core/emergency-action.c @@ -13,22 +13,22 @@ #include "virt.h" static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = { - [EMERGENCY_ACTION_NONE] = "none", - [EMERGENCY_ACTION_REBOOT] = "reboot", - [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force", - [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate", - [EMERGENCY_ACTION_POWEROFF] = "poweroff", - [EMERGENCY_ACTION_POWEROFF_FORCE] = 
"poweroff-force", + [EMERGENCY_ACTION_NONE] = "none", + [EMERGENCY_ACTION_EXIT] = "exit", + [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force", + [EMERGENCY_ACTION_REBOOT] = "reboot", + [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force", + [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate", + [EMERGENCY_ACTION_POWEROFF] = "poweroff", + [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force", [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate", - [EMERGENCY_ACTION_EXIT] = "exit", - [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force", - [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot", - [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force", - [EMERGENCY_ACTION_KEXEC] = "kexec", - [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force", - [EMERGENCY_ACTION_HALT] = "halt", - [EMERGENCY_ACTION_HALT_FORCE] = "halt-force", - [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate", + [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot", + [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force", + [EMERGENCY_ACTION_KEXEC] = "kexec", + [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force", + [EMERGENCY_ACTION_HALT] = "halt", + [EMERGENCY_ACTION_HALT_FORCE] = "halt-force", + [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate", }; static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) { @@ -216,7 +216,7 @@ int parse_emergency_action( if (x < 0) return -EINVAL; - if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION) + if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x > _EMERGENCY_ACTION_LAST_USER_ACTION) return -EOPNOTSUPP; *ret = x; diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h index 33e0ec6..6bec475 100644 --- a/src/core/emergency-action.h +++ b/src/core/emergency-action.h @@ -7,15 +7,15 @@ typedef enum EmergencyAction { EMERGENCY_ACTION_NONE, + EMERGENCY_ACTION_EXIT, + EMERGENCY_ACTION_EXIT_FORCE, + _EMERGENCY_ACTION_LAST_USER_ACTION = EMERGENCY_ACTION_EXIT_FORCE, 
EMERGENCY_ACTION_REBOOT, EMERGENCY_ACTION_REBOOT_FORCE, EMERGENCY_ACTION_REBOOT_IMMEDIATE, EMERGENCY_ACTION_POWEROFF, EMERGENCY_ACTION_POWEROFF_FORCE, EMERGENCY_ACTION_POWEROFF_IMMEDIATE, - EMERGENCY_ACTION_EXIT, - _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT, - EMERGENCY_ACTION_EXIT_FORCE, EMERGENCY_ACTION_SOFT_REBOOT, EMERGENCY_ACTION_SOFT_REBOOT_FORCE, EMERGENCY_ACTION_KEXEC, diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c index 6bcfb68..f4cff57 100644 --- a/src/core/exec-credential.c +++ b/src/core/exec-credential.c @@ -9,6 +9,7 @@ #include "fileio.h" #include "glob-util.h" #include "io-util.h" +#include "iovec-util.h" #include "label-util.h" #include "mkdir-label.h" #include "mount-util.h" @@ -48,6 +49,12 @@ DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free); +bool exec_params_need_credentials(const ExecParameters *p) { + assert(p); + + return p->flags & (EXEC_SETUP_CREDENTIALS|EXEC_SETUP_CREDENTIALS_FRESH); +} + bool exec_context_has_credentials(const ExecContext *c) { assert(c); @@ -56,16 +63,15 @@ bool exec_context_has_credentials(const ExecContext *c) { !set_isempty(c->import_credentials); } -bool exec_context_has_encrypted_credentials(ExecContext *c) { - ExecLoadCredential *load_cred; - ExecSetCredential *set_cred; - +bool exec_context_has_encrypted_credentials(const ExecContext *c) { assert(c); + const ExecLoadCredential *load_cred; HASHMAP_FOREACH(load_cred, c->load_credentials) if (load_cred->encrypted) return true; + const ExecSetCredential *set_cred; HASHMAP_FOREACH(set_cred, c->set_credentials) if (set_cred->encrypted) return true; @@ -106,7 +112,7 @@ int exec_context_get_credential_directory( assert(unit); assert(ret); - if (!exec_context_has_credentials(context)) { + if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context)) { *ret = NULL; return 0; } @@ -172,6 +178,10 @@ static int write_credential( 
_cleanup_close_ int fd = -EBADF; int r; + assert(dfd >= 0); + assert(id); + assert(data || size == 0); + r = tempfn_random_child("", "cred", &tmp); if (r < 0) return r; @@ -224,7 +234,6 @@ typedef enum CredentialSearchPath { } CredentialSearchPath; static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) { - _cleanup_strv_free_ char **l = NULL; assert(params); @@ -243,9 +252,8 @@ static char **credential_search_path(const ExecParameters *params, CredentialSea } if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) { - if (params->received_credentials_directory) - if (strv_extend(&l, params->received_credentials_directory) < 0) - return NULL; + if (strv_extend(&l, params->received_credentials_directory) < 0) + return NULL; if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0) return NULL; @@ -271,20 +279,29 @@ static int maybe_decrypt_and_write_credential( size_t size, uint64_t *left) { - _cleanup_free_ void *plaintext = NULL; + _cleanup_(iovec_done_erase) struct iovec plaintext = {}; size_t add; int r; - if (encrypted) { - size_t plaintext_size = 0; + assert(dir_fd >= 0); + assert(id); + assert(left); - r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, - &plaintext, &plaintext_size); + if (encrypted) { + r = decrypt_credential_and_warn( + id, + now(CLOCK_REALTIME), + /* tpm2_device= */ NULL, + /* tpm2_signature_path= */ NULL, + getuid(), + &IOVEC_MAKE(data, size), + CREDENTIAL_ANY_SCOPE, + &plaintext); if (r < 0) return r; - data = plaintext; - size = plaintext_size; + data = plaintext.iov_base; + size = plaintext.iov_len; } add = strlen(id) + size; @@ -302,7 +319,7 @@ static int maybe_decrypt_and_write_credential( static int load_credential_glob( const char *path, bool encrypted, - char **search_path, + char * const *search_path, ReadFullFileFlags flags, int write_dfd, uid_t uid, @@ -312,6 +329,11 @@ static int load_credential_glob( int 
r; + assert(path); + assert(search_path); + assert(write_dfd >= 0); + assert(left); + STRV_FOREACH(d, search_path) { _cleanup_globfree_ glob_t pglob = {}; _cleanup_free_ char *j = NULL; @@ -326,38 +348,36 @@ static int load_credential_glob( if (r < 0) return r; - for (size_t n = 0; n < pglob.gl_pathc; n++) { + FOREACH_ARRAY(p, pglob.gl_pathv, pglob.gl_pathc) { _cleanup_free_ char *fn = NULL; _cleanup_(erase_and_freep) char *data = NULL; size_t size; /* path is absolute, hence pass AT_FDCWD as nop dir fd here */ r = read_full_file_full( - AT_FDCWD, - pglob.gl_pathv[n], - UINT64_MAX, - encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX, - flags, - NULL, - &data, &size); + AT_FDCWD, + *p, + UINT64_MAX, + encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX, + flags, + NULL, + &data, &size); if (r < 0) - return log_debug_errno(r, "Failed to read credential '%s': %m", - pglob.gl_pathv[n]); + return log_debug_errno(r, "Failed to read credential '%s': %m", *p); - r = path_extract_filename(pglob.gl_pathv[n], &fn); + r = path_extract_filename(*p, &fn); if (r < 0) - return log_debug_errno(r, "Failed to extract filename from '%s': %m", - pglob.gl_pathv[n]); + return log_debug_errno(r, "Failed to extract filename from '%s': %m", *p); r = maybe_decrypt_and_write_credential( - write_dfd, - fn, - encrypted, - uid, - gid, - ownership_ok, - data, size, - left); + write_dfd, + fn, + encrypted, + uid, + gid, + ownership_ok, + data, size, + left); if (r == -EEXIST) continue; if (r < 0) @@ -423,7 +443,7 @@ static int load_credential( /* Pass some minimal info about the unit and the credential name we are looking to acquire * via the source socket address in case we read off an AF_UNIX socket. */ - if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0) + if (asprintf(&bindname, "@%" PRIx64 "/unit/%s/%s", random_u64(), unit, id) < 0) return -ENOMEM; missing_ok = false; @@ -447,7 +467,7 @@ static int load_credential( maxsz = encrypted ? 
CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX; - if (search_path) { + if (search_path) STRV_FOREACH(d, search_path) { _cleanup_free_ char *j = NULL; @@ -465,7 +485,7 @@ static int load_credential( if (r != -ENOENT) break; } - } else if (source) + else if (source) r = read_full_file_full( read_dfd, source, UINT64_MAX, @@ -484,7 +504,8 @@ static int load_credential( * * Also, if the source file doesn't exist, but a fallback is set via SetCredentials= * we are fine, too. */ - log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path); + log_full_errno(hashmap_contains(context->set_credentials, id) ? LOG_DEBUG : LOG_INFO, + r, "Couldn't read inherited credential '%s', skipping: %m", path); return 0; } if (r < 0) @@ -518,6 +539,9 @@ static int load_cred_recurse_dir_cb( _cleanup_free_ char *sub_id = NULL; int r; + assert(path); + assert(de); + if (event != RECURSE_DIR_ENTRY) return RECURSE_DIR_CONTINUE; @@ -574,6 +598,8 @@ static int acquire_credentials( int r; assert(context); + assert(params); + assert(unit); assert(p); dfd = open(p, O_DIRECTORY|O_CLOEXEC); @@ -618,8 +644,7 @@ static int acquire_credentials( &left); else /* Directory */ - r = recurse_dir( - sub_fd, + r = recurse_dir(sub_fd, /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */ /* statx_mask= */ 0, /* n_depth_max= */ UINT_MAX, @@ -684,7 +709,7 @@ static int acquire_credentials( /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not * add them, so that they can act as a "default" if the same credential is specified multiple times. 
*/ HASHMAP_FOREACH(sc, context->set_credentials) { - _cleanup_(erase_and_freep) void *plaintext = NULL; + _cleanup_(iovec_done_erase) struct iovec plaintext = {}; const char *data; size_t size, add; @@ -698,11 +723,20 @@ static int acquire_credentials( return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id); if (sc->encrypted) { - r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size); + r = decrypt_credential_and_warn( + sc->id, + now(CLOCK_REALTIME), + /* tpm2_device= */ NULL, + /* tpm2_signature_path= */ NULL, + getuid(), + &IOVEC_MAKE(sc->data, sc->size), + CREDENTIAL_ANY_SCOPE, + &plaintext); if (r < 0) return r; - data = plaintext; + data = plaintext.iov_base; + size = plaintext.iov_len; } else { data = sc->data; size = sc->size; @@ -754,17 +788,42 @@ static int setup_credentials_internal( uid_t uid, gid_t gid) { + bool final_mounted; int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true * if we mounted something; false if we definitely can't mount anything */ - bool final_mounted; - const char *where; assert(context); + assert(params); + assert(unit); assert(final); assert(workspace); + r = path_is_mount_point(final); + if (r < 0) + return r; + final_mounted = r > 0; + + if (final_mounted) { + if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) { + r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW); + if (r < 0) + return r; + + final_mounted = false; + } else { + /* We can reuse the previous credential dir */ + r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false); + if (r < 0) + return r; + if (r == 0) { + log_debug("Credential dir for unit '%s' already set up, skipping.", unit); + return 0; + } + } + } + if (reuse_workspace) { - r = path_is_mount_point(workspace, NULL, 0); + r = path_is_mount_point(workspace); if (r < 0) return r; if (r > 0) @@ -775,40 +834,19 @@ static int 
setup_credentials_internal( } else workspace_mounted = -1; /* ditto */ - r = path_is_mount_point(final, NULL, 0); - if (r < 0) - return r; - if (r > 0) { - /* If the final place already has something mounted, we use that. If the workspace also has - * something mounted we assume it's actually the same mount (but with MS_RDONLY - * different). */ - final_mounted = true; - - if (workspace_mounted < 0) { - /* If the final place is mounted, but the workspace isn't, then let's bind mount - * the final version to the workspace, and make it writable, so that we can make - * changes */ - - r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; - - r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); - if (r < 0) - return r; - - workspace_mounted = true; - } - } else - final_mounted = false; + /* If both the final place and the workspace are mounted, we have no mounts to set up, based on + * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different). + * If the workspace is not mounted, we just bind the final place over and make it writable. */ + must_mount = must_mount || final_mounted; if (workspace_mounted < 0) { - /* Nothing is mounted on the workspace yet, let's try to mount something now */ - - r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); - if (r < 0) { - /* If that didn't work, try to make a bind mount from the final to the workspace, so - * that we can make it writable there. */ + if (!final_mounted) + /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if + * not using the final place. 
*/ + r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false); + if (final_mounted || r < 0) { + /* If using final place or failed to mount new tmpfs, make a bind mount from + * the final to the workspace, so that we can make it writable there. */ r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL); if (r < 0) { if (!ERRNO_IS_PRIVILEGE(r)) @@ -821,12 +859,19 @@ static int setup_credentials_internal( return r; /* If we lack privileges to bind mount stuff, then let's gracefully proceed - * for compat with container envs, and just use the final dir as is. */ + * for compat with container envs, and just use the final dir as is. + * Final place must not be mounted in this case (refused by must_mount + * above) */ workspace_mounted = false; } else { /* Make the new bind mount writable (i.e. drop MS_RDONLY) */ - r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL); + r = mount_nofollow_verbose(LOG_DEBUG, + NULL, + workspace, + NULL, + MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), + NULL); if (r < 0) return r; @@ -836,34 +881,26 @@ static int setup_credentials_internal( workspace_mounted = true; } - assert(!must_mount || workspace_mounted > 0); - where = workspace_mounted ? workspace : final; + assert(workspace_mounted >= 0); + assert(!must_mount || workspace_mounted); + + const char *where = workspace_mounted ? workspace : final; (void) label_fix_full(AT_FDCWD, where, final, 0); r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted); - if (r < 0) - return r; - - if (workspace_mounted) { - bool install; - - /* Determine if we should actually install the prepared mount in the final location by bind - * mounting it there. We do so only if the mount is not established there already, and if the - * mount is actually non-empty (i.e. carries at least one credential). 
Not that in the best - * case we are doing all this in a mount namespace, thus no one else will see that we - * allocated a file system we are getting rid of again here. */ + if (r < 0) { + /* If we're using final place as workspace, and failed to acquire credentials, we might + * have left half-written creds there. Let's get rid of the whole mount, so future + * calls won't reuse it. */ if (final_mounted) - install = false; /* already installed */ - else { - r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false); - if (r < 0) - return r; + (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW); - install = r == 0; /* install only if non-empty */ - } + return r; + } - if (install) { + if (workspace_mounted) { + if (!final_mounted) { /* Make workspace read-only now, so that any bind mount we make from it defaults to * read-only too */ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL); @@ -873,7 +910,7 @@ static int setup_credentials_internal( /* And mount it to the final place, read-only */ r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL); } else - /* Otherwise get rid of it */ + /* Otherwise we just get rid of the bind mount of final place */ r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW); if (r < 0) return r; @@ -905,15 +942,16 @@ int exec_setup_credentials( assert(context); assert(params); + assert(unit); - if (!exec_context_has_credentials(context)) + if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context)) return 0; if (!params->prefix[EXEC_DIRECTORY_RUNTIME]) return -EINVAL; - /* This where we'll place stuff when we are done; this main credentials directory is world-readable, - * and the subdir we mount over with a read-only file system readable by the service's user */ + /* This is where we'll place stuff when we are done; the main credentials directory is world-readable, + * and the 
subdir we mount over with a read-only file system readable by the service's user. */ q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials"); if (!q) return -ENOMEM; diff --git a/src/core/exec-credential.h b/src/core/exec-credential.h index 6f836fb..70bb46b 100644 --- a/src/core/exec-credential.h +++ b/src/core/exec-credential.h @@ -34,8 +34,10 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free); extern const struct hash_ops exec_set_credential_hash_ops; extern const struct hash_ops exec_load_credential_hash_ops; -bool exec_context_has_encrypted_credentials(ExecContext *c); +bool exec_params_need_credentials(const ExecParameters *p); + bool exec_context_has_credentials(const ExecContext *c); +bool exec_context_has_encrypted_credentials(const ExecContext *c); int exec_context_get_credential_directory( const ExecContext *context, diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 8e6de15..ee8db04 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -22,7 +22,7 @@ #include "argv-util.h" #include "barrier.h" #include "bpf-dlopen.h" -#include "bpf-lsm.h" +#include "bpf-restrict-fs.h" #include "btrfs-util.h" #include "capability-util.h" #include "cgroup-setup.h" @@ -41,6 +41,7 @@ #include "hexdecoct.h" #include "io-util.h" #include "iovec-util.h" +#include "journal-send.h" #include "missing_ioprio.h" #include "missing_prctl.h" #include "missing_securebits.h" @@ -59,52 +60,13 @@ #include "strv.h" #include "terminal-util.h" #include "utmp-wtmp.h" +#include "vpick.h" #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) #define SNDBUF_SIZE (8*1024*1024) -static int shift_fds(int fds[], size_t n_fds) { - if (n_fds <= 0) - return 0; - - /* Modifies the fds array! (sorts it) */ - - assert(fds); - - for (int start = 0;;) { - int restart_from = -1; - - for (int i = start; i < (int) n_fds; i++) { - int nfd; - - /* Already at right index? 
*/ - if (fds[i] == i+3) - continue; - - nfd = fcntl(fds[i], F_DUPFD, i + 3); - if (nfd < 0) - return -errno; - - safe_close(fds[i]); - fds[i] = nfd; - - /* Hmm, the fd we wanted isn't free? Then - * let's remember that and try again from here */ - if (nfd != i+3 && restart_from < 0) - restart_from = i; - } - - if (restart_from < 0) - break; - - start = restart_from; - } - - return 0; -} - static int flag_fds( const int fds[], size_t n_socket_fds, @@ -198,9 +160,11 @@ static int connect_journal_socket( const char *j; int r; - j = log_namespace ? - strjoina("/run/systemd/journal.", log_namespace, "/stdout") : - "/run/systemd/journal/stdout"; + assert(fd >= 0); + + j = journal_stream_path(log_namespace); + if (!j) + return -EINVAL; if (gid_is_valid(gid)) { oldgid = getgid(); @@ -449,7 +413,7 @@ static int setup_input( case EXEC_INPUT_DATA: { int fd; - fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0); + fd = acquire_data_fd_full(context->stdin_data, context->stdin_data_size, /* flags = */ 0); if (fd < 0) return fd; @@ -670,12 +634,8 @@ static int chown_terminal(int fd, uid_t uid) { assert(fd >= 0); /* Before we chown/chmod the TTY, let's ensure this is actually a tty */ - if (isatty(fd) < 1) { - if (IN_SET(errno, EINVAL, ENOTTY)) - return 0; /* not a tty */ - - return -errno; - } + if (!isatty_safe(fd)) + return 0; /* This might fail. What matters are the results. 
*/ r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID); @@ -1126,7 +1086,8 @@ static int setup_pam( gid_t gid, const char *tty, char ***env, /* updated on success */ - const int fds[], size_t n_fds) { + const int fds[], size_t n_fds, + int exec_fd) { #if HAVE_PAM @@ -1141,7 +1102,7 @@ static int setup_pam( sigset_t old_ss; int pam_code = PAM_SUCCESS, r; bool close_session = false; - pid_t pam_pid = 0, parent_pid; + pid_t parent_pid; int flags = 0; assert(name); @@ -1196,7 +1157,7 @@ static int setup_pam( pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags); if (pam_code != PAM_SUCCESS) - log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code)); + log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code)); pam_code = pam_open_session(handle, flags); if (pam_code != PAM_SUCCESS) @@ -1212,15 +1173,15 @@ static int setup_pam( /* Block SIGTERM, so that we know that it won't get lost in the child */ - assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0); + assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0); parent_pid = getpid_cached(); - r = safe_fork("(sd-pam)", 0, &pam_pid); + r = safe_fork("(sd-pam)", 0, NULL); if (r < 0) goto fail; if (r == 0) { - int sig, ret = EXIT_PAM; + int ret = EXIT_PAM; /* The child's job is to reset the PAM session on termination */ barrier_set_role(&barrier, BARRIER_CHILD); @@ -1229,17 +1190,18 @@ static int setup_pam( * those fds are open here that have been opened by PAM. */ (void) close_many(fds, n_fds); + /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced + * by the execve() to wait for completion, and if we'd keep the fd open here in the child + * we'd never signal completion. */ + exec_fd = safe_close(exec_fd); + /* Drop privileges - we don't need any to pam_close_session and this will make * PR_SET_PDEATHSIG work in most cases. 
If this fails, ignore the error - but expect sd-pam * threads to fail to exit normally */ - r = maybe_setgroups(0, NULL); + r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0); if (r < 0) - log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); - if (setresgid(gid, gid, gid) < 0) - log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); - if (setresuid(uid, uid, uid) < 0) - log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); + log_warning_errno(r, "Failed to drop privileges in sd-pam: %m"); (void) ignore_signals(SIGPIPE); @@ -1258,21 +1220,13 @@ static int setup_pam( /* Check if our parent process might already have died? */ if (getppid() == parent_pid) { sigset_t ss; + int sig; assert_se(sigemptyset(&ss) >= 0); assert_se(sigaddset(&ss, SIGTERM) >= 0); - for (;;) { - if (sigwait(&ss, &sig) < 0) { - if (errno == EINTR) - continue; - - goto child_finish; - } - - assert(sig == SIGTERM); - break; - } + assert_se(sigwait(&ss, &sig) == 0); + assert(sig == SIGTERM); } /* If our parent died we'll end the session */ @@ -1361,7 +1315,7 @@ static void rename_process_from_path(const char *path) { process_name[1+l] = ')'; process_name[1+l+1] = 0; - rename_process(process_name); + (void) rename_process(process_name); } static bool context_has_address_families(const ExecContext *c) { @@ -1725,7 +1679,7 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters if (!exec_context_restrict_filesystems_set(c)) return 0; - if (p->bpf_outer_map_fd < 0) { + if (p->bpf_restrict_fs_map_fd < 0) { /* LSM BPF is unsupported or lsm_bpf_setup failed */ log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems="); return 0; @@ -1736,7 +1690,7 @@ static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters if (r < 0) return r; - return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list); + 
return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list); } #endif @@ -1817,10 +1771,10 @@ static const char *exec_directory_env_name_to_string(ExecDirectoryType t); /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to * the service payload in. */ static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", - [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", - [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", - [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", + [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", + [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", + [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", + [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY", }; @@ -1907,7 +1861,7 @@ static int build_environment( "Failed to determine user credentials for root: %m"); } - bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user); + bool set_user_login_env = exec_context_get_set_login_environment(c); if (username) { x = strjoin("USER=", username); @@ -1961,7 +1915,7 @@ static int build_environment( * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the * container manager passes to PID 1 ends up all the way in the console login shown. 
*/ - if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1) + if (path_equal(tty_path, "/dev/console") && getppid() == 1) term = getenv("TERM"); else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) { _cleanup_free_ char *key = NULL; @@ -2315,10 +2269,10 @@ static int setup_exec_directory( int *exit_status) { static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, - [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, - [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, - [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, + [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY, + [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY, + [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY, + [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY, [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY, }; int r; @@ -2338,10 +2292,10 @@ static int setup_exec_directory( gid = 0; } - for (size_t i = 0; i < context->directories[type].n_items; i++) { + FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) { _cleanup_free_ char *p = NULL, *pp = NULL; - p = path_join(params->prefix[type], context->directories[type].items[i].path); + p = path_join(params->prefix[type], i->path); if (!p) { r = -ENOMEM; goto fail; @@ -2357,7 +2311,7 @@ static int setup_exec_directory( * doesn't exist, then we likely are upgrading from an older systemd version that * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to - * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now + * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now * separated. If a service has both dirs configured but only the configuration dir * exists and the state dir does not, we assume we are looking at an update * situation. 
Hence, create a compatibility symlink, so that all expectations are @@ -2378,9 +2332,9 @@ static int setup_exec_directory( * under the configuration hierarchy. */ if (type == EXEC_DIRECTORY_STATE) - q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path); + q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path); else if (type == EXEC_DIRECTORY_LOGS) - q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path); + q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path); else assert_not_reached(); if (!q) { @@ -2443,7 +2397,7 @@ static int setup_exec_directory( if (r < 0) goto fail; - if (!path_extend(&pp, context->directories[type].items[i].path)) { + if (!path_extend(&pp, i->path)) { r = -ENOMEM; goto fail; } @@ -2477,7 +2431,7 @@ static int setup_exec_directory( goto fail; } - if (!context->directories[type].items[i].only_create) { + if (!i->only_create) { /* And link it up from the original place. * Notes * 1) If a mount namespace is going to be used, then this symlink remains on @@ -2514,7 +2468,7 @@ static int setup_exec_directory( if (r < 0) goto fail; - q = path_join(params->prefix[type], "private", context->directories[type].items[i].path); + q = path_join(params->prefix[type], "private", i->path); if (!q) { r = -ENOMEM; goto fail; @@ -2568,7 +2522,7 @@ static int setup_exec_directory( params, "%s \'%s\' already exists but the mode is different. " "(File system: %o %sMode: %o)", - exec_directory_type_to_string(type), context->directories[type].items[i].path, + exec_directory_type_to_string(type), i->path, st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777); continue; @@ -2599,10 +2553,8 @@ static int setup_exec_directory( /* If we are not going to run in a namespace, set up the symlinks - otherwise * they are set up later, to allow configuring empty var/run/etc. 
*/ if (!needs_mount_namespace) - for (size_t i = 0; i < context->directories[type].n_items; i++) { - r = create_many_symlinks(params->prefix[type], - context->directories[type].items[i].path, - context->directories[type].items[i].symlinks); + FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) { + r = create_many_symlinks(params->prefix[type], i->path, i->symlinks); if (r < 0) goto fail; } @@ -2669,8 +2621,8 @@ static int compile_bind_mounts( if (!params->prefix[t]) continue; - for (size_t i = 0; i < context->directories[t].n_items; i++) - n += !context->directories[t].items[i].only_create; + FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) + n += !i->only_create; } if (n <= 0) { @@ -2684,8 +2636,7 @@ static int compile_bind_mounts( if (!bind_mounts) return -ENOMEM; - for (size_t i = 0; i < context->n_bind_mounts; i++) { - BindMount *item = context->bind_mounts + i; + FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) { _cleanup_free_ char *s = NULL, *d = NULL; s = strdup(item->source); @@ -2729,18 +2680,18 @@ static int compile_bind_mounts( return r; } - for (size_t i = 0; i < context->directories[t].n_items; i++) { + FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) { _cleanup_free_ char *s = NULL, *d = NULL; /* When one of the parent directories is in the list, we cannot create the symlink * for the child directory. See also the comments in setup_exec_directory(). 
*/ - if (context->directories[t].items[i].only_create) + if (i->only_create) continue; if (exec_directory_is_private(context, t)) - s = path_join(params->prefix[t], "private", context->directories[t].items[i].path); + s = path_join(params->prefix[t], "private", i->path); else - s = path_join(params->prefix[t], context->directories[t].items[i].path); + s = path_join(params->prefix[t], i->path); if (!s) return -ENOMEM; @@ -2749,7 +2700,7 @@ static int compile_bind_mounts( /* When RootDirectory= or RootImage= are set, then the symbolic link to the private * directory is not created on the root directory. So, let's bind-mount the directory * on the 'non-private' place. */ - d = path_join(params->prefix[t], context->directories[t].items[i].path); + d = path_join(params->prefix[t], i->path); else d = strdup(s); if (!d) @@ -2758,10 +2709,8 @@ static int compile_bind_mounts( bind_mounts[h++] = (BindMount) { .source = TAKE_PTR(s), .destination = TAKE_PTR(d), - .read_only = false, .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */ .recursive = true, - .ignore_enoent = false, }; } } @@ -2791,14 +2740,14 @@ static int compile_symlinks( assert(params); assert(ret_symlinks); - for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) { - for (size_t i = 0; i < context->directories[dt].n_items; i++) { + for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) + FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) { _cleanup_free_ char *private_path = NULL, *path = NULL; - STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) { + STRV_FOREACH(symlink, i->symlinks) { _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL; - src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path); + src_abs = path_join(params->prefix[dt], i->path); dst_abs = path_join(params->prefix[dt], *symlink); if (!src_abs || !dst_abs) return -ENOMEM; @@ -2810,14 +2759,14 @@ static int 
compile_symlinks( if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context) || - context->directories[dt].items[i].only_create) + i->only_create) continue; - private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path); + private_path = path_join(params->prefix[dt], "private", i->path); if (!private_path) return -ENOMEM; - path = path_join(params->prefix[dt], context->directories[dt].items[i].path); + path = path_join(params->prefix[dt], i->path); if (!path) return -ENOMEM; @@ -2825,18 +2774,16 @@ static int compile_symlinks( if (r < 0) return r; } - } /* We make the host's os-release available via a symlink, so that we can copy it atomically * and readers will never get a half-written version. Note that, while the paths specified here are * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.: * 'os-release -> .os-release-stage/os-release' is what will be created. */ if (setup_os_release_symlink) { - r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release"); - if (r < 0) - return r; - - r = strv_extend(&symlinks, "/run/host/os-release"); + r = strv_extend_many( + &symlinks, + "/run/host/.os-release-stage/os-release", + "/run/host/os-release"); if (r < 0) return r; } @@ -2877,8 +2824,8 @@ static bool insist_on_sandboxing( /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes * essential. */ - for (size_t i = 0; i < n_bind_mounts; i++) - if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination)) + FOREACH_ARRAY(i, bind_mounts, n_bind_mounts) + if (!path_equal(i->source, i->destination)) return true; if (context->log_namespace) @@ -2887,13 +2834,33 @@ static bool insist_on_sandboxing( return false; } -static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { +static int setup_ephemeral( + const ExecContext *context, + ExecRuntime *runtime, + char **root_image, /* both input and output! 
modified if ephemeral logic enabled */ + char **root_directory) { /* ditto */ + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *new_root = NULL; int r; + assert(context); + assert(root_image); + assert(root_directory); + + if (!*root_image && !*root_directory) + return 0; + if (!runtime || !runtime->ephemeral_copy) return 0; + assert(runtime->ephemeral_storage_socket[0] >= 0); + assert(runtime->ephemeral_storage_socket[1] >= 0); + + new_root = strdup(runtime->ephemeral_copy); + if (!new_root) + return log_oom_debug(); + r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX); if (r < 0) return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m"); @@ -2904,28 +2871,23 @@ static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { if (fd >= 0) /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */ return 0; - if (fd != -EAGAIN) return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m"); - log_debug("Making ephemeral snapshot of %s to %s", - context->root_image ?: context->root_directory, runtime->ephemeral_copy); + if (*root_image) { + log_debug("Making ephemeral copy of %s to %s", *root_image, new_root); - if (context->root_image) - fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600, - COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME); - else - fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory, - AT_FDCWD, runtime->ephemeral_copy, - BTRFS_SNAPSHOT_FALLBACK_COPY | - BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | - BTRFS_SNAPSHOT_RECURSIVE | - BTRFS_SNAPSHOT_LOCK_BSD); - if (fd < 0) - return log_debug_errno(fd, "Failed to snapshot %s to %s: %m", - context->root_image ?: context->root_directory, runtime->ephemeral_copy); + fd = copy_file(*root_image, + new_root, + O_EXCL, + 0600, + COPY_LOCK_BSD| + COPY_REFLINK| + COPY_CRTIME); + if (fd < 0) + return log_debug_errno(fd, "Failed to copy image %s to %s: %m", + *root_image, new_root); 
- if (context->root_image) { /* A root image might be subject to lots of random writes so let's try to disable COW on it * which tends to not perform well in combination with lots of random writes. * @@ -2934,13 +2896,35 @@ static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { */ r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL); if (r < 0) - log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy); + log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", new_root); + } else { + assert(*root_directory); + + log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root); + + fd = btrfs_subvol_snapshot_at( + AT_FDCWD, *root_directory, + AT_FDCWD, new_root, + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_LOCK_BSD); + if (fd < 0) + return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m", + *root_directory, new_root); } r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT); if (r < 0) return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m"); + if (*root_image) + free_and_replace(*root_image, new_root); + else { + assert(*root_directory); + free_and_replace(*root_directory, new_root); + } + return 1; } @@ -3000,22 +2984,80 @@ static int verity_settings_prepare( return 0; } +static int pick_versions( + const ExecContext *context, + const ExecParameters *params, + char **ret_root_image, + char **ret_root_directory) { + + int r; + + assert(context); + assert(params); + assert(ret_root_image); + assert(ret_root_directory); + + if (context->root_image) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + + r = path_pick(/* toplevel_path= */ NULL, + /* toplevel_fd= */ AT_FDCWD, + context->root_image, + &pick_filter_image_raw, + PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE, + &result); + if (r < 0) + return r; + + if 
(!result.path) + return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image); + + *ret_root_image = TAKE_PTR(result.path); + *ret_root_directory = NULL; + return r; + } + + if (context->root_directory) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + + r = path_pick(/* toplevel_path= */ NULL, + /* toplevel_fd= */ AT_FDCWD, + context->root_directory, + &pick_filter_image_dir, + PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE, + &result); + if (r < 0) + return r; + + if (!result.path) + return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory); + + *ret_root_image = NULL; + *ret_root_directory = TAKE_PTR(result.path); + return r; + } + + *ret_root_image = *ret_root_directory = NULL; + return 0; +} + static int apply_mount_namespace( ExecCommandFlags command_flags, const ExecContext *context, const ExecParameters *params, ExecRuntime *runtime, const char *memory_pressure_path, + bool needs_sandboxing, char **error_path) { _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, **read_write_paths_cleanup = NULL; _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL, - *extension_dir = NULL, *host_os_release_stage = NULL; - const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; + *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL; + const char *tmp_dir = NULL, *var_tmp_dir = NULL; char **read_write_paths; - bool needs_sandboxing, setup_os_release_symlink; + bool setup_os_release_symlink; BindMount *bind_mounts = NULL; size_t n_bind_mounts = 0; int r; @@ -3025,14 +3067,21 @@ static int apply_mount_namespace( CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many); if (params->flags & 
EXEC_APPLY_CHROOT) { - r = setup_ephemeral(context, runtime); + r = pick_versions( + context, + params, + &root_image, + &root_dir); if (r < 0) return r; - if (context->root_image) - root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image; - else - root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory; + r = setup_ephemeral( + context, + runtime, + &root_image, + &root_dir); + if (r < 0) + return r; } r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); @@ -3054,7 +3103,6 @@ static int apply_mount_namespace( } else read_write_paths = context->read_write_paths; - needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED); if (needs_sandboxing) { /* The runtime struct only contains the parent of the private /tmp, which is non-accessible * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to @@ -3084,11 +3132,9 @@ static int apply_mount_namespace( params, "shared mount propagation hidden by other fs namespacing unit settings: ignoring"); - if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { - r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path); - if (r < 0) - return r; - } + r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path); + if (r < 0) + return r; if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) { propagate_dir = path_join("/run/systemd/propagate/", params->unit_id); @@ -3246,31 +3292,39 @@ static int apply_working_directory( const char *home, int *exit_status) { - const char *d, *wd; + const char *wd; + int r; assert(context); assert(exit_status); if (context->working_directory_home) { - if (!home) { *exit_status = EXIT_CHDIR; return -ENXIO; } wd = home; - } else wd = empty_to_root(context->working_directory); if (params->flags & EXEC_APPLY_CHROOT) - d = wd; - else - d = prefix_roota((runtime ? 
runtime->ephemeral_copy : NULL) ?: context->root_directory, wd); + r = RET_NERRNO(chdir(wd)); + else { + _cleanup_close_ int dfd = -EBADF; + + r = chase(wd, + (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, + CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT, + /* ret_path= */ NULL, + &dfd); + if (r >= 0) + r = RET_NERRNO(fchdir(dfd)); + } - if (chdir(d) < 0 && !context->working_directory_missing_ok) { + if (r < 0 && !context->working_directory_missing_ok) { *exit_status = EXIT_CHDIR; - return -errno; + return r; } return 0; @@ -3459,7 +3513,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 15]; + int dont_close[n_fds + 16]; assert(params); @@ -3495,6 +3549,9 @@ static int close_remaining_fds( if (params->user_lookup_fd >= 0) dont_close[n_dont_close++] = params->user_lookup_fd; + if (params->handoff_timestamp_fd >= 0) + dont_close[n_dont_close++] = params->handoff_timestamp_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); return close_all_fds(dont_close, n_dont_close); @@ -3528,26 +3585,29 @@ static int send_user_lookup( return 0; } -static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) { +static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) { int r; assert(c); assert(home); - assert(buf); + assert(ret_buf); /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */ - if (*home) + if (*home) /* Already acquired from get_fixed_user()? 
*/ return 0; if (!c->working_directory_home) return 0; - r = get_home_dir(buf); + if (c->dynamic_user) + return -EADDRNOTAVAIL; + + r = get_home_dir(ret_buf); if (r < 0) return r; - *home = *buf; + *home = *ret_buf; return 1; } @@ -3641,11 +3701,12 @@ static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) { } static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) { + static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET }; + union sockaddr_union addr = { .un.sun_family = AF_UNIX, }; socklen_t sa_len; - static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET }; int r; assert(c); @@ -3655,43 +3716,35 @@ static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, co r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd)); if (r < 0) - return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path); - + return log_exec_error_errno(c, p, r, "Failed to set sockaddr for '%s': %m", of->path); sa_len = r; - for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) { + FOREACH_ELEMENT(i, socket_types) { _cleanup_close_ int fd = -EBADF; - fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0); + fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0); if (fd < 0) - return log_exec_error_errno(c, - p, - errno, - "Failed to create socket for %s: %m", + return log_exec_error_errno(c, p, + errno, "Failed to create socket for '%s': %m", of->path); r = RET_NERRNO(connect(fd, &addr.sa, sa_len)); - if (r == -EPROTOTYPE) - continue; - if (r < 0) - return log_exec_error_errno(c, - p, - r, - "Failed to connect socket for %s: %m", + if (r >= 0) + return TAKE_FD(fd); + if (r != -EPROTOTYPE) + return log_exec_error_errno(c, p, + r, "Failed to connect to socket for '%s': %m", of->path); - - return TAKE_FD(fd); } - return log_exec_error_errno(c, - p, - SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", + return log_exec_error_errno(c, 
p, + SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.", of->path); } static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) { - struct stat st; _cleanup_close_ int fd = -EBADF, ofd = -EBADF; + struct stat st; assert(c); assert(p); @@ -3699,10 +3752,10 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const ofd = open(of->path, O_PATH | O_CLOEXEC); if (ofd < 0) - return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path); + return log_exec_error_errno(c, p, errno, "Failed to open '%s' as O_PATH: %m", of->path); if (fstat(ofd, &st) < 0) - return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path); + return log_exec_error_errno(c, p, errno, "Failed to stat '%s': %m", of->path); if (S_ISSOCK(st.st_mode)) { fd = connect_unix_harder(c, p, of, ofd); @@ -3710,10 +3763,11 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const return fd; if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0) - return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m", + return log_exec_error_errno(c, p, + errno, "Failed to shutdown send for socket '%s': %m", of->path); - log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd); + log_exec_debug(c, p, "Opened socket '%s' as fd %d.", of->path, fd); } else { int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? 
O_RDONLY : O_RDWR; if (FLAGS_SET(of->flags, OPENFILE_APPEND)) @@ -3723,9 +3777,9 @@ static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const fd = fd_reopen(ofd, flags | O_CLOEXEC); if (fd < 0) - return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path); + return log_exec_error_errno(c, p, fd, "Failed to reopen file '%s': %m", of->path); - log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd); + log_exec_debug(c, p, "Opened file '%s' as fd %d.", of->path, fd); } return TAKE_FD(fd); @@ -3744,7 +3798,9 @@ static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t fd = get_open_file_fd(c, p, of); if (fd < 0) { if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) { - log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path); + log_exec_warning_errno(c, p, fd, + "Failed to get OpenFile= file descriptor for '%s', ignoring: %m", + of->path); continue; } @@ -3758,9 +3814,7 @@ static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t if (r < 0) return r; - p->fds[*n_fds] = TAKE_FD(fd); - - (*n_fds)++; + p->fds[(*n_fds)++] = TAKE_FD(fd); } return 0; @@ -3810,7 +3864,7 @@ static bool exec_context_need_unprivileged_private_users( context->private_ipc || context->ipc_namespace_path || context->private_mounts > 0 || - context->mount_apivfs || + context->mount_apivfs > 0 || context->n_bind_mounts > 0 || context->n_temporary_filesystems > 0 || context->root_directory || @@ -3920,6 +3974,52 @@ static void exec_params_close(ExecParameters *p) { p->stderr_fd = safe_close(p->stderr_fd); } +static int exec_fd_mark_hot( + const ExecContext *c, + ExecParameters *p, + bool hot, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->exec_fd < 0) + return 0; + + uint8_t x = hot; + + if (write(p->exec_fd, &x, sizeof(x)) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to mark 
exec_fd as %s: %m", hot ? "hot" : "cold"); + } + + return 1; +} + +static int send_handoff_timestamp( + const ExecContext *c, + ExecParameters *p, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->handoff_timestamp_fd < 0) + return 0; + + dual_timestamp dt; + dual_timestamp_now(&dt); + + if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m"); + } + + return 1; +} + int exec_invoke( const ExecCommand *command, const ExecContext *context, @@ -3974,6 +4074,8 @@ int exec_invoke( assert(params); assert(exit_status); + /* This should be mostly redundant, as the log level is also passed as an argument of the executor, + * and is already applied earlier. Just for safety. */ if (context->log_level_max >= 0) log_set_max_level(context->log_level_max); @@ -4049,7 +4151,7 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m"); } - int keep_fds[n_fds + 3]; + int keep_fds[n_fds + 4]; memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int)); n_keep_fds = n_fds; @@ -4059,8 +4161,14 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); } + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->handoff_timestamp_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } + #if HAVE_LIBBPF - r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_outer_map_fd); + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_restrict_fs_map_fd); if (r < 0) { *exit_status = EXIT_FDS; return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); @@ -4099,7 +4207,7 @@ int exec_invoke( *exit_status = EXIT_CONFIRM; return 
log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED), - "Execution cancelled by the user"); + "Execution cancelled by the user."); } } @@ -4141,12 +4249,12 @@ int exec_invoke( if (!uid_is_valid(uid)) { *exit_status = EXIT_USER; - return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid); + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid); } if (!gid_is_valid(gid)) { *exit_status = EXIT_USER; - return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid); + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid); } if (runtime->dynamic_creds->user) @@ -4186,7 +4294,7 @@ int exec_invoke( params->user_lookup_fd = safe_close(params->user_lookup_fd); - r = acquire_home(context, uid, &home, &home_buffer); + r = acquire_home(context, &home, &home_buffer); if (r < 0) { *exit_status = EXIT_CHDIR; return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m"); @@ -4210,9 +4318,10 @@ int exec_invoke( r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL); if (r == -EUCLEAN) { *exit_status = EXIT_CGROUP; - return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s " + return log_exec_error_errno(context, params, r, + "Failed to attach process to cgroup '%s', " "because the cgroup or one of its parents or " - "siblings is in the threaded mode: %m", p); + "siblings is in the threaded mode.", p); } if (r < 0) { *exit_status = EXIT_CGROUP; @@ -4242,13 +4351,20 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m"); } - r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + _cleanup_free_ char *fname = NULL; + r = 
path_extract_filename(command->path, &fname); + if (r < 0) { + *exit_status = EXIT_STDOUT; + return log_exec_error_errno(context, params, r, "Failed to extract filename from path %s: %m", command->path); + } + + r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino); if (r < 0) { *exit_status = EXIT_STDOUT; return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m"); } - r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino); if (r < 0) { *exit_status = EXIT_STDERR; return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m"); @@ -4445,12 +4561,10 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]); } - if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) { - r = exec_setup_credentials(context, params, params->unit_id, uid, gid); - if (r < 0) { - *exit_status = EXIT_CREDENTIALS; - return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m"); - } + r = exec_setup_credentials(context, params, params->unit_id, uid, gid); + if (r < 0) { + *exit_status = EXIT_CREDENTIALS; + return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m"); } r = build_environment( @@ -4567,7 +4681,7 @@ int exec_invoke( * wins here. (See above.) */ /* All fds passed in the fds array will be closed in the pam child process. 
*/ - r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds); + r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd); if (r < 0) { *exit_status = EXIT_PAM; return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m"); @@ -4639,7 +4753,7 @@ int exec_invoke( if (ns_type_supported(NAMESPACE_IPC)) { r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC); - if (r == -EPERM) + if (ERRNO_IS_NEG_PRIVILEGE(r)) log_exec_warning_errno(context, params, r, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m"); else if (r < 0) { @@ -4657,7 +4771,13 @@ int exec_invoke( if (needs_mount_namespace) { _cleanup_free_ char *error_path = NULL; - r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path); + r = apply_mount_namespace(command->flags, + context, + params, + runtime, + memory_pressure_path, + needs_sandboxing, + &error_path); if (r < 0) { *exit_status = EXIT_NAMESPACE; return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m", @@ -4672,7 +4792,7 @@ int exec_invoke( } if (context->memory_ksm >= 0) - if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) { + if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) { if (ERRNO_IS_NOT_SUPPORTED(errno)) log_exec_debug_errno(context, params, @@ -4731,26 +4851,16 @@ int exec_invoke( _cleanup_close_ int executable_fd = -EBADF; r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd); if (r < 0) { - if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { - log_exec_struct_errno(context, params, LOG_INFO, r, - "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, - LOG_EXEC_INVOCATION_ID(params), - LOG_EXEC_MESSAGE(params, - "Executable %s missing, skipping: %m", - command->path), - "EXECUTABLE=%s", 
command->path); - *exit_status = EXIT_SUCCESS; - return 0; - } - *exit_status = EXIT_EXEC; - return log_exec_struct_errno(context, params, LOG_INFO, r, - "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, - LOG_EXEC_INVOCATION_ID(params), - LOG_EXEC_MESSAGE(params, - "Failed to locate executable %s: %m", - command->path), - "EXECUTABLE=%s", command->path); + log_exec_struct_errno(context, params, LOG_NOTICE, r, + "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, + LOG_EXEC_MESSAGE(params, + "Unable to locate executable '%s': %m", + command->path), + "EXECUTABLE=%s", command->path); + /* If the error will be ignored by manager, tune down the log level here. Missing executable + * is very much expected in this case. */ + return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r; } r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd); @@ -4791,15 +4901,16 @@ int exec_invoke( /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any - * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final - * execve(). But first, close the remaining sockets in the context objects. */ + * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep + * them open until the final execve(). But first, close the remaining sockets in the context + * objects. 
*/ exec_runtime_close(runtime); exec_params_close(params); r = close_all_fds(keep_fds, n_keep_fds); if (r >= 0) - r = shift_fds(params->fds, n_fds); + r = pack_fds(params->fds, n_fds); if (r >= 0) r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking); if (r < 0) { @@ -4945,8 +5056,10 @@ int exec_invoke( } } - /* Apply working directory here, because the working directory might be on NFS and only the user running - * this service might have the correct privilege to change to the working directory */ + /* Apply working directory here, because the working directory might be on NFS and only the user + * running this service might have the correct privilege to change to the working directory. Also, it + * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that + * the cwd cannot be used to pin directories outside of the sandbox. */ r = apply_working_directory(context, params, runtime, home, exit_status); if (r < 0) return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m"); @@ -5206,31 +5319,29 @@ int exec_invoke( log_command_line(context, params, "Executing", executable, final_argv); - if (params->exec_fd >= 0) { - uint8_t hot = 1; + /* We have finished with all our initializations. Let's now let the manager know that. From this + * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ - /* We have finished with all our initializations. Let's now let the manager know that. From this point - * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. 
*/ + r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status); + if (r < 0) + return r; - if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m"); - } + /* As last thing before the execve(), let's send the handoff timestamp */ + r = send_handoff_timestamp(context, params, exit_status); + if (r < 0) { + /* If this handoff timestamp failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); + return r; } - r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); - - if (params->exec_fd >= 0) { - uint8_t hot = 0; + /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they + * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for + * exec_fd this is pretty much the whole raison d'etre. */ - /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager - * that POLLHUP on it no longer means execve() succeeded. 
*/ + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); - if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m"); - } - } + /* The execve() failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); *exit_status = EXIT_EXEC; return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable); diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index b1e716e..ecd1e70 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -230,6 +230,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { return r; } + r = serialize_bool(f, "exec-cgroup-context-memory-zswap-writeback", c->memory_zswap_writeback); + if (r < 0) + return r; + if (c->memory_limit != CGROUP_LIMIT_MAX) { r = serialize_item_format(f, "exec-cgroup-context-memory-limit", "%" PRIu64, c->memory_limit); if (r < 0) @@ -373,8 +377,7 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (il->limits[type] == cgroup_io_limit_defaults[type]) continue; - key = strjoin("exec-cgroup-context-io-device-limit-", - cgroup_io_limit_type_to_string(type)); + key = strjoin("exec-cgroup-context-io-device-limit-", cgroup_io_limit_type_to_string(type)); if (!key) return -ENOMEM; @@ -678,6 +681,11 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { r = safe_atou64(val, &c->startup_memory_zswap_max); if (r < 0) return r; + } else if ((val = startswith(l, "exec-cgroup-context-memory-zswap-writeback="))) { + r = parse_boolean(val); + if (r < 0) + return r; + c->memory_zswap_writeback = r; } else if ((val = startswith(l, "exec-cgroup-context-memory-limit="))) { r = safe_atou64(val, &c->memory_limit); if (r < 0) @@ -789,7 +797,7 @@ static int exec_cgroup_context_deserialize(CGroupContext 
*c, FILE *f) { _cleanup_free_ char *path = NULL, *rwm = NULL; CGroupDevicePermissions p; - r = extract_many_words(&val, " ", 0, &path, &rwm, NULL); + r = extract_many_words(&val, " ", 0, &path, &rwm); if (r < 0) return r; if (r == 0) @@ -806,7 +814,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *path = NULL, *weight = NULL; CGroupIODeviceWeight *a = NULL; - r = extract_many_words(&val, " ", 0, &path, &weight, NULL); + r = extract_many_words(&val, " ", 0, &path, &weight); if (r < 0) return r; if (r != 2) @@ -835,7 +843,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *path = NULL, *target = NULL; CGroupIODeviceLatency *a = NULL; - r = extract_many_words(&val, " ", 0, &path, &target, NULL); + r = extract_many_words(&val, " ", 0, &path, &target); if (r < 0) return r; if (r != 2) @@ -865,7 +873,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { CGroupIODeviceLimit *limit = NULL; CGroupIOLimitType t; - r = extract_many_words(&val, "= ", 0, &type, &path, &limits, NULL); + r = extract_many_words(&val, "= ", 0, &type, &path, &limits); if (r < 0) return r; if (r != 3) @@ -900,7 +908,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *path = NULL, *weight = NULL; CGroupBlockIODeviceWeight *a = NULL; - r = extract_many_words(&val, " ", 0, &path, &weight, NULL); + r = extract_many_words(&val, " ", 0, &path, &weight); if (r < 0) return r; if (r != 2) @@ -921,7 +929,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *path = NULL, *bw = NULL; CGroupBlockIODeviceBandwidth *a = NULL; - r = extract_many_words(&val, " ", 0, &path, &bw, NULL); + r = extract_many_words(&val, " ", 0, &path, &bw); if (r < 0) return r; if (r != 2) @@ -951,7 +959,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *path = NULL, *bw = NULL; 
CGroupBlockIODeviceBandwidth *a = NULL; - r = extract_many_words(&val, " ", 0, &path, &bw, NULL); + r = extract_many_words(&val, " ", 0, &path, &bw); if (r < 0) return r; if (r != 2) @@ -1019,7 +1027,7 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { _cleanup_free_ char *type = NULL, *path = NULL; uint32_t t; - r = extract_many_words(&val, " ", 0, &type, &path, NULL); + r = extract_many_words(&val, " ", 0, &type, &path); if (r < 0) return r; if (r != 2) @@ -1365,8 +1373,12 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd); + if (r < 0) + return r; + if (c && exec_context_restrict_filesystems_set(c)) { - r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_outer_map_fd); + r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd); if (r < 0) return r; } @@ -1479,8 +1491,8 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { return log_oom_debug(); /* Ensure we don't leave any FD uninitialized on error, it makes the fuzzer sad */ - for (size_t i = 0; i < p->n_socket_fds + p->n_storage_fds; ++i) - p->fds[i] = -EBADF; + FOREACH_ARRAY(i, p->fds, p->n_socket_fds + p->n_storage_fds) + *i = -EBADF; r = deserialize_fd_many(fds, val, p->n_socket_fds + p->n_storage_fds, p->fds); if (r < 0) @@ -1522,7 +1534,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { _cleanup_free_ char *type = NULL, *prefix = NULL; ExecDirectoryType dt; - r = extract_many_words(&val, "= ", 0, &type, &prefix, NULL); + r = extract_many_words(&val, "= ", 0, &type, &prefix); if (r < 0) return r; if (r == 0) @@ -1585,7 +1597,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->stdin_fd = fd; + close_and_replace(p->stdin_fd, fd); } else if ((val = startswith(l, 
"exec-parameters-stdout-fd="))) { int fd; @@ -1594,7 +1606,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->stdout_fd = fd; + close_and_replace(p->stdout_fd, fd); } else if ((val = startswith(l, "exec-parameters-stderr-fd="))) { int fd; @@ -1603,7 +1615,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->stderr_fd = fd; + close_and_replace(p->stderr_fd, fd); } else if ((val = startswith(l, "exec-parameters-exec-fd="))) { int fd; @@ -1611,7 +1623,15 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->exec_fd = fd; + close_and_replace(p->exec_fd, fd); + } else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->handoff_timestamp_fd, fd); } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { int fd; @@ -1619,13 +1639,13 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->bpf_outer_map_fd = fd; + close_and_replace(p->bpf_restrict_fs_map_fd, fd); } else if ((val = startswith(l, "exec-parameters-notify-socket="))) { r = free_and_strdup(&p->notify_socket, val); if (r < 0) return r; } else if ((val = startswith(l, "exec-parameters-open-file="))) { - OpenFile *of = NULL; + OpenFile *of; r = open_file_parse(val, &of); if (r < 0) @@ -1643,7 +1663,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (fd < 0) continue; - p->user_lookup_fd = fd; + close_and_replace(p->user_lookup_fd, fd); } else if ((val = startswith(l, "exec-parameters-files-env="))) { r = deserialize_strv(val, &p->files_env); if (r < 0) @@ -1812,6 +1832,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item_tristate(f, 
"exec-context-mount-api-vfs", c->mount_apivfs); + if (r < 0) + return r; + r = serialize_item_tristate(f, "exec-context-memory-ksm", c->memory_ksm); if (r < 0) return r; @@ -1868,20 +1892,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; - if (c->mount_apivfs_set) { - r = serialize_bool(f, "exec-context-mount-api-vfs", c->mount_apivfs); - if (r < 0) - return r; - } - r = serialize_bool_elide(f, "exec-context-same-pgrp", c->same_pgrp); if (r < 0) return r; - r = serialize_bool_elide(f, "exec-context-cpu-sched-reset-on-fork", c->cpu_sched_reset_on_fork); - if (r < 0) - return r; - r = serialize_bool(f, "exec-context-ignore-sigpipe", c->ignore_sigpipe); if (r < 0) return r; @@ -2154,6 +2168,8 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + /* This is also passed to executor as an argument. So, the information should be redundant in general. + * But, let's keep this as is for consistency with other elements of ExecContext. See exec_spawn(). 
*/ r = serialize_item_format(f, "exec-context-log-level-max", "%d", c->log_level_max); if (r < 0) return r; @@ -2538,14 +2554,14 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (base64mem(sc->data, sc->size, &data) < 0) return log_oom_debug(); - r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, yes_no(sc->encrypted), data); + r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, data, yes_no(sc->encrypted)); if (r < 0) return r; } ExecLoadCredential *lc; HASHMAP_FOREACH(lc, c->load_credentials) { - r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, yes_no(lc->encrypted), lc->path); + r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, lc->path, yes_no(lc->encrypted)); if (r < 0) return r; } @@ -2636,7 +2652,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { break; p = word; - r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options); if (r < 0) return r; if (r == 0) @@ -2669,12 +2685,12 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { return r; } else if ((val = startswith(l, "exec-context-root-hash="))) { c->root_hash = mfree(c->root_hash); - r = unhexmem(val, strlen(val), &c->root_hash, &c->root_hash_size); + r = unhexmem(val, &c->root_hash, &c->root_hash_size); if (r < 0) return r; } else if ((val = startswith(l, "exec-context-root-hash-sig="))) { c->root_hash_sig = mfree(c->root_hash_sig); - r= unbase64mem(val, strlen(val), &c->root_hash_sig, &c->root_hash_sig_size); + r= unbase64mem(val, &c->root_hash_sig, &c->root_hash_sig_size); if (r < 0) return r; } else if ((val = startswith(l, "exec-context-root-ephemeral="))) { @@ -2695,6 +2711,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { r = safe_atoi(val, 
&c->private_mounts); if (r < 0) return r; + } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) { + r = safe_atoi(val, &c->mount_apivfs); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-context-memory-ksm="))) { r = safe_atoi(val, &c->memory_ksm); if (r < 0) @@ -2762,22 +2782,11 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { c->protect_system = protect_system_from_string(val); if (c->protect_system < 0) return -EINVAL; - } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) { - r = parse_boolean(val); - if (r < 0) - return r; - c->mount_apivfs = r; - c->mount_apivfs_set = true; } else if ((val = startswith(l, "exec-context-same-pgrp="))) { r = parse_boolean(val); if (r < 0) return r; c->same_pgrp = r; - } else if ((val = startswith(l, "exec-context-cpu-sched-reset-on-fork="))) { - r = parse_boolean(val); - if (r < 0) - return r; - c->cpu_sched_reset_on_fork = r; } else if ((val = startswith(l, "exec-context-non-blocking="))) { r = parse_boolean(val); if (r < 0) @@ -2828,7 +2837,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { _cleanup_free_ char *type = NULL, *mode = NULL; ExecDirectoryType dt; - r = extract_many_words(&val, "= ", 0, &type, &mode, NULL); + r = extract_many_words(&val, "= ", 0, &type, &mode); if (r < 0) return r; if (r == 0 || !mode) @@ -2854,7 +2863,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { break; p = tuple; - r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create, NULL); + r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create); if (r < 0) return r; if (r < 2) @@ -3054,7 +3063,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (c->stdin_data) return -EINVAL; /* duplicated */ - r = unbase64mem(val, strlen(val), &c->stdin_data, &c->stdin_data_size); + r = unbase64mem(val, &c->stdin_data, &c->stdin_data_size); if (r < 0) return r; } else if ((val = startswith(l, 
"exec-context-tty-path="))) { @@ -3098,6 +3107,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; } else if ((val = startswith(l, "exec-context-log-level-max="))) { + /* See comment in serialization. */ r = safe_atoi(val, &c->log_level_max); if (r < 0) return r; @@ -3314,7 +3324,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { } else if ((val = startswith(l, "exec-context-temporary-filesystems="))) { _cleanup_free_ char *path = NULL, *options = NULL; - r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options, NULL); + r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options); if (r < 0) return r; if (r < 1) @@ -3392,7 +3402,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL; int id, errno_num; - r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num, NULL); + r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num); if (r < 0) return r; if (r != 2) @@ -3432,7 +3442,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL; int id, errno_num; - r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num, NULL); + r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num); if (r < 0) return r; if (r != 2) @@ -3505,8 +3515,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &source, - &destination, - NULL); + &destination); if (r < 0) return r; if (r == 0) @@ -3538,8 +3547,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, - &opts, - NULL); + &opts); if (r < 0) return r; if (r == 0) @@ -3619,8 +3627,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, - &opts, 
- NULL); + &opts); if (r < 0) return r; if (r == 0) @@ -3669,7 +3676,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL; _cleanup_free_ char *id = NULL, *encrypted = NULL, *data = NULL; - r = extract_many_words(&val, " ", 0, &id, &encrypted, &data, NULL); + r = extract_many_words(&val, " ", EXTRACT_DONT_COALESCE_SEPARATORS, &id, &data, &encrypted); if (r < 0) return r; if (r != 3) @@ -3688,7 +3695,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { .encrypted = r, }; - r = unbase64mem(data, strlen(data), &sc->data, &sc->size); + r = unbase64mem(data, &sc->data, &sc->size); if (r < 0) return r; @@ -3701,7 +3708,7 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL; _cleanup_free_ char *id = NULL, *encrypted = NULL, *path = NULL; - r = extract_many_words(&val, " ", 0, &id, &encrypted, &path, NULL); + r = extract_many_words(&val, " ", EXTRACT_DONT_COALESCE_SEPARATORS, &id, &path, &encrypted); if (r < 0) return r; if (r != 3) diff --git a/src/core/execute.c b/src/core/execute.c index 8dbdfcf..513e95e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -147,7 +147,7 @@ void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) const char *path = exec_context_tty_path(context); - if (p && p->stdin_fd >= 0 && isatty(p->stdin_fd)) + if (p && p->stdin_fd >= 0 && isatty_safe(p->stdin_fd)) fd = p->stdin_fd; else if (path && (context->tty_path || is_terminal_input(context->std_input) || is_terminal_output(context->std_output) || is_terminal_output(context->std_error))) { @@ -162,9 +162,11 @@ void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) * that will be closed automatically, and operate on it for convenience. 
*/ lock_fd = lock_dev_console(); if (ERRNO_IS_NEG_PRIVILEGE(lock_fd)) - log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without: %m"); + log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without lock: %m"); + else if (ERRNO_IS_NEG_DEVICE_ABSENT(lock_fd)) + log_debug_errno(lock_fd, "Device /dev/console does not exist, proceeding without lock: %m"); else if (lock_fd < 0) - return (void) log_debug_errno(lock_fd, "Failed to lock /dev/console: %m"); + log_warning_errno(lock_fd, "Failed to lock /dev/console, proceeding without lock: %m"); if (context->tty_vhangup) (void) terminal_vhangup_fd(fd); @@ -351,19 +353,18 @@ static void log_command_line(Unit *unit, const char *msg, const char *executable static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); -int exec_spawn(Unit *unit, - ExecCommand *command, - const ExecContext *context, - ExecParameters *params, - ExecRuntime *runtime, - const CGroupContext *cgroup_context, - pid_t *ret) { +int exec_spawn( + Unit *unit, + ExecCommand *command, + const ExecContext *context, + ExecParameters *params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + PidRef *ret) { - char serialization_fd_number[DECIMAL_STR_MAX(int) + 1]; - _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL; + _cleanup_free_ char *subcgroup_path = NULL, *max_log_levels = NULL, *executor_path = NULL; _cleanup_fdset_free_ FDSet *fdset = NULL; _cleanup_fclose_ FILE *f = NULL; - pid_t pid; int r; assert(unit); @@ -371,10 +372,11 @@ int exec_spawn(Unit *unit, assert(unit->manager->executor_fd >= 0); assert(command); assert(context); - assert(ret); assert(params); - assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); + assert(!params->fds || FLAGS_SET(params->flags, EXEC_PASS_FDS)); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds == 0)); assert(!params->files_env); /* We fill this field, 
ensure it comes NULL-initialized to us */ + assert(ret); LOG_CONTEXT_PUSH_UNIT(unit); @@ -404,8 +406,8 @@ int exec_spawn(Unit *unit, * child's memory.max, serialize all the state needed to start the unit, and pass it to the * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec * and ensure all memory is shared. The child immediately execs the new binary so the delay should - * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the - * target cgroup. */ + * be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd + * and to clone directly into the target cgroup (if we booted with cgroupv2). */ r = open_serialization_file("sd-executor-state", &f); if (r < 0) @@ -430,39 +432,57 @@ int exec_spawn(Unit *unit, if (r < 0) return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m"); - r = log_level_to_string_alloc(log_get_max_level(), &log_level); + /* If LogLevelMax= is specified, then let's use the specified log level at the beginning of the + * executor process. To achieve that the specified log level is passed as an argument, rather than + * the one for the manager process. */ + r = log_max_levels_to_string(context->log_level_max >= 0 ? 
context->log_level_max : log_get_max_level(), &max_log_levels); if (r < 0) - return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m"); + return log_unit_error_errno(unit, r, "Failed to convert max log levels to string: %m"); r = fd_get_path(unit->manager->executor_fd, &executor_path); if (r < 0) return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m"); + char serialization_fd_number[DECIMAL_STR_MAX(int)]; xsprintf(serialization_fd_number, "%i", fileno(f)); + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + dual_timestamp start_timestamp; + + /* Record the start timestamp before we fork so that it is guaranteed to be earlier than the + * handoff timestamp. */ + dual_timestamp_now(&start_timestamp); + /* The executor binary is pinned, to avoid compatibility problems during upgrades. */ r = posix_spawn_wrapper( FORMAT_PROC_FD_PATH(unit->manager->executor_fd), STRV_MAKE(executor_path, "--deserialize", serialization_fd_number, - "--log-level", log_level, + "--log-level", max_log_levels, "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))), environ, - &pid); + cg_unified() > 0 ? subcgroup_path : NULL, + &pidref); + if (r == -EUCLEAN && subcgroup_path) + return log_unit_error_errno(unit, r, + "Failed to spawn process into cgroup '%s', because the cgroup " + "or one of its parents or siblings is in the threaded mode.", + subcgroup_path); if (r < 0) return log_unit_error_errno(unit, r, "Failed to spawn executor: %m"); - - log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); - /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the * process will be killed too). 
*/ - if (subcgroup_path) - (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); + if (r == 0 && subcgroup_path) + (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid); + /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */ + + log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)", + command->path, pidref.pid, r > 0 ? "via" : "without"); - exec_status_start(&command->exec_status, pid); + exec_status_start(&command->exec_status, pidref.pid, &start_timestamp); - *ret = pid; + *ret = TAKE_PIDREF(pidref); return 0; } @@ -491,6 +511,7 @@ void exec_context_init(ExecContext *c) { .tty_rows = UINT_MAX, .tty_cols = UINT_MAX, .private_mounts = -1, + .mount_apivfs = -1, .memory_ksm = -1, .set_login_environment = -1, }; @@ -664,13 +685,19 @@ void exec_command_done_array(ExecCommand *c, size_t n) { exec_command_done(i); } +ExecCommand* exec_command_free(ExecCommand *c) { + if (!c) + return NULL; + + exec_command_done(c); + return mfree(c); +} + ExecCommand* exec_command_free_list(ExecCommand *c) { ExecCommand *i; - while ((i = LIST_POP(command, c))) { - exec_command_done(i); - free(i); - } + while ((i = LIST_POP(command, c))) + exec_command_free(i); return NULL; } @@ -1396,7 +1423,7 @@ bool exec_context_maintains_privileges(const ExecContext *c) { if (!c->user) return true; - if (streq(c->user, "root") || streq(c->user, "0")) + if (STR_IN_SET(c->user, "root", "0")) return true; return false; @@ -1421,8 +1448,8 @@ bool exec_context_get_effective_mount_apivfs(const ExecContext *c) { assert(c); /* Explicit setting wins */ - if (c->mount_apivfs_set) - return c->mount_apivfs; + if (c->mount_apivfs >= 0) + return c->mount_apivfs > 0; /* Default to "yes" if root directory or image are specified */ if (exec_context_with_rootfs(c)) @@ -1657,6 +1684,15 @@ uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c) { return (uint64_t) MAX(r, 0); } +bool exec_context_get_set_login_environment(const ExecContext *c) { + 
assert(c); + + if (c->set_login_environment >= 0) + return c->set_login_environment; + + return c->user || c->dynamic_user || c->pam_name; +} + char** exec_context_get_syscall_filter(const ExecContext *c) { _cleanup_strv_free_ char **l = NULL; @@ -1787,14 +1823,17 @@ char** exec_context_get_restrict_filesystems(const ExecContext *c) { return l ? TAKE_PTR(l) : strv_new(NULL); } -void exec_status_start(ExecStatus *s, pid_t pid) { +void exec_status_start(ExecStatus *s, pid_t pid, const dual_timestamp *ts) { assert(s); *s = (ExecStatus) { .pid = pid, }; - dual_timestamp_now(&s->start_timestamp); + if (ts) + s->start_timestamp = *ts; + else + dual_timestamp_now(&s->start_timestamp); } void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) { @@ -1814,6 +1853,19 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int (void) utmp_put_dead_process(context->utmp_id, pid, code, status); } +void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts) { + assert(s); + assert(ucred); + assert(ts); + + if (ucred->pid != s->pid) + *s = (ExecStatus) { + .pid = ucred->pid, + }; + + s->handoff_timestamp = *ts; +} + void exec_status_reset(ExecStatus *s) { assert(s); @@ -1836,19 +1888,45 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { if (dual_timestamp_is_set(&s->start_timestamp)) fprintf(f, "%sStart Timestamp: %s\n", - prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime)); + prefix, FORMAT_TIMESTAMP_STYLE(s->start_timestamp.realtime, TIMESTAMP_US)); + + if (dual_timestamp_is_set(&s->handoff_timestamp) && dual_timestamp_is_set(&s->start_timestamp) && + s->handoff_timestamp.monotonic > s->start_timestamp.monotonic) + fprintf(f, + "%sHandoff Timestamp: %s since start\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->handoff_timestamp.monotonic, s->start_timestamp.monotonic), 1)); + else + fprintf(f, + "%sHandoff Timestamp: %s\n", + prefix, 
FORMAT_TIMESTAMP_STYLE(s->handoff_timestamp.realtime, TIMESTAMP_US)); + + if (dual_timestamp_is_set(&s->exit_timestamp)) { + + if (dual_timestamp_is_set(&s->handoff_timestamp) && s->exit_timestamp.monotonic > s->handoff_timestamp.monotonic) + fprintf(f, + "%sExit Timestamp: %s since handoff\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->handoff_timestamp.monotonic), 1)); + else if (dual_timestamp_is_set(&s->start_timestamp) && s->exit_timestamp.monotonic > s->start_timestamp.monotonic) + fprintf(f, + "%sExit Timestamp: %s since start\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->start_timestamp.monotonic), 1)); + else + fprintf(f, + "%sExit Timestamp: %s\n", + prefix, FORMAT_TIMESTAMP_STYLE(s->exit_timestamp.realtime, TIMESTAMP_US)); - if (dual_timestamp_is_set(&s->exit_timestamp)) fprintf(f, - "%sExit Timestamp: %s\n" "%sExit Code: %s\n" "%sExit Status: %i\n", - prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime), prefix, sigchld_code_to_string(s->code), prefix, s->status); + } } -static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { +void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { _cleanup_free_ char *cmd = NULL; const char *prefix2; @@ -1951,8 +2029,7 @@ static char *destroy_tree(char *path) { } void exec_shared_runtime_done(ExecSharedRuntime *rt) { - if (!rt) - return; + assert(rt); if (rt->manager) (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id); @@ -1965,8 +2042,10 @@ void exec_shared_runtime_done(ExecSharedRuntime *rt) { } static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { - exec_shared_runtime_done(rt); + if (!rt) + return NULL; + exec_shared_runtime_done(rt); return mfree(rt); } @@ -2090,15 +2169,13 @@ static int exec_shared_runtime_make( return r; } - if (exec_needs_network_namespace(c)) { + if (exec_needs_network_namespace(c)) if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, 
netns_storage_socket) < 0) return -errno; - } - if (exec_needs_ipc_namespace(c)) { + if (exec_needs_ipc_namespace(c)) if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0) return -errno; - } r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret); if (r < 0) @@ -2488,7 +2565,7 @@ void exec_params_shallow_clear(ExecParameters *p) { p->fds = mfree(p->fds); p->exec_fd = safe_close(p->exec_fd); p->user_lookup_fd = -EBADF; - p->bpf_outer_map_fd = -EBADF; + p->bpf_restrict_fs_map_fd = -EBADF; p->unit_id = mfree(p->unit_id); p->invocation_id = SD_ID128_NULL; p->invocation_id_string[0] = '\0'; @@ -2643,46 +2720,46 @@ ExecCleanMask exec_clean_mask_from_string(const char *s) { } static const char* const exec_input_table[_EXEC_INPUT_MAX] = { - [EXEC_INPUT_NULL] = "null", - [EXEC_INPUT_TTY] = "tty", + [EXEC_INPUT_NULL] = "null", + [EXEC_INPUT_TTY] = "tty", [EXEC_INPUT_TTY_FORCE] = "tty-force", - [EXEC_INPUT_TTY_FAIL] = "tty-fail", - [EXEC_INPUT_SOCKET] = "socket", - [EXEC_INPUT_NAMED_FD] = "fd", - [EXEC_INPUT_DATA] = "data", - [EXEC_INPUT_FILE] = "file", + [EXEC_INPUT_TTY_FAIL] = "tty-fail", + [EXEC_INPUT_SOCKET] = "socket", + [EXEC_INPUT_NAMED_FD] = "fd", + [EXEC_INPUT_DATA] = "data", + [EXEC_INPUT_FILE] = "file", }; DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput); static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { - [EXEC_OUTPUT_INHERIT] = "inherit", - [EXEC_OUTPUT_NULL] = "null", - [EXEC_OUTPUT_TTY] = "tty", - [EXEC_OUTPUT_KMSG] = "kmsg", - [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", - [EXEC_OUTPUT_JOURNAL] = "journal", + [EXEC_OUTPUT_INHERIT] = "inherit", + [EXEC_OUTPUT_NULL] = "null", + [EXEC_OUTPUT_TTY] = "tty", + [EXEC_OUTPUT_KMSG] = "kmsg", + [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", + [EXEC_OUTPUT_JOURNAL] = "journal", [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console", - [EXEC_OUTPUT_SOCKET] = "socket", - [EXEC_OUTPUT_NAMED_FD] = "fd", - [EXEC_OUTPUT_FILE] = 
"file", - [EXEC_OUTPUT_FILE_APPEND] = "append", - [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate", + [EXEC_OUTPUT_SOCKET] = "socket", + [EXEC_OUTPUT_NAMED_FD] = "fd", + [EXEC_OUTPUT_FILE] = "file", + [EXEC_OUTPUT_FILE_APPEND] = "append", + [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate", }; DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = { - [EXEC_UTMP_INIT] = "init", + [EXEC_UTMP_INIT] = "init", [EXEC_UTMP_LOGIN] = "login", - [EXEC_UTMP_USER] = "user", + [EXEC_UTMP_USER] = "user", }; DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode); static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = { - [EXEC_PRESERVE_NO] = "no", - [EXEC_PRESERVE_YES] = "yes", + [EXEC_PRESERVE_NO] = "no", + [EXEC_PRESERVE_YES] = "yes", [EXEC_PRESERVE_RESTART] = "restart", }; @@ -2690,10 +2767,10 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EX /* This table maps ExecDirectoryType to the setting it is configured with in the unit */ static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory", - [EXEC_DIRECTORY_STATE] = "StateDirectory", - [EXEC_DIRECTORY_CACHE] = "CacheDirectory", - [EXEC_DIRECTORY_LOGS] = "LogsDirectory", + [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory", + [EXEC_DIRECTORY_STATE] = "StateDirectory", + [EXEC_DIRECTORY_CACHE] = "CacheDirectory", + [EXEC_DIRECTORY_LOGS] = "LogsDirectory", [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory", }; @@ -2724,10 +2801,10 @@ DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_mode, ExecDirectoryType); * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit * directories, specifically .timer units with their timestamp touch file. 
*/ static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = "runtime", - [EXEC_DIRECTORY_STATE] = "state", - [EXEC_DIRECTORY_CACHE] = "cache", - [EXEC_DIRECTORY_LOGS] = "logs", + [EXEC_DIRECTORY_RUNTIME] = "runtime", + [EXEC_DIRECTORY_STATE] = "state", + [EXEC_DIRECTORY_CACHE] = "cache", + [EXEC_DIRECTORY_LOGS] = "logs", [EXEC_DIRECTORY_CONFIGURATION] = "configuration", }; @@ -2736,7 +2813,7 @@ DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType); static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = { [EXEC_KEYRING_INHERIT] = "inherit", [EXEC_KEYRING_PRIVATE] = "private", - [EXEC_KEYRING_SHARED] = "shared", + [EXEC_KEYRING_SHARED] = "shared", }; DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode); diff --git a/src/core/execute.h b/src/core/execute.h index 5a6927a..107ae25 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -91,6 +91,7 @@ typedef enum ExecKeyringMode { struct ExecStatus { dual_timestamp start_timestamp; dual_timestamp exit_timestamp; + dual_timestamp handoff_timestamp; pid_t pid; int code; /* as in siginfo_t::si_code */ int status; /* as in siginfo_t::si_status */ @@ -199,7 +200,6 @@ struct ExecContext { bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; - bool mount_apivfs_set:1; /* This is not exposed to the user but available internally. 
We need it to make sure that whenever we * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects @@ -312,6 +312,7 @@ struct ExecContext { ProcSubset proc_subset; /* subset= */ int private_mounts; + int mount_apivfs; int memory_ksm; bool private_tmp; bool private_network; @@ -326,7 +327,6 @@ struct ExecContext { ProtectSystem protect_system; ProtectHome protect_home; bool protect_hostname; - bool mount_apivfs; bool dynamic_user; bool remove_ipc; @@ -390,22 +390,23 @@ static inline bool exec_context_with_rootfs(const ExecContext *c) { } typedef enum ExecFlags { - EXEC_APPLY_SANDBOXING = 1 << 0, - EXEC_APPLY_CHROOT = 1 << 1, - EXEC_APPLY_TTY_STDIN = 1 << 2, - EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */ - EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */ - EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */ - EXEC_CGROUP_DELEGATE = 1 << 6, - EXEC_IS_CONTROL = 1 << 7, - EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */ - EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */ + EXEC_APPLY_SANDBOXING = 1 << 0, + EXEC_APPLY_CHROOT = 1 << 1, + EXEC_APPLY_TTY_STDIN = 1 << 2, + EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */ + EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */ + EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */ + EXEC_CGROUP_DELEGATE = 1 << 6, + EXEC_IS_CONTROL = 1 << 7, + 
EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */ + EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */ + EXEC_SETUP_CREDENTIALS_FRESH = 1 << 10, /* Set up a new credential store (disable reuse) */ /* The following are not used by execute.c, but by consumers internally */ - EXEC_PASS_FDS = 1 << 10, - EXEC_SETENV_RESULT = 1 << 11, - EXEC_SET_WATCHDOG = 1 << 12, - EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */ + EXEC_PASS_FDS = 1 << 11, + EXEC_SETENV_RESULT = 1 << 12, + EXEC_SET_WATCHDOG = 1 << 13, + EXEC_SETENV_MONITOR_RESULT = 1 << 14, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */ } ExecFlags; /* Parameters for a specific invocation of a command. This structure is put together right before a command is @@ -442,7 +443,7 @@ struct ExecParameters { int stdout_fd; int stderr_fd; - /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. 
*/ int exec_fd; char *notify_socket; @@ -453,7 +454,9 @@ struct ExecParameters { char **files_env; int user_lookup_fd; - int bpf_outer_map_fd; + int handoff_timestamp_fd; + + int bpf_restrict_fs_map_fd; /* Used for logging in the executor functions */ char *unit_id; @@ -461,34 +464,40 @@ struct ExecParameters { char invocation_id_string[SD_ID128_STRING_MAX]; }; -#define EXEC_PARAMETERS_INIT(_flags) \ - (ExecParameters) { \ - .flags = (_flags), \ - .stdin_fd = -EBADF, \ - .stdout_fd = -EBADF, \ - .stderr_fd = -EBADF, \ - .exec_fd = -EBADF, \ - .bpf_outer_map_fd = -EBADF, \ - .user_lookup_fd = -EBADF, \ - }; +#define EXEC_PARAMETERS_INIT(_flags) \ + (ExecParameters) { \ + .flags = (_flags), \ + .stdin_fd = -EBADF, \ + .stdout_fd = -EBADF, \ + .stderr_fd = -EBADF, \ + .exec_fd = -EBADF, \ + .bpf_restrict_fs_map_fd = -EBADF, \ + .user_lookup_fd = -EBADF, \ + .handoff_timestamp_fd = -EBADF, \ + } #include "unit.h" #include "dynamic-user.h" -int exec_spawn(Unit *unit, - ExecCommand *command, - const ExecContext *context, - ExecParameters *exec_params, - ExecRuntime *runtime, - const CGroupContext *cgroup_context, - pid_t *ret); +int exec_spawn( + Unit *unit, + ExecCommand *command, + const ExecContext *context, + ExecParameters *exec_params, + ExecRuntime *runtime, + const CGroupContext *cgroup_context, + PidRef *ret); void exec_command_done(ExecCommand *c); void exec_command_done_array(ExecCommand *c, size_t n); +ExecCommand* exec_command_free(ExecCommand *c); +DEFINE_TRIVIAL_CLEANUP_FUNC(ExecCommand*, exec_command_free); ExecCommand* exec_command_free_list(ExecCommand *c); void exec_command_free_array(ExecCommand **c, size_t n); void exec_command_reset_status_array(ExecCommand *c, size_t n); void exec_command_reset_status_list_array(ExecCommand **c, size_t n); + +void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix); void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix); void exec_command_append_list(ExecCommand **l, ExecCommand *e); 
int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_; @@ -527,14 +536,16 @@ int exec_context_get_nice(const ExecContext *c); int exec_context_get_cpu_sched_policy(const ExecContext *c); int exec_context_get_cpu_sched_priority(const ExecContext *c); uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c); +bool exec_context_get_set_login_environment(const ExecContext *c); char** exec_context_get_syscall_filter(const ExecContext *c); char** exec_context_get_syscall_archs(const ExecContext *c); char** exec_context_get_syscall_log(const ExecContext *c); char** exec_context_get_address_families(const ExecContext *c); char** exec_context_get_restrict_filesystems(const ExecContext *c); -void exec_status_start(ExecStatus *s, pid_t pid); +void exec_status_start(ExecStatus *s, pid_t pid, const dual_timestamp *ts); void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status); +void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts); void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix); void exec_status_reset(ExecStatus *s); @@ -613,23 +624,23 @@ bool exec_needs_ipc_namespace(const ExecContext *context); #define LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep) \ ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=%s" : "INVOCATION_ID=%s") -#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \ - ({ \ - const ExecContext *_c = (ec); \ - const ExecParameters *_p = (ep); \ - const int _l = (level); \ - bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \ - !(_c->log_level_max < 0 || \ - _c->log_level_max >= LOG_PRI(_l))); \ - LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ - _c->n_log_extra_fields); \ - !_do_log ? 
-ERRNO_VALUE(error) : \ - log_object_internal(_l, error, PROJECT_FILE, \ - __LINE__, __func__, \ - LOG_EXEC_ID_FIELD(_p), \ - _p->unit_id, \ - LOG_EXEC_INVOCATION_ID_FIELD(_p), \ - _p->invocation_id_string, ##__VA_ARGS__); \ +#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \ + ({ \ + const ExecContext *_c = (ec); \ + const ExecParameters *_p = (ep); \ + const int _l = (level); \ + bool _do_log = _c->log_level_max < 0 || \ + _c->log_level_max >= LOG_PRI(_l); \ + LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ + _c->n_log_extra_fields); \ + !_do_log ? -ERRNO_VALUE(error) : \ + log_object_internal(_l, error, \ + PROJECT_FILE, __LINE__, __func__, \ + LOG_EXEC_ID_FIELD(_p), \ + _p->unit_id, \ + LOG_EXEC_INVOCATION_ID_FIELD(_p), \ + _p->invocation_id_string, \ + ##__VA_ARGS__); \ }) #define log_exec_full_errno(ec, ep, level, error, ...) \ @@ -653,48 +664,34 @@ bool exec_needs_ipc_namespace(const ExecContext *context); #define log_exec_warning_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_WARNING, error, __VA_ARGS__) #define log_exec_error_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_ERR, error, __VA_ARGS__) -#define log_exec_struct_errno(ec, ep, level, error, ...) \ - ({ \ - const ExecContext *_c = (ec); \ - const ExecParameters *_p = (ep); \ - const int _l = (level); \ - bool _do_log = !(_c->log_level_max < 0 || \ - _c->log_level_max >= LOG_PRI(_l)); \ - LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ - _c->n_log_extra_fields); \ - _do_log ? \ - log_struct_errno(_l, error, __VA_ARGS__, LOG_EXEC_ID_FIELD_FORMAT(_p), _p->unit_id) : \ - -ERRNO_VALUE(error); \ - }) - -#define log_exec_struct(ec, ep, level, ...) 
log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__) - -#define log_exec_struct_iovec_errno(ec, ep, level, error, iovec, n_iovec) \ - ({ \ - const ExecContext *_c = (ec); \ - const ExecParameters *_p = (ep); \ - const int _l = (level); \ - bool _do_log = !(_c->log_level_max < 0 || \ - _c->log_level_max >= LOG_PRI(_l)); \ - LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ - _c->n_log_extra_fields); \ - _do_log ? \ - log_struct_iovec_errno(_l, error, iovec, n_iovec) : \ - -ERRNO_VALUE(error); \ - }) - -#define log_exec_struct_iovec(ec, ep, level, iovec, n_iovec) log_exec_struct_iovec_errno(ec, ep, level, 0, iovec, n_iovec) - /* Like LOG_MESSAGE(), but with the unit name prefixed. */ #define LOG_EXEC_MESSAGE(ep, fmt, ...) LOG_MESSAGE("%s: " fmt, (ep)->unit_id, ##__VA_ARGS__) #define LOG_EXEC_ID(ep) LOG_EXEC_ID_FIELD_FORMAT(ep), (ep)->unit_id #define LOG_EXEC_INVOCATION_ID(ep) LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep), (ep)->invocation_id_string -#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \ - const ExecContext *c = (ec); \ - const ExecParameters *p = (ep); \ +#define log_exec_struct_errno(ec, ep, level, error, ...) \ + ({ \ + const ExecContext *_c = (ec); \ + const ExecParameters *_p = (ep); \ + const int _l = (level); \ + bool _do_log = _c->log_level_max < 0 || \ + _c->log_level_max >= LOG_PRI(_l); \ + LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \ + _c->n_log_extra_fields); \ + !_do_log ? -ERRNO_VALUE(error) : \ + log_struct_errno(_l, error, \ + LOG_EXEC_ID(_p), \ + LOG_EXEC_INVOCATION_ID(_p), \ + __VA_ARGS__); \ + }) + +#define log_exec_struct(ec, ep, level, ...) 
log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__) + +#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \ + const ExecContext *c = (ec); \ + const ExecParameters *p = (ep); \ LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_ID_FIELD(p), p->unit_id); \ - LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \ + LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \ LOG_CONTEXT_PUSH_IOV(c->log_extra_fields, c->n_log_extra_fields) #define LOG_CONTEXT_PUSH_EXEC(ec, ep) \ diff --git a/src/core/executor.c b/src/core/executor.c index b2716ef..bd0c742 100644 --- a/src/core/executor.c +++ b/src/core/executor.c @@ -245,12 +245,13 @@ static int run(int argc, char *argv[]) { log_exec_struct_errno(&context, &params, LOG_ERR, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, - LOG_EXEC_INVOCATION_ID(&params), LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m", status, command.path), "EXECUTABLE=%s", command.path); } else - assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */ + /* r == 0: 'skip' is chosen in the confirm spawn prompt + * r > 0: expected/ignored failure, do not log at error level */ + assert((r == 0) == (exit_status == EXIT_SUCCESS)); return exit_status; } diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c index 6069efd..5b2dc95 100644 --- a/src/core/fuzz-execute-serialize.c +++ b/src/core/fuzz-execute-serialize.c @@ -56,7 +56,7 @@ static void exec_fuzz_one(FILE *f, FDSet *fdset) { params.stderr_fd = -EBADF; params.exec_fd = -EBADF; params.user_lookup_fd = -EBADF; - params.bpf_outer_map_fd = -EBADF; + params.bpf_restrict_fs_map_fd = -EBADF; if (!params.fds) params.n_socket_fds = params.n_storage_fds = 0; for (size_t i = 0; params.fds && i < params.n_socket_fds + params.n_storage_fds; i++) diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c index 00d6ad6..b16211e 100644 --- a/src/core/generator-setup.c +++ b/src/core/generator-setup.c 
@@ -8,7 +8,7 @@ #include "rm-rf.h" int lookup_paths_mkdir_generator(LookupPaths *p) { - int r, q; + int r; assert(p); @@ -16,14 +16,8 @@ int lookup_paths_mkdir_generator(LookupPaths *p) { return -EINVAL; r = mkdir_p_label(p->generator, 0755); - - q = mkdir_p_label(p->generator_early, 0755); - if (q < 0 && r >= 0) - r = q; - - q = mkdir_p_label(p->generator_late, 0755); - if (q < 0 && r >= 0) - r = q; + RET_GATHER(r, mkdir_p_label(p->generator_early, 0755)); + RET_GATHER(r, mkdir_p_label(p->generator_late, 0755)); return r; } diff --git a/src/core/import-creds.c b/src/core/import-creds.c index 48f3160..f27ffed 100644 --- a/src/core/import-creds.c +++ b/src/core/import-creds.c @@ -80,7 +80,7 @@ static int acquire_credential_directory(ImportCredentialContext *c, const char * if (c->target_dir_fd >= 0) return c->target_dir_fd; - r = path_is_mount_point(path, NULL, 0); + r = path_is_mount_point(path); if (r < 0) { if (r != -ENOENT) return log_error_errno(r, "Failed to determine if %s is a mount point: %m", path); @@ -314,7 +314,7 @@ static int proc_cmdline_callback(const char *key, const char *value, void *data) colon++; if (base64) { - r = unbase64mem(colon, SIZE_MAX, &binary, &l); + r = unbase64mem(colon, &binary, &l); if (r < 0) { log_warning_errno(r, "Failed to decode binary credential '%s' data, ignoring: %m", n); return 0; @@ -519,13 +519,13 @@ static int parse_smbios_strings(ImportCredentialContext *c, const char *data, si return log_oom(); if (!credential_name_valid(cn)) { - log_warning("SMBIOS credential name '%s' is not valid, ignoring: %m", cn); + log_warning("SMBIOS credential name '%s' is not valid, ignoring.", cn); continue; } /* Optionally base64 decode the data, if requested, to allow binary credentials */ if (unbase64) { - r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen); + r = unbase64mem_full(eq + 1, nul - (eq + 1), /* secure = */ false, &buf, &buflen); if (r < 0) { log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", 
cn); continue; @@ -753,7 +753,7 @@ static int merge_credentials_trusted(const char *creds_dir) { return 0; /* Do not try to merge initrd credentials into foreign credentials directories */ - if (!path_equal_ptr(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) { + if (!path_equal(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) { log_debug("Not importing initrd credentials, as foreign $CREDENTIALS_DIRECTORY has been set."); return 0; } @@ -815,7 +815,6 @@ static int setenv_notify_socket(void) { static int report_credentials_per_func(const char *title, int (*get_directory_func)(const char **ret)) { _cleanup_free_ DirectoryEntries *de = NULL; - _cleanup_close_ int dir_fd = -EBADF; _cleanup_free_ char *ll = NULL; const char *d = NULL; int r, c = 0; @@ -831,11 +830,7 @@ static int report_credentials_per_func(const char *title, int (*get_directory_fu return log_warning_errno(r, "Failed to determine %s directory: %m", title); } - dir_fd = open(d, O_RDONLY|O_DIRECTORY|O_CLOEXEC); - if (dir_fd < 0) - return log_warning_errno(errno, "Failed to open credentials directory %s: %m", d); - - r = readdir_all(dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + r = readdir_all_at(AT_FDCWD, d, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); if (r < 0) return log_warning_errno(r, "Failed to enumerate credentials directory %s: %m", d); diff --git a/src/core/job.c b/src/core/job.c index e78c2a7..2f19468 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -133,6 +133,7 @@ Job* job_free(Job *j) { static void job_set_state(Job *j, JobState state) { assert(j); + assert(j->manager); assert(state >= 0); assert(state < _JOB_STATE_MAX); @@ -145,15 +146,15 @@ static void job_set_state(Job *j, JobState state) { return; if (j->state == JOB_RUNNING) - j->unit->manager->n_running_jobs++; + j->manager->n_running_jobs++; else { assert(j->state == JOB_WAITING); - assert(j->unit->manager->n_running_jobs > 0); + assert(j->manager->n_running_jobs > 0); - j->unit->manager->n_running_jobs--; + 
j->manager->n_running_jobs--; - if (j->unit->manager->n_running_jobs <= 0) - j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source); + if (j->manager->n_running_jobs <= 0) + j->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->manager->jobs_in_progress_event_source); } } @@ -281,6 +282,8 @@ int job_install_deserialized(Job *j) { Job **pj; int r; + assert(j); + assert(j->manager); assert(!j->installed); if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION) @@ -307,7 +310,7 @@ int job_install_deserialized(Job *j) { j->installed = true; if (j->state == JOB_RUNNING) - j->unit->manager->n_running_jobs++; + j->manager->n_running_jobs++; log_unit_debug(j->unit, "Reinstalled deserialized job %s/%s as %u", @@ -633,16 +636,19 @@ static const char* job_done_message_format(Unit *u, JobType t, JobResult result) [JOB_UNSUPPORTED] = "Starting of %s unsupported.", [JOB_COLLECTED] = "Unnecessary job was removed for %s.", [JOB_ONCE] = "Unit %s has been started before and cannot be started again.", + [JOB_FROZEN] = "Cannot start frozen unit %s.", }; static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = { [JOB_DONE] = "Stopped %s.", [JOB_FAILED] = "Stopped %s with error.", [JOB_TIMEOUT] = "Timed out stopping %s.", + [JOB_FROZEN] = "Cannot stop frozen unit %s.", }; static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = { [JOB_DONE] = "Reloaded %s.", [JOB_FAILED] = "Reload failed for %s.", [JOB_TIMEOUT] = "Timed out reloading %s.", + [JOB_FROZEN] = "Cannot reload frozen unit %s.", }; /* When verify-active detects the unit is inactive, report it. 
* Most likely a DEPEND warning from a requisiting unit will @@ -704,6 +710,7 @@ static const struct { [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" }, [JOB_COLLECTED] = { LOG_INFO, }, [JOB_ONCE] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " ONCE " }, + [JOB_FROZEN] = { LOG_ERR, ANSI_HIGHLIGHT_RED, "FROZEN" }, }; static const char* job_done_mid(JobType type, JobResult result) { @@ -954,6 +961,8 @@ int job_run_and_invalidate(Job *j) { r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false); else if (r == -ESTALE) r = job_finish_and_invalidate(j, JOB_ONCE, true, false); + else if (r == -EDEADLK) + r = job_finish_and_invalidate(j, JOB_FROZEN, true, false); else if (r < 0) r = job_finish_and_invalidate(j, JOB_FAILED, true, false); } @@ -1011,7 +1020,7 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr goto finish; } - if (IN_SET(result, JOB_FAILED, JOB_INVALID)) + if (IN_SET(result, JOB_FAILED, JOB_INVALID, JOB_FROZEN)) j->manager->n_failed_jobs++; job_uninstall(j); @@ -1369,6 +1378,7 @@ int job_coldplug(Job *j) { void job_shutdown_magic(Job *j) { assert(j); + assert(j->manager); /* The shutdown target gets some special treatment here: we * tell the kernel to begin with flushing its disk caches, to @@ -1381,16 +1391,19 @@ void job_shutdown_magic(Job *j) { if (j->type != JOB_START) return; - if (!MANAGER_IS_SYSTEM(j->unit->manager)) + if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET)) return; - if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET)) + /* This is the very beginning of the shutdown phase, so take the timestamp here */ + dual_timestamp_now(j->manager->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START); + + if (!MANAGER_IS_SYSTEM(j->manager)) return; /* In case messages on console has been disabled on boot */ - j->unit->manager->no_console_output = false; + j->manager->no_console_output = false; - manager_invalidate_startup_units(j->unit->manager); + manager_invalidate_startup_units(j->manager); if 
(detect_container() > 0) return; @@ -1430,6 +1443,7 @@ bool job_may_gc(Job *j) { Unit *other; assert(j); + assert(j->manager); /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their * own and just track external state. For now the only unit type that qualifies for this are .device units. @@ -1450,7 +1464,7 @@ bool job_may_gc(Job *j) { * referenced by one, and reset this whenever we notice that no private bus connections are around. This means * the GC is a bit too conservative when it comes to jobs created by private bus connections. */ if (j->ref_by_private_bus) { - if (set_isempty(j->unit->manager->private_buses)) + if (set_isempty(j->manager->private_buses)) j->ref_by_private_bus = false; else return false; @@ -1473,6 +1487,7 @@ bool job_may_gc(Job *j) { void job_add_to_gc_queue(Job *j) { assert(j); + assert(j->manager); if (j->in_gc_queue) return; @@ -1480,7 +1495,7 @@ void job_add_to_gc_queue(Job *j) { if (!job_may_gc(j)) return; - LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j); + LIST_PREPEND(gc_queue, j->manager->gc_job_queue, j); j->in_gc_queue = true; } @@ -1645,6 +1660,7 @@ static const char* const job_result_table[_JOB_RESULT_MAX] = { [JOB_UNSUPPORTED] = "unsupported", [JOB_COLLECTED] = "collected", [JOB_ONCE] = "once", + [JOB_FROZEN] = "frozen", }; DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult); diff --git a/src/core/job.h b/src/core/job.h index 891d87a..8318b52 100644 --- a/src/core/job.h +++ b/src/core/job.h @@ -96,6 +96,7 @@ enum JobResult { JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */ JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */ JOB_ONCE, /* Unit was started before, and hence can't be started again */ + JOB_FROZEN, /* Unit is currently frozen, so we can't safely operate on it */ _JOB_RESULT_MAX, _JOB_RESULT_INVALID = -EINVAL, }; diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c index 
b8e3f7a..c39b136 100644 --- a/src/core/kmod-setup.c +++ b/src/core/kmod-setup.c @@ -9,28 +9,13 @@ #include "fileio.h" #include "kmod-setup.h" #include "macro.h" +#include "module-util.h" #include "recurse-dir.h" #include "string-util.h" #include "strv.h" #include "virt.h" #if HAVE_KMOD -#include "module-util.h" - -static void systemd_kmod_log( - void *data, - int priority, - const char *file, int line, - const char *fn, - const char *format, - va_list args) { - - /* library logging is enabled at debug only */ - DISABLE_WARNING_FORMAT_NONLITERAL; - log_internalv(LOG_DEBUG, 0, file, line, fn, format, args); - REENABLE_WARNING; -} - static int match_modalias_recurse_dir_cb( RecurseDirEvent event, const char *path, @@ -113,12 +98,11 @@ static bool in_qemu(void) { int kmod_setup(void) { #if HAVE_KMOD - static const struct { const char *module; const char *path; - bool warn_if_unavailable:1; - bool warn_if_module:1; + bool warn_if_unavailable; + bool warn_if_module; bool (*condition_fn)(void); } kmod_table[] = { /* This one we need to load explicitly, since auto-loading on use doesn't work @@ -166,34 +150,32 @@ int kmod_setup(void) { { "tpm", "/sys/class/tpmrm", false, false, efi_has_tpm2 }, #endif }; - _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL; - unsigned i; + + int r; if (have_effective_cap(CAP_SYS_MODULE) <= 0) return 0; - for (i = 0; i < ELEMENTSOF(kmod_table); i++) { - if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0) + _cleanup_(sym_kmod_unrefp) struct kmod_ctx *ctx = NULL; + FOREACH_ELEMENT(kmod, kmod_table) { + if (kmod->path && access(kmod->path, F_OK) >= 0) continue; - if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn()) + if (kmod->condition_fn && !kmod->condition_fn()) continue; - if (kmod_table[i].warn_if_module) + if (kmod->warn_if_module) log_debug("Your kernel apparently lacks built-in %s support. Might be " "a good idea to compile it in. 
We'll now try to work around " - "this by loading the module...", kmod_table[i].module); + "this by loading the module...", kmod->module); if (!ctx) { - ctx = kmod_new(NULL, NULL); - if (!ctx) - return log_oom(); - - kmod_set_log_fn(ctx, systemd_kmod_log, NULL); - kmod_load_resources(ctx); + r = module_setup_context(&ctx); + if (r < 0) + return log_error_errno(r, "Failed to initialize kmod context: %m"); } - (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable); + (void) module_load_and_warn(ctx, kmod->module, kmod->warn_if_unavailable); } #endif diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 45f9ab0..df219d8 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -136,7 +136,7 @@ {{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system) {{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home) {{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag) -{{type}}.MountAPIVFS, config_parse_exec_mount_apivfs, 0, offsetof({{type}}, exec_context) +{{type}}.MountAPIVFS, config_parse_tristate, 0, offsetof({{type}}, exec_context.mount_apivfs) {{type}}.Personality, config_parse_personality, 0, offsetof({{type}}, exec_context.personality) {{type}}.RuntimeDirectoryPreserve, config_parse_exec_preserve_mode, 0, offsetof({{type}}, exec_context.runtime_directory_preserve_mode) {{type}}.RuntimeDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode) @@ -220,6 +220,7 @@ {{type}}.StartupMemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) {{type}}.MemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) {{type}}.StartupMemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) 
+{{type}}.MemoryZSwapWriteback, config_parse_bool, 0, offsetof({{type}}, cgroup_context.memory_zswap_writeback) {{type}}.MemoryLimit, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context) {{type}}.DeviceAllow, config_parse_device_allow, 0, offsetof({{type}}, cgroup_context) {{type}}.DevicePolicy, config_parse_device_policy, 0, offsetof({{type}}, cgroup_context.device_policy) @@ -309,7 +310,8 @@ Unit.PartOf, config_parse_unit_deps, Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0 Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0 Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0 -Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0 +Unit.RequiresMountsFor, config_parse_unit_mounts_for, 0, 0 +Unit.WantsMountsFor, config_parse_unit_mounts_for, 0, 0 Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded) Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start) Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop) @@ -325,7 +327,7 @@ Unit.IgnoreOnSnapshot, config_parse_warn_compat, Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0 Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0 Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action) -Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg) +Unit.JobTimeoutRebootArgument, config_parse_reboot_parameter, 0, offsetof(Unit, job_timeout_reboot_arg) Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) {# The following is a legacy alias name for compatibility #} Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval) @@ -335,7 +337,7 @@ Unit.FailureAction, config_parse_emergency_action, Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action) 
Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status) Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status) -Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg) +Unit.RebootArgument, config_parse_reboot_parameter, 0, offsetof(Unit, reboot_arg) Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions) Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions) Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions) @@ -498,6 +500,7 @@ Socket.FreeBind, config_parse_bool, Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent) Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast) Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred) +Socket.PassFileDescriptorsToExec, config_parse_bool, 0, offsetof(Socket, pass_fds_to_exec) Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec) Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo) Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping) @@ -530,7 +533,7 @@ Socket.SELinuxContextFromNet, config_parse_warn_compat, {{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }} {{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }} {{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }} -Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what) +Mount.What, config_parse_mount_node, 0, offsetof(Mount, parameters_fragment.what) Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where) Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options) Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype) @@ -547,7 +550,7 @@ Automount.Where, 
config_parse_unit_path_printf, Automount.ExtraOptions, config_parse_unit_string_printf, 0, offsetof(Automount, extra_options) Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode) Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec) -Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what) +Swap.What, config_parse_mount_node, 0, offsetof(Swap, parameters_fragment.what) Swap.Priority, config_parse_swap_priority, 0, 0 Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options) Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 0baf08e..5ae6888 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -16,8 +16,8 @@ #include "all-units.h" #include "alloc-util.h" #include "bpf-firewall.h" -#include "bpf-lsm.h" #include "bpf-program.h" +#include "bpf-restrict-fs.h" #include "bpf-socket-bind.h" #include "bus-error.h" #include "bus-internal.h" @@ -38,6 +38,7 @@ #include "fileio.h" #include "firewall-util.h" #include "fs-util.h" +#include "fstab-util.h" #include "hexdecoct.h" #include "iovec-util.h" #include "ioprio-util.h" @@ -56,6 +57,7 @@ #include "pcre2-util.h" #include "percent-util.h" #include "process-util.h" +#include "reboot-util.h" #include "seccomp-util.h" #include "securebits-util.h" #include "selinux-util.h" @@ -248,7 +250,7 @@ int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, cons /* Fragment paths should also be equal as a custom fragment for a specific template instance * wouldn't necessarily lead to infinite recursion. 
*/ - if (!path_equal_ptr(u->fragment_path, fragment_path)) + if (!path_equal(u->fragment_path, fragment_path)) return false; if (!contains_instance_specifier_superset(format)) @@ -361,6 +363,40 @@ int config_parse_unit_string_printf( return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); } +int config_parse_reboot_parameter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_free_ char *k = NULL; + const Unit *u = ASSERT_PTR(userdata); + int r; + + assert(filename); + assert(line); + assert(rvalue); + + r = unit_full_printf(u, rvalue, &k); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + if (!reboot_parameter_is_valid(k)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid reboot parameter '%s', ignoring.", k); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata); +} + int config_parse_unit_strv_printf( const char *unit, const char *filename, @@ -433,8 +469,9 @@ int config_parse_colon_separated_paths( const char *rvalue, void *data, void *userdata) { + char ***sv = ASSERT_PTR(data); - const Unit *u = userdata; + const Unit *u = ASSERT_PTR(userdata); int r; assert(filename); @@ -574,17 +611,13 @@ int config_parse_socket_listen( void *data, void *userdata) { + Socket *s = ASSERT_PTR(SOCKET(data)); _cleanup_free_ SocketPort *p = NULL; - SocketPort *tail; - Socket *s; int r; assert(filename); assert(lvalue); assert(rvalue); - assert(data); - - s = SOCKET(data); if (isempty(rvalue)) { /* An empty assignment removes all ports */ @@ -592,10 +625,15 @@ int config_parse_socket_listen( return 0; } - p = new0(SocketPort, 1); + p = new(SocketPort, 1); if (!p) return log_oom(); + *p 
= (SocketPort) { + .socket = s, + .fd = -EBADF, + }; + if (ltype != SOCKET_SOCKET) { _cleanup_free_ char *k = NULL; @@ -605,7 +643,11 @@ int config_parse_socket_listen( return 0; } - r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + PathSimplifyWarnFlags flags = PATH_CHECK_ABSOLUTE; + if (ltype != SOCKET_SPECIAL) + flags |= PATH_CHECK_NON_API_VFS; + + r = path_simplify_and_warn(k, flags, unit, filename, line, lvalue); if (r < 0) return 0; @@ -619,7 +661,7 @@ int config_parse_socket_listen( p->type = ltype; } else if (streq(lvalue, "ListenNetlink")) { - _cleanup_free_ char *k = NULL; + _cleanup_free_ char *k = NULL; r = unit_path_printf(UNIT(s), rvalue, &k); if (r < 0) { @@ -644,7 +686,7 @@ int config_parse_socket_listen( return 0; } - if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */ + if (path_is_absolute(k)) { /* Only for AF_UNIX file system sockets… */ r = patch_var_run(unit, filename, line, lvalue, &k); if (r < 0) return r; @@ -674,16 +716,7 @@ int config_parse_socket_listen( p->type = SOCKET_SOCKET; } - p->fd = -EBADF; - p->auxiliary_fds = NULL; - p->n_auxiliary_fds = 0; - p->socket = s; - - tail = LIST_FIND_TAIL(port, s->ports); - LIST_INSERT_AFTER(port, s->ports, tail, p); - - p = NULL; - + LIST_APPEND(port, s->ports, TAKE_PTR(p)); return 0; } @@ -858,9 +891,7 @@ int config_parse_exec( void *userdata) { ExecCommand **e = ASSERT_PTR(data); - const Unit *u = userdata; - const char *p; - bool semicolon; + const Unit *u = ASSERT_PTR(userdata); int r; assert(filename); @@ -875,15 +906,11 @@ int config_parse_exec( return 0; } - p = rvalue; + const char *p = rvalue; + bool semicolon; + do { _cleanup_free_ char *path = NULL, *firstword = NULL; - ExecCommandFlags flags = 0; - bool ignore = false, separate_argv0 = false; - _cleanup_free_ ExecCommand *nce = NULL; - _cleanup_strv_free_ char **n = NULL; - size_t nlen = 0; - const char *f; semicolon = false; @@ -897,25 +924,30 @@ int config_parse_exec( continue; } - f = 
firstword; - for (;;) { - /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't - * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of - * argv[0]; if it's prefixed with :, we will not do environment variable substitution; - * if it's prefixed with +, it will be run with full privileges and no sandboxing; if - * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if - * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient - * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most - * other sandboxing, with some special exceptions for changing UID. + const char *f = firstword; + bool ignore, separate_argv0 = false; + ExecCommandFlags flags = 0; + + for (;; f++) { + /* We accept an absolute path as first argument. Valid prefixes and their effect: + * + * "-": Ignore if the path doesn't exist + * "@": Allow overriding argv[0] (supplied as a separate argument) + * ":": Disable environment variable substitution + * "+": Run with full privileges and no sandboxing + * "!": Apply sandboxing except for user/group credentials + * "!!": Apply user/group credentials if the kernel supports ambient capabilities - + * if it doesn't we don't apply the credentials themselves, but do apply + * most other sandboxing, with some special exceptions for changing UID. * - * The idea is that '!!' may be used to write services that can take benefit of systemd's - * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to - * privilege dropping within the daemon if the kernel does not offer that. */ + * The idea is that '!!' may be used to write services that can take benefit of + * systemd's UID/GID dropping if the kernel supports ambient creds, but provide + * an automatic fallback to privilege dropping within the daemon if the kernel + * does not offer that. 
*/ - if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) { + if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) flags |= EXEC_COMMAND_IGNORE_FAILURE; - ignore = true; - } else if (*f == '@' && !separate_argv0) + else if (*f == '@' && !separate_argv0) separate_argv0 = true; else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND)) flags |= EXEC_COMMAND_NO_ENV_EXPAND; @@ -928,9 +960,10 @@ int config_parse_exec( flags |= EXEC_COMMAND_AMBIENT_MAGIC; } else break; - f++; } + ignore = FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE); + r = unit_path_printf(u, f, &path); if (r < 0) { log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, @@ -940,19 +973,18 @@ int config_parse_exec( } if (isempty(path)) { - /* First word is either "-" or "@" with no command. */ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, - "Empty path in command line%s: '%s'", + "Empty path in command line%s: %s", ignore ? ", ignoring" : "", rvalue); return ignore ? 0 : -ENOEXEC; } if (!string_is_safe(path)) { log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, - "Executable name contains special characters%s: %s", + "Executable path contains special characters%s: %s", ignore ? ", ignoring" : "", path); return ignore ? 0 : -ENOEXEC; } - if (endswith(path, "/")) { + if (path_implies_directory(path)) { log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, "Executable path specifies a directory%s: %s", ignore ? ", ignoring" : "", path); @@ -966,92 +998,71 @@ int config_parse_exec( return ignore ? 
0 : -ENOEXEC; } - if (!separate_argv0) { - char *w = NULL; - - if (!GREEDY_REALLOC0(n, nlen + 2)) - return log_oom(); + _cleanup_strv_free_ char **args = NULL; - w = strdup(path); - if (!w) + if (!separate_argv0) + if (strv_extend(&args, path) < 0) return log_oom(); - n[nlen++] = w; - n[nlen] = NULL; - } - - path_simplify(path); while (!isempty(p)) { _cleanup_free_ char *word = NULL, *resolved = NULL; - /* Check explicitly for an unquoted semicolon as - * command separator token. */ + /* Check explicitly for an unquoted semicolon as command separator token. */ if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) { p++; - p += strspn(p, WHITESPACE); + p = skip_leading_chars(p, /* bad = */ NULL); semicolon = true; break; } /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc. - * extract_first_word() would return the same for all of those. */ + * extract_first_word() would return the same for all of those. */ if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) { - char *w; - p += 2; - p += strspn(p, WHITESPACE); + p = skip_leading_chars(p, /* bad = */ NULL); - if (!GREEDY_REALLOC0(n, nlen + 2)) + if (strv_extend(&args, ";") < 0) return log_oom(); - w = strdup(";"); - if (!w) - return log_oom(); - n[nlen++] = w; - n[nlen] = NULL; continue; } r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue); - if (r == 0) - break; if (r < 0) return ignore ? 0 : -ENOEXEC; + if (r == 0) + break; r = unit_full_printf(u, word, &resolved); if (r < 0) { log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r, - "Failed to resolve unit specifiers in %s%s: %m", + "Failed to resolve unit specifiers in '%s'%s: %m", word, ignore ? ", ignoring" : ""); return ignore ? 
0 : -ENOEXEC; } - if (!GREEDY_REALLOC(n, nlen + 2)) + if (strv_consume(&args, TAKE_PTR(resolved)) < 0) return log_oom(); - - n[nlen++] = TAKE_PTR(resolved); - n[nlen] = NULL; } - if (!n || !n[0]) { + if (strv_isempty(args)) { log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0, "Empty executable name or zeroeth argument%s: %s", ignore ? ", ignoring" : "", rvalue); return ignore ? 0 : -ENOEXEC; } - nce = new0(ExecCommand, 1); - if (!nce) + ExecCommand *nec = new(ExecCommand, 1); + if (!nec) return log_oom(); - nce->argv = TAKE_PTR(n); - nce->path = TAKE_PTR(path); - nce->flags = flags; - - exec_command_append_list(e, nce); + *nec = (ExecCommand) { + .path = path_simplify(TAKE_PTR(path)), + .argv = TAKE_PTR(args), + .flags = flags, + }; - /* Do not _cleanup_free_ these. */ - nce = NULL; + exec_command_append_list(e, nec); rvalue = p; } while (semicolon); @@ -1254,7 +1265,7 @@ int config_parse_exec_input_data( return 0; } - r = unbase64mem(rvalue, SIZE_MAX, &p, &sz); + r = unbase64mem(rvalue, &p, &sz); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode base64 data, ignoring: %s", rvalue); @@ -1520,43 +1531,6 @@ int config_parse_exec_cpu_sched_policy(const char *unit, return 0; } -int config_parse_exec_mount_apivfs(const char *unit, - const char *filename, - unsigned line, - const char *section, - unsigned section_line, - const char *lvalue, - int ltype, - const char *rvalue, - void *data, - void *userdata) { - - ExecContext *c = ASSERT_PTR(data); - int k; - - assert(filename); - assert(lvalue); - assert(rvalue); - - if (isempty(rvalue)) { - c->mount_apivfs_set = false; - c->mount_apivfs = false; - return 0; - } - - k = parse_boolean(rvalue); - if (k < 0) { - log_syntax(unit, LOG_WARNING, filename, line, k, - "Failed to parse boolean value, ignoring: %s", - rvalue); - return 0; - } - - c->mount_apivfs_set = true; - c->mount_apivfs = k; - return 0; -} - int config_parse_numa_mask(const char *unit, const char *filename, 
unsigned line, @@ -1748,7 +1722,7 @@ int config_parse_exec_root_hash( } /* We have a roothash to decode, eg: RootHash=012345789abcdef */ - r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size); + r = unhexmem(rvalue, &roothash_decoded, &roothash_decoded_size); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue); return 0; @@ -1816,7 +1790,7 @@ int config_parse_exec_root_hash_sig( } /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */ - r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size); + r = unbase64mem(value, &roothash_sig_decoded, &roothash_sig_decoded_size); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue); return 0; @@ -2634,6 +2608,7 @@ int config_parse_working_directory( assert(rvalue); if (isempty(rvalue)) { + c->working_directory_missing_ok = false; c->working_directory_home = false; c->working_directory = mfree(c->working_directory); return 0; @@ -2659,7 +2634,7 @@ int config_parse_working_directory( return missing_ok ? 0 : -ENOEXEC; } - r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue); + r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS|(missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue); if (r < 0) return missing_ok ? 
0 : -ENOEXEC; @@ -2697,7 +2672,7 @@ int config_parse_unit_env_file(const char *unit, return 0; } - r = unit_full_printf_full(u, rvalue, PATH_MAX, &n); + r = unit_path_printf(u, rvalue, &n); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); return 0; @@ -3152,7 +3127,7 @@ int config_parse_unit_condition_string( return 0; } -int config_parse_unit_requires_mounts_for( +int config_parse_unit_mounts_for( const char *unit, const char *filename, unsigned line, @@ -3171,6 +3146,7 @@ int config_parse_unit_requires_mounts_for( assert(lvalue); assert(rvalue); assert(data); + assert(STR_IN_SET(lvalue, "RequiresMountsFor", "WantsMountsFor")); for (const char *p = rvalue;;) { _cleanup_free_ char *word = NULL, *resolved = NULL; @@ -3196,9 +3172,9 @@ int config_parse_unit_requires_mounts_for( if (r < 0) continue; - r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE, unit_mount_dependency_type_from_string(lvalue)); if (r < 0) { - log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved); + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add requested mount '%s', ignoring: %m", resolved); continue; } } @@ -3695,7 +3671,7 @@ int config_parse_restrict_filesystems( break; } - r = lsm_bpf_parse_filesystem( + r = bpf_restrict_fs_parse_filesystem( word, &c->restrict_filesystems, FILESYSTEM_PARSE_LOG| @@ -4693,7 +4669,7 @@ int config_parse_exec_directories( _cleanup_free_ char *src = NULL, *dest = NULL; const char *q = tuple; - r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL); + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest); if (r == -ENOMEM) return log_oom(); if (r <= 0) { @@ -4908,11 +4884,8 @@ int config_parse_load_credential( void *data, void *userdata) { - _cleanup_free_ char 
*word = NULL, *k = NULL, *q = NULL; ExecContext *context = ASSERT_PTR(data); - bool encrypted = ltype; - Unit *u = userdata; - const char *p; + const Unit *u = ASSERT_PTR(userdata); int r; assert(filename); @@ -4925,7 +4898,10 @@ int config_parse_load_credential( return 0; } - p = rvalue; + _cleanup_free_ char *word = NULL, *id = NULL, *path = NULL; + const char *p = rvalue; + bool encrypted = ltype; + r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS); if (r == -ENOMEM) return log_oom(); @@ -4934,35 +4910,35 @@ int config_parse_load_credential( return 0; } - r = unit_cred_printf(u, word, &k); + r = unit_cred_printf(u, word, &id); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word); return 0; } - if (!credential_name_valid(k)) { - log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k); + if (!credential_name_valid(id)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", id); return 0; } if (isempty(p)) { /* If only one field is specified take it as shortcut for inheriting a credential named * the same way from our parent */ - q = strdup(k); - if (!q) + path = strdup(id); + if (!path) return log_oom(); } else { - r = unit_path_printf(u, p, &q); + r = unit_path_printf(u, p, &path); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p); return 0; } - if (path_is_absolute(q) ? !path_is_normalized(q) : !credential_name_valid(q)) { - log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", q); + if (path_is_absolute(path) ? 
!path_is_normalized(path) : !credential_name_valid(path)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", path); return 0; } } - r = hashmap_put_credential(&context->load_credentials, k, q, encrypted); + r = hashmap_put_credential(&context->load_credentials, id, path, encrypted); if (r < 0) return log_error_errno(r, "Failed to store load credential '%s': %m", rvalue); @@ -5236,7 +5212,7 @@ int config_parse_bind_paths( void *userdata) { ExecContext *c = ASSERT_PTR(data); - const Unit *u = userdata; + const Unit *u = ASSERT_PTR(userdata); int r; assert(filename); @@ -5267,7 +5243,7 @@ int config_parse_bind_paths( if (r == 0) break; - r = unit_full_printf_full(u, source, PATH_MAX, &sresolved); + r = unit_path_printf(u, source, &sresolved); if (r < 0) { log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source); @@ -5396,7 +5372,7 @@ int config_parse_mount_images( return 0; q = tuple; - r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL); + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second); if (r == -ENOMEM) return log_oom(); if (r < 0) { @@ -5420,7 +5396,7 @@ int config_parse_mount_images( continue; } - r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue); if (r < 0) continue; @@ -5436,7 +5412,7 @@ int config_parse_mount_images( continue; } - r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue); if (r < 0) continue; @@ -5445,7 +5421,7 @@ int config_parse_mount_images( MountOptions *o = NULL; PartitionDesignator partition_designator; - r = extract_many_words(&q, ":", 
EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options); if (r == -ENOMEM) return log_oom(); if (r < 0) { @@ -5578,7 +5554,7 @@ int config_parse_extension_images( continue; } - r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue); if (r < 0) continue; @@ -5587,7 +5563,7 @@ int config_parse_extension_images( MountOptions *o = NULL; PartitionDesignator partition_designator; - r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL); + r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options); if (r == -ENOMEM) return log_oom(); if (r < 0) { @@ -5799,7 +5775,7 @@ int config_parse_pid_file( return log_oom(); /* Check that the result is a sensible path */ - r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE|PATH_CHECK_NON_API_VFS, unit, filename, line, lvalue); if (r < 0) return r; @@ -6095,7 +6071,7 @@ int config_parse_restrict_network_interfaces( break; } - if (!ifname_valid(word)) { + if (!ifname_valid_full(word, IFNAME_VALID_ALTERNATIVE)) { log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", word); continue; } @@ -6112,6 +6088,47 @@ int config_parse_restrict_network_interfaces( return 0; } +int config_parse_mount_node( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + const Unit *u = ASSERT_PTR(userdata); + _cleanup_free_ char *resolved = NULL, *path = NULL; + int r; + + assert(filename); + assert(lvalue); + 
assert(rvalue); + + r = unit_full_printf(u, rvalue, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + path = fstab_node_to_udev_node(resolved); + if (!path) + return log_oom(); + + /* The source passed is not necessarily something we understand, and we pass it as-is to mount/swapon, + * so path_is_valid is not used. But let's check for basic sanity, i.e. if the source is longer than + * PATH_MAX, you're likely doing something wrong. */ + if (strlen(path) >= PATH_MAX) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Resolved mount path '%s' too long, ignoring.", path); + return 0; + } + + return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, path, data, userdata); +} + static int merge_by_names(Unit *u, Set *names, const char *id) { char *k; int r; @@ -6316,8 +6333,7 @@ void unit_dump_config_items(FILE *f) { { config_parse_nsec, "NANOSECONDS" }, { config_parse_namespace_path_strv, "PATH [...]" }, { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" }, - { config_parse_unit_requires_mounts_for, - "PATH [...]" }, + { config_parse_unit_mounts_for, "PATH [...]" }, { config_parse_exec_mount_propagation_flag, "MOUNTFLAG" }, { config_parse_unit_string_printf, "STRING" }, @@ -6365,6 +6381,7 @@ void unit_dump_config_items(FILE *f) { { config_parse_job_mode_isolate, "BOOLEAN" }, { config_parse_personality, "PERSONALITY" }, { config_parse_log_filter_patterns, "REGEX" }, + { config_parse_mount_node, "NODE" }, }; const char *prev = NULL; diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 6919805..005b915 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -23,6 +23,7 @@ void unit_dump_config_items(FILE *f); CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps); CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps); CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf); 
+CONFIG_PARSER_PROTOTYPE(config_parse_reboot_parameter); CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf); CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf); CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths); @@ -71,7 +72,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string); CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode); CONFIG_PARSER_PROTOTYPE(config_parse_notify_access); CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action); -CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for); +CONFIG_PARSER_PROTOTYPE(config_parse_unit_mounts_for); CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter); CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs); CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno); @@ -159,6 +160,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns); CONFIG_PARSER_PROTOTYPE(config_parse_open_file); CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch); CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set); +CONFIG_PARSER_PROTOTYPE(config_parse_mount_node); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/core/main.c b/src/core/main.c index 1ed968d..4b8a315 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -21,7 +21,7 @@ #include "architecture.h" #include "argv-util.h" #if HAVE_LIBBPF -#include "bpf-lsm.h" +#include "bpf-restrict-fs.h" #endif #include "build.h" #include "bus-error.h" @@ -68,6 +68,7 @@ #include "manager-serialize.h" #include "mkdir-label.h" #include "mount-setup.h" +#include "mount-util.h" #include "os-util.h" #include "pager.h" #include "parse-argument.h" @@ -87,6 +88,7 @@ #include "special.h" #include "stat-util.h" #include "stdio-util.h" +#include "string-table.h" #include "strv.h" #include "switch-root.h" #include "sysctl-util.h" @@ -121,7 +123,7 @@ static RuntimeScope arg_runtime_scope; bool arg_dump_core; int arg_crash_chvt; bool arg_crash_shell; -bool arg_crash_reboot; +CrashAction 
arg_crash_action; static char *arg_confirm_spawn; static ShowStatus arg_show_status; static StatusUnitFormat arg_status_unit_format; @@ -140,6 +142,7 @@ static char **arg_default_environment; static char **arg_manager_environment; static uint64_t arg_capability_bounding_set; static bool arg_no_new_privs; +static int arg_protect_system; static nsec_t arg_timer_slack_nsec; static Set* arg_syscall_archs; static FILE* arg_serialization; @@ -159,6 +162,16 @@ static char **saved_env = NULL; static int parse_configuration(const struct rlimit *saved_rlimit_nofile, const struct rlimit *saved_rlimit_memlock); +static const char* const crash_action_table[_CRASH_ACTION_MAX] = { + [CRASH_FREEZE] = "freeze", + [CRASH_REBOOT] = "reboot", + [CRASH_POWEROFF] = "poweroff", +}; + +DEFINE_STRING_TABLE_LOOKUP(crash_action, CrashAction); + +static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_crash_action, crash_action, CrashAction, CRASH_FREEZE, "Invalid crash action"); + static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) { _cleanup_free_ char *base = NULL; _cleanup_strv_free_ char **files = NULL, **dirs = NULL; @@ -206,13 +219,17 @@ static int console_setup(void) { r = proc_cmdline_tty_size("/dev/console", &rows, &cols); if (r < 0) - log_warning_errno(r, "Failed to get terminal size, ignoring: %m"); + log_warning_errno(r, "Failed to get /dev/console size, ignoring: %m"); else { r = terminal_set_size_fd(tty_fd, NULL, rows, cols); if (r < 0) - log_warning_errno(r, "Failed to set terminal size, ignoring: %m"); + log_warning_errno(r, "Failed to set /dev/console size, ignoring: %m"); } + r = terminal_reset_ansi_seq(tty_fd); + if (r < 0) + log_warning_errno(r, "Failed to reset /dev/console using ANSI sequences, ignoring: %m"); + return 0; } @@ -273,7 +290,18 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat if (r < 0) log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value); else - arg_crash_reboot 
= r; + arg_crash_action = r ? CRASH_REBOOT : CRASH_FREEZE; + + } else if (proc_cmdline_key_streq(key, "systemd.crash_action")) { + + if (proc_cmdline_value_missing(key, value)) + return 0; + + r = crash_action_from_string(value); + if (r < 0) + log_warning_errno(r, "Failed to parse crash action switch %s, ignoring: %m", value); + else + arg_crash_action = r; } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) { char *s; @@ -462,7 +490,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat if (proc_cmdline_value_missing(key, value)) return 0; - r = unbase64mem(value, SIZE_MAX, &p, &sz); + r = unbase64mem(value, &p, &sz); if (r < 0) log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value); @@ -610,6 +638,73 @@ static int config_parse_oom_score_adjust( return 0; } +static int config_parse_protect_system_pid1( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *v = ASSERT_PTR(data), r; + + /* This is modelled after the per-service ProtectSystem= setting, but a bit more restricted on one + * hand, and more automatic in another. i.e. we currently only support yes/no (not "strict" or + * "full"). And we will enable this automatically for the initrd unless configured otherwise. + * + * We might extend this later to match more closely what the per-service ProtectSystem= can do, but + * this is not trivial, due to ordering constraints: besides /usr/ we don't really have much mounted + * at the moment we enable this logic. 
*/ + + if (isempty(rvalue) || streq(rvalue, "auto")) { + *v = -1; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse ProtectSystem= argument '%s', ignoring: %m", rvalue); + return 0; + } + + *v = r; + return 0; +} + +static int config_parse_crash_reboot( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CrashAction *v = ASSERT_PTR(data); + int r; + + if (isempty(rvalue)) { + *v = CRASH_REBOOT; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashReboot= argument '%s', ignoring: %m", rvalue); + return 0; + } + + *v = r > 0 ? CRASH_REBOOT : CRASH_FREEZE; + return 0; +} + static int parse_config_file(void) { const ConfigTableItem items[] = { { "Manager", "LogLevel", config_parse_level2, 0, NULL }, @@ -621,7 +716,8 @@ static int parse_config_file(void) { { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt }, { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt }, { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell }, - { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot }, + { "Manager", "CrashReboot", config_parse_crash_reboot, 0, &arg_crash_action }, + { "Manager", "CrashAction", config_parse_crash_action, 0, &arg_crash_action }, { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status }, { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format }, { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity }, @@ -637,6 +733,7 @@ static int parse_config_file(void) { { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor }, { "Manager", 
"CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs }, + { "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system }, #if HAVE_SECCOMP { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs }, #else @@ -696,11 +793,12 @@ static int parse_config_file(void) { }; if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) - (void) config_parse_config_file("system.conf", - "Manager\0", - config_item_table_lookup, items, - CONFIG_PARSE_WARN, - NULL); + (void) config_parse_standard_file_with_dropins( + "systemd/system.conf", + "Manager\0", + config_item_table_lookup, items, + CONFIG_PARSE_WARN, + /* userdata= */ NULL); else { _cleanup_strv_free_ char **files = NULL, **dirs = NULL; int r; @@ -769,8 +867,8 @@ static void set_manager_settings(Manager *m) { m->cad_burst_action = arg_cad_burst_action; /* Note that we don't do structured initialization here, otherwise it will reset the rate limit * counter on every daemon-reload. */ - m->reload_ratelimit.interval = arg_reload_limit_interval_sec; - m->reload_ratelimit.burst = arg_reload_limit_burst; + m->reload_reexec_ratelimit.interval = arg_reload_limit_interval_sec; + m->reload_reexec_ratelimit.burst = arg_reload_limit_burst; manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog); manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog); @@ -935,9 +1033,17 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_CRASH_REBOOT: - r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot); + r = parse_boolean_argument("--crash-reboot", optarg, NULL); if (r < 0) return r; + arg_crash_action = r > 0 ? 
CRASH_REBOOT : CRASH_FREEZE; + break; + + case ARG_CRASH_ACTION: + r = crash_action_from_string(optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse crash action \"%s\": %m", optarg); + arg_crash_action = r; break; case ARG_CONFIRM_SPAWN: @@ -1053,7 +1159,7 @@ static int help(void) { " --unit=UNIT Set default unit\n" " --dump-core[=BOOL] Dump core on crash\n" " --crash-vt=NR Change to specified VT on crash\n" - " --crash-reboot[=BOOL] Reboot on crash\n" + " --crash-action=ACTION Specify what to do on crash\n" " --crash-shell[=BOOL] Run shell on crash\n" " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n" " --show-status[=BOOL] Show status updates on the console during boot\n" @@ -1265,7 +1371,7 @@ static void test_usr(void) { log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. " "Some things will probably break (sometimes even silently) in mysterious ways. " - "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information."); + "Consult https://systemd.io/SEPARATE_USR_IS_BROKEN for more information."); } static int enforce_syscall_archs(Set *archs) { @@ -1277,7 +1383,7 @@ static int enforce_syscall_archs(Set *archs) { r = seccomp_restrict_archs(arg_syscall_archs); if (r < 0) - return log_error_errno(r, "Failed to enforce system call architecture restrication: %m"); + return log_error_errno(r, "Failed to enforce system call architecture restriction: %m"); #endif return 0; } @@ -1435,7 +1541,7 @@ static int fixup_environment(void) { return -errno; /* The kernels sets HOME=/ for init. Let's undo this. 
*/ - if (path_equal_ptr(getenv("HOME"), "/")) + if (path_equal(getenv("HOME"), "/")) assert_se(unsetenv("HOME") == 0); return 0; @@ -1467,32 +1573,37 @@ static int become_shutdown(int objective, int retval) { [MANAGER_KEXEC] = "kexec", }; - char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)], - timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")], + char timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")], exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)]; _cleanup_strv_free_ char **env_block = NULL; + _cleanup_free_ char *max_log_levels = NULL; usec_t watchdog_timer = 0; int r; assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX); assert(table[objective]); - xsprintf(log_level, "--log-level=%d", log_get_max_level()); xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec); - const char* command_line[10] = { + const char* command_line[11] = { SYSTEMD_SHUTDOWN_BINARY_PATH, table[objective], - log_level, timeout, /* Note that the last position is a terminator and must contain NULL. 
*/ }; - size_t pos = 4; + size_t pos = 3; assert(command_line[pos-1]); assert(!command_line[pos]); + (void) log_max_levels_to_string(log_get_max_level(), &max_log_levels); + + if (max_log_levels) { + command_line[pos++] = "--log-level"; + command_line[pos++] = max_log_levels; + } + switch (log_get_target()) { case LOG_TARGET_KMSG: @@ -1538,7 +1649,7 @@ static int become_shutdown(int objective, int retval) { (void) watchdog_setup_pretimeout(0); (void) watchdog_setup_pretimeout_governor(NULL); r = watchdog_setup(watchdog_timer); - watchdog_close(r < 0); + watchdog_close(/* disarm= */ r < 0); /* The environment block: */ @@ -1684,6 +1795,35 @@ static void initialize_core_pattern(bool skip_setup) { arg_early_core_pattern); } +static void apply_protect_system(bool skip_setup) { + int r; + + if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0) + return; + + if (arg_protect_system < 0 && !in_initrd()) { + log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping."); + return; + } + + r = make_mount_point("/usr"); + if (r < 0) { + log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m"); + return; + } + + if (mount_nofollow_verbose( + LOG_WARNING, + /* what= */ NULL, + "/usr", + /* fstype= */ NULL, + MS_BIND|MS_REMOUNT|MS_RDONLY, + /* options= */ NULL) < 0) + return; + + log_info("Successfully made /usr/ read-only."); +} + static void update_cpu_affinity(bool skip_setup) { _cleanup_free_ char *mask = NULL; @@ -1966,6 +2106,16 @@ static int invoke_main_loop( "MESSAGE_ID=" SD_MESSAGE_CORE_MAINLOOP_FAILED_STR); } + /* Ensure shutdown timestamp is taken even when bypassing the job engine */ + if (IN_SET(objective, + MANAGER_SOFT_REBOOT, + MANAGER_REBOOT, + MANAGER_KEXEC, + MANAGER_HALT, + MANAGER_POWEROFF) && + !dual_timestamp_is_set(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START)) + dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START); + switch (objective) { case MANAGER_RELOAD: { @@ -2133,9 +2283,9 @@ 
static void log_execution_mode(bool *ret_first_boot) { /* Let's check whether we are in first boot. First, check if an override was * specified on the kernel command line. If yes, we honour that. */ - r = proc_cmdline_get_bool("systemd.condition-first-boot", /* flags = */ 0, &first_boot); + r = proc_cmdline_get_bool("systemd.condition_first_boot", /* flags = */ 0, &first_boot); if (r < 0) - log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel command line argument, ignoring: %m"); + log_debug_errno(r, "Failed to parse systemd.condition_first_boot= kernel command line argument, ignoring: %m"); if (r > 0) log_full(first_boot ? LOG_INFO : LOG_DEBUG, @@ -2221,12 +2371,6 @@ static int initialize_runtime( install_crash_handler(); if (!skip_setup) { - r = mount_cgroup_controllers(); - if (r < 0) { - *ret_error_message = "Failed to mount cgroup hierarchies"; - return r; - } - /* Pull credentials from various sources into a common credential directory (we do * this here, before setting up the machine ID, so that we can use credential info * for setting up the machine ID) */ @@ -2493,7 +2637,7 @@ static void setenv_manager_environment(void) { r = putenv_dup(*p, true); if (r < 0) - log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p); + log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p); } } @@ -2507,7 +2651,7 @@ static void reset_arguments(void) { arg_dump_core = true; arg_crash_chvt = -1; arg_crash_shell = false; - arg_crash_reboot = false; + arg_crash_action = CRASH_FREEZE; arg_confirm_spawn = mfree(arg_confirm_spawn); arg_show_status = _SHOW_STATUS_INVALID; arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT; @@ -2531,6 +2675,7 @@ static void reset_arguments(void) { arg_capability_bounding_set = CAP_MASK_UNSET; arg_no_new_privs = false; + arg_protect_system = -1; arg_timer_slack_nsec = NSEC_INFINITY; arg_syscall_archs = set_free(arg_syscall_archs); @@ -2952,6 +3097,24 @@ int main(int argc, char *argv[]) { goto finish; } 
+ if (!skip_setup) { + /* Before we actually start deleting cgroup v1 code, make it harder to boot + * in cgroupv1 mode first. See also #30852. */ + + r = mount_cgroup_legacy_controllers(loaded_policy); + if (r < 0) { + if (r == -ERFKILL) + error_message = "Refusing to run under cgroup v1, SYSTEMD_CGROUP_ENABLE_LEGACY_FORCE=1 not specified on kernel command line"; + else + error_message = "Failed to mount cgroup v1 hierarchy"; + goto finish; + } + if (r > 0) { + log_full(LOG_CRIT, "Legacy cgroup v1 support selected. This is no longer supported. Will proceed anyway after 30s."); + (void) usleep_safe(30 * USEC_PER_SEC); + } + } + /* The efivarfs is now mounted, let's lock down the system token. */ lock_down_efi_variables(); @@ -3038,9 +3201,12 @@ int main(int argc, char *argv[]) { cmdline_take_random_seed(); } - /* A core pattern might have been specified via the cmdline. */ + /* A core pattern might have been specified via the cmdline. */ initialize_core_pattern(skip_setup); + /* Make /usr/ read-only */ + apply_protect_system(skip_setup); + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ log_close(); @@ -3196,7 +3362,8 @@ finish: #endif if (r < 0) - (void) sd_notifyf(0, "ERRNO=%i", -r); + (void) sd_notifyf(/* unset_environment= */ false, + "ERRNO=%i", -r); /* Try to invoke the shutdown binary unless we already failed. * If we failed above, we want to freeze after finishing cleanup. */ @@ -3209,7 +3376,8 @@ finish: /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with * a mechanism to pick up systemd's exit status in the VM. 
*/ - (void) sd_notifyf(0, "EXIT_STATUS=%i", retval); + (void) sd_notifyf(/* unset_environment= */ false, + "EXIT_STATUS=%i", retval); watchdog_free_device(); arg_watchdog_device = mfree(arg_watchdog_device); diff --git a/src/core/main.h b/src/core/main.h index b12a1cc..1949a08 100644 --- a/src/core/main.h +++ b/src/core/main.h @@ -1,9 +1,21 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once +#include #include +typedef enum CrashAction { + CRASH_FREEZE, + CRASH_REBOOT, + CRASH_POWEROFF, + _CRASH_ACTION_MAX, + _CRASH_ACTION_INVALID = -EINVAL, +} CrashAction; + +const char* crash_action_to_string(CrashAction action); +CrashAction crash_action_from_string(const char *action); + extern bool arg_dump_core; extern int arg_crash_chvt; extern bool arg_crash_shell; -extern bool arg_crash_reboot; +extern CrashAction arg_crash_action; diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c index 6c32d78..a12d50c 100644 --- a/src/core/manager-dump.c +++ b/src/core/manager-dump.c @@ -64,7 +64,7 @@ static void manager_dump_header(Manager *m, FILE *f, const char *prefix) { * stable between versions. We take the liberty to restructure it entirely between versions and * add/remove fields at will. 
*/ - fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", strempty(prefix)); + fprintf(f, "%sManager: systemd " PROJECT_VERSION_FULL " (" GIT_VERSION ")\n", strempty(prefix)); fprintf(f, "%sFeatures: %s\n", strempty(prefix), systemd_features); for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c index 1ac2636..b4af82b 100644 --- a/src/core/manager-serialize.c +++ b/src/core/manager-serialize.c @@ -23,11 +23,12 @@ int manager_open_serialization(Manager *m, FILE **ret_f) { return open_serialization_file("systemd-state", ret_f); } -static bool manager_timestamp_shall_serialize(ManagerTimestamp t) { - if (!in_initrd()) +static bool manager_timestamp_shall_serialize(ManagerObjective o, ManagerTimestamp t) { + if (!in_initrd() && o != MANAGER_SOFT_REBOOT) return true; - /* The following timestamps only apply to the host system, hence only serialize them there */ + /* The following timestamps only apply to the host system (or first boot in case of soft-reboot), + * hence only serialize them there. 
*/ return !IN_SET(t, MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH, MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH, @@ -108,10 +109,13 @@ int manager_serialize( (void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]); (void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden); + (void) serialize_item(f, "previous-objective", manager_objective_to_string(m->objective)); + (void) serialize_item_format(f, "soft-reboots-count", "%u", m->soft_reboots_count); + for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { _cleanup_free_ char *joined = NULL; - if (!manager_timestamp_shall_serialize(q)) + if (!manager_timestamp_shall_serialize(m->objective, q)) continue; joined = strjoin(manager_timestamp_to_string(q), "-timestamp"); @@ -139,21 +143,19 @@ int manager_serialize( } if (m->user_lookup_fds[0] >= 0) { - int copy0, copy1; - - copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]); - if (copy0 < 0) - return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m"); - - copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]); - if (copy1 < 0) - return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m"); + r = serialize_fd_many(f, fds, "user-lookup", m->user_lookup_fds, 2); + if (r < 0) + return r; + } - (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1); + if (m->handoff_timestamp_fds[0] >= 0) { + r = serialize_fd_many(f, fds, "handoff-timestamp-fds", m->handoff_timestamp_fds, 2); + if (r < 0) + return r; } (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit); - (void) serialize_ratelimit(f, "reload-ratelimit", &m->reload_ratelimit); + (void) serialize_ratelimit(f, "reload-reexec-ratelimit", &m->reload_reexec_ratelimit); bus_track_serialize(m->subscribed, f, "subscribed"); @@ -443,10 +445,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { if (r < 0) return 
r; - } else if (startswith(l, "env=")) { - r = deserialize_environment(l + 4, &m->client_environment); + } else if ((val = startswith(l, "env="))) { + r = deserialize_environment(val, &m->client_environment); if (r < 0) - log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l); + log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", val); } else if ((val = startswith(l, "notify-fd="))) { int fd; @@ -454,8 +456,7 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { fd = deserialize_fd(fds, val); if (fd >= 0) { m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source); - safe_close(m->notify_fd); - m->notify_fd = fd; + close_and_replace(m->notify_fd, fd); } } else if ((val = startswith(l, "notify-socket="))) { @@ -469,21 +470,26 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { fd = deserialize_fd(fds, val); if (fd >= 0) { m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source); - safe_close(m->cgroups_agent_fd); - m->cgroups_agent_fd = fd; + close_and_replace(m->cgroups_agent_fd, fd); } } else if ((val = startswith(l, "user-lookup="))) { - int fd0, fd1; - - if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1)) - log_notice("Failed to parse user lookup fd, ignoring: %s", val); - else { - m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source); - safe_close_pair(m->user_lookup_fds); - m->user_lookup_fds[0] = fdset_remove(fds, fd0); - m->user_lookup_fds[1] = fdset_remove(fds, fd1); - } + + m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source); + safe_close_pair(m->user_lookup_fds); + + r = deserialize_fd_many(fds, val, 2, m->user_lookup_fds); + if (r < 0) + log_warning_errno(r, "Failed to parse user-lookup fds: \"%s\", ignoring: %m", val); + + } else if ((val = startswith(l, 
"handoff-timestamp-fds="))) { + + m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source); + safe_close_pair(m->handoff_timestamp_fds); + + r = deserialize_fd_many(fds, val, 2, m->handoff_timestamp_fds); + if (r < 0) + log_warning_errno(r, "Failed to parse handoff-timestamp fds: \"%s\", ignoring: %m", val); } else if ((val = startswith(l, "dynamic-user="))) dynamic_user_deserialize_one(m, val, fds, NULL); @@ -495,8 +501,9 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { (void) exec_shared_runtime_deserialize_one(m, val, fds); else if ((val = startswith(l, "subscribed="))) { - if (strv_extend(&m->deserialized_subscribed, val) < 0) - return -ENOMEM; + r = strv_extend(&m->deserialized_subscribed, val); + if (r < 0) + return r; } else if ((val = startswith(l, "varlink-server-socket-address="))) { if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) { r = manager_varlink_init(m); @@ -516,9 +523,25 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { (void) varlink_server_deserialize_one(m->varlink_server, val, fds); } else if ((val = startswith(l, "dump-ratelimit="))) deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val); - else if ((val = startswith(l, "reload-ratelimit="))) - deserialize_ratelimit(&m->reload_ratelimit, "reload-ratelimit", val); - else { + else if ((val = startswith(l, "reload-reexec-ratelimit="))) + deserialize_ratelimit(&m->reload_reexec_ratelimit, "reload-reexec-ratelimit", val); + else if ((val = startswith(l, "soft-reboots-count="))) { + unsigned n; + + if (safe_atou(val, &n) < 0) + log_notice("Failed to parse soft reboots counter '%s', ignoring.", val); + else + m->soft_reboots_count = n; + } else if ((val = startswith(l, "previous-objective="))) { + ManagerObjective objective; + + objective = manager_objective_from_string(val); + if (objective < 0) + log_notice("Failed to parse previous objective '%s', ignoring.", val); + else + m->previous_objective = objective; + + } else 
{ ManagerTimestamp q; for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) { diff --git a/src/core/manager.c b/src/core/manager.c index 88eebfc..90e72b0 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -25,6 +24,7 @@ #include "alloc-util.h" #include "audit-fd.h" #include "boot-timestamps.h" +#include "build-path.h" #include "bus-common-errors.h" #include "bus-error.h" #include "bus-kernel.h" @@ -36,6 +36,7 @@ #include "constants.h" #include "core-varlink.h" #include "creds-util.h" +#include "daemon-util.h" #include "dbus-job.h" #include "dbus-manager.h" #include "dbus-unit.h" @@ -55,6 +56,7 @@ #include "inotify-util.h" #include "install.h" #include "io-util.h" +#include "iovec-util.h" #include "label-util.h" #include "load-fragment.h" #include "locale-setup.h" @@ -88,6 +90,7 @@ #include "strxcpyx.h" #include "sysctl-util.h" #include "syslog-util.h" +#include "taint.h" #include "terminal-util.h" #include "time-util.h" #include "transaction.h" @@ -122,6 +125,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); @@ -263,12 +267,11 @@ static void manager_print_jobs_in_progress(Manager *m) { strempty(status_text)); } - sd_notifyf(false, - "STATUS=%sUser job %s/%s running 
(%s / %s)...", - job_of_n, - ident, - job_type_to_string(j->type), - time, limit); + (void) sd_notifyf(/* unset_environment= */ false, + "STATUS=%sUser job %s/%s running (%s / %s)...", + job_of_n, + ident, job_type_to_string(j->type), + time, limit); m->status_ready = false; } @@ -397,7 +400,7 @@ static int manager_setup_time_change(Manager *m) { return log_error_errno(r, "Failed to create time change event source: %m"); /* Schedule this slightly earlier than the .timer event sources */ - r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1); + r = sd_event_source_set_priority(m->time_change_event_source, EVENT_PRIORITY_TIME_CHANGE); if (r < 0) return log_error_errno(r, "Failed to set priority of time change event sources: %m"); @@ -464,7 +467,7 @@ static int manager_setup_timezone_change(Manager *m) { return log_error_errno(r, "Failed to create timezone change event source: %m"); /* Schedule this slightly earlier than the .timer event sources */ - r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1); + r = sd_event_source_set_priority(new_event, EVENT_PRIORITY_TIME_ZONE); if (r < 0) return log_error_errno(r, "Failed to set priority of timezone change event sources: %m"); @@ -482,21 +485,19 @@ static int enable_special_signals(Manager *m) { if (MANAGER_IS_TEST_RUN(m)) return 0; - /* Enable that we get SIGINT on control-alt-del. In containers - * this will fail with EPERM (older) or EINVAL (newer), so - * ignore that. */ + /* Enable that we get SIGINT on control-alt-del. In containers this will fail with EPERM (older) or + * EINVAL (newer), so ignore that. 
*/ if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL)) - log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m"); + log_warning_errno(errno, "Failed to enable ctrl-alt-del handling, ignoring: %m"); fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC); - if (fd < 0) { - /* Support systems without virtual console */ - if (fd != -ENOENT) - log_warning_errno(errno, "Failed to open /dev/tty0: %m"); - } else { + if (fd < 0) + /* Support systems without virtual console (ENOENT) gracefully */ + log_full_errno(fd == -ENOENT ? LOG_DEBUG : LOG_WARNING, fd, "Failed to open /dev/tty0, ignoring: %m"); + else { /* Enable that we get SIGWINCH on kbrequest */ if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0) - log_warning_errno(errno, "Failed to enable kbrequest handling: %m"); + log_warning_errno(errno, "Failed to enable kbrequest handling, ignoring: %m"); } return 0; @@ -592,10 +593,21 @@ static int manager_setup_signals(Manager *m) { * notify processing can still figure out to which process/service a message belongs, before we reap the * process. Also, process this before handling cgroup notifications, so that we always collect child exit * status information before detecting that there's no process in a cgroup. */ - r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); + r = sd_event_source_set_priority(m->signal_event_source, EVENT_PRIORITY_SIGNALS); if (r < 0) return r; + /* Report to supervisor that we now process the above signals. We report this as level "2", to + * indicate that we support more than sysvinit's signals (of course, sysvinit never sent this + * message, but conceptually it makes sense to consider level "1" to be equivalent to sysvinit's + * signal handling). Also, by setting this to "2" people looking for this hopefully won't + * misunderstand this as a boolean concept. Signal level 2 shall refer to the signals PID 1 + * understands at the time of release of systemd v256, i.e. 
including basic SIGRTMIN+18 handling for + * memory pressure and stuff. When more signals are hooked up (or more SIGRTMIN+18 multiplex + * operations added, this level should be increased). */ + (void) sd_notify(/* unset_environment= */ false, + "X_SYSTEMD_SIGNALS_LEVEL=2"); + if (MANAGER_IS_SYSTEM(m)) return enable_special_signals(m); @@ -641,16 +653,13 @@ static char** sanitize_environment(char **l) { "TRIGGER_TIMER_REALTIME_USEC", "TRIGGER_UNIT", "WATCHDOG_PID", - "WATCHDOG_USEC", - NULL); + "WATCHDOG_USEC"); /* Let's order the environment alphabetically, just to make it pretty */ return strv_sort(l); } int manager_default_environment(Manager *m) { - int r; - assert(m); m->transient_environment = strv_free(m->transient_environment); @@ -661,21 +670,39 @@ int manager_default_environment(Manager *m) { * * The initial passed environment is untouched to keep /proc/self/environ valid; it is used * for tagging the init process inside containers. */ - m->transient_environment = strv_new("PATH=" DEFAULT_PATH); - if (!m->transient_environment) + char *path = strjoin("PATH=", default_PATH()); + if (!path) + return log_oom(); + + if (strv_consume(&m->transient_environment, path) < 0) return log_oom(); /* Import locale variables LC_*= from configuration */ (void) locale_setup(&m->transient_environment); } else { - /* The user manager passes its own environment along to its children, except for $PATH. */ + /* The user manager passes its own environment along to its children, except for $PATH and + * session envs. 
*/ + m->transient_environment = strv_copy(environ); if (!m->transient_environment) return log_oom(); - r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH); - if (r < 0) + char *path = strjoin("PATH=", default_user_PATH()); + if (!path) + return log_oom(); + + if (strv_env_replace_consume(&m->transient_environment, path) < 0) return log_oom(); + + /* Envvars set for our 'manager' class session are private and should not be propagated + * to children. Also it's likely that the graphical session will set these on their own. */ + strv_env_unset_many(m->transient_environment, + "XDG_SESSION_ID", + "XDG_SESSION_CLASS", + "XDG_SESSION_TYPE", + "XDG_SESSION_DESKTOP", + "XDG_SEAT", + "XDG_VTNR"); } sanitize_environment(m->transient_environment); @@ -689,18 +716,18 @@ static int manager_setup_prefix(Manager *m) { }; static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL }, - [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL }, - [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL }, - [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL }, + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL }, [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL }, }; static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = { - [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL }, - [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL }, - [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL }, - [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" }, + [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL }, + [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL }, + [EXEC_DIRECTORY_CACHE] = { 
SD_PATH_USER_STATE_CACHE, NULL }, + [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" }, [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL }, }; @@ -736,7 +763,7 @@ static int manager_setup_run_queue(Manager *m) { if (r < 0) return r; - r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE); + r = sd_event_source_set_priority(m->run_queue_event_source, EVENT_PRIORITY_RUN_QUEUE); if (r < 0) return r; @@ -759,7 +786,7 @@ static int manager_setup_sigchld_event_source(Manager *m) { if (r < 0) return r; - r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7); + r = sd_event_source_set_priority(m->sigchld_event_source, EVENT_PRIORITY_SIGCHLD); if (r < 0) return r; @@ -861,6 +888,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, *m = (Manager) { .runtime_scope = runtime_scope, .objective = _MANAGER_OBJECTIVE_INVALID, + .previous_objective = _MANAGER_OBJECTIVE_INVALID, .status_unit_format = STATUS_UNIT_FORMAT_DEFAULT, @@ -878,6 +906,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, .cgroups_agent_fd = -EBADF, .signal_fd = -EBADF, .user_lookup_fds = EBADF_PAIR, + .handoff_timestamp_fds = EBADF_PAIR, .private_listen_fd = -EBADF, .dev_autofs_fd = -EBADF, .cgroup_inotify_fd = -EBADF, @@ -992,8 +1021,8 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, return r; #if HAVE_LIBBPF - if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) { - r = lsm_bpf_setup(m); + if (MANAGER_IS_SYSTEM(m) && bpf_restrict_fs_supported(/* initialize = */ true)) { + r = bpf_restrict_fs_setup(m); if (r < 0) log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m"); } @@ -1013,42 +1042,19 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, if (r < 0 && r != -EEXIST) return r; + } - m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH); - if 
(m->executor_fd < 0) - return log_emergency_errno(errno, - "Failed to open executor binary '%s': %m", - SYSTEMD_EXECUTOR_BINARY_PATH); - } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) { - _cleanup_free_ char *self_exe = NULL, *executor_path = NULL; - _cleanup_close_ int self_dir_fd = -EBADF; - int level = LOG_DEBUG; - - /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the - * build directory. Fallback to working directory and then the installation path. */ - r = readlink_and_make_absolute("/proc/self/exe", &self_exe); - if (r < 0) - return r; - - self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_PATH|O_DIRECTORY, 0); - if (self_dir_fd < 0) - return self_dir_fd; - - m->executor_fd = RET_NERRNO(openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH)); - if (m->executor_fd == -ENOENT) - m->executor_fd = RET_NERRNO(openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH)); - if (m->executor_fd == -ENOENT) { - m->executor_fd = RET_NERRNO(open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH)); - level = LOG_WARNING; /* Tests should normally use local builds */ - } + if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) { + m->executor_fd = pin_callout_binary(SYSTEMD_EXECUTOR_BINARY_PATH); if (m->executor_fd < 0) - return m->executor_fd; + return log_debug_errno(m->executor_fd, "Failed to pin executor binary: %m"); + _cleanup_free_ char *executor_path = NULL; r = fd_get_path(m->executor_fd, &executor_path); if (r < 0) return r; - log_full(level, "Using systemd-executor binary from '%s'.", executor_path); + log_debug("Using systemd-executor binary from '%s'.", executor_path); } /* Note that we do not set up the notify fd here. We do that after deserialization, @@ -1113,7 +1119,7 @@ static int manager_setup_notify(Manager *m) { /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which * service an exit message belongs. 
*/ - r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8); + r = sd_event_source_set_priority(m->notify_event_source, EVENT_PRIORITY_NOTIFY); if (r < 0) return log_error_errno(r, "Failed to set priority of notify event source: %m"); @@ -1187,7 +1193,7 @@ static int manager_setup_cgroups_agent(Manager *m) { /* Process cgroups notifications early. Note that when the agent notification is received * we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than * that. Also see handling of cgroup inotify for the unified cgroup stuff. */ - r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9); + r = sd_event_source_set_priority(m->cgroups_agent_event_source, EVENT_PRIORITY_CGROUP_AGENT); if (r < 0) return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); @@ -1236,13 +1242,13 @@ static int manager_setup_user_lookup_fd(Manager *m) { if (!m->user_lookup_event_source) { r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m); if (r < 0) - return log_error_errno(errno, "Failed to allocate user lookup event source: %m"); + return log_error_errno(r, "Failed to allocate user lookup event source: %m"); /* Process even earlier than the notify event source, so that we always know first about valid UID/GID * resolutions */ - r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11); + r = sd_event_source_set_priority(m->user_lookup_event_source, EVENT_PRIORITY_USER_LOOKUP); if (r < 0) - return log_error_errno(errno, "Failed to set priority of user lookup event source: %m"); + return log_error_errno(r, "Failed to set priority of user lookup event source: %m"); (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup"); } @@ -1250,6 +1256,49 @@ static int manager_setup_user_lookup_fd(Manager *m) { return 0; } +static int 
manager_setup_handoff_timestamp_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing timestamps back when the executor processes we fork + * off invokes execve(), i.e. when we hand off control to our payload processes. */ + + if (m->handoff_timestamp_fds[0] < 0) { + m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source); + safe_close_pair(m->handoff_timestamp_fds); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->handoff_timestamp_fds) < 0) + return log_error_errno(errno, "Failed to allocate handoff timestamp socket: %m"); + + /* Make sure children never have to block */ + (void) fd_increase_rxbuf(m->handoff_timestamp_fds[0], NOTIFY_RCVBUF_SIZE); + + r = setsockopt_int(m->handoff_timestamp_fds[0], SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */ + r = fd_nonblock(m->handoff_timestamp_fds[0], true); + if (r < 0) + return log_error_errno(r, "Failed to make handoff timestamp socket O_NONBLOCK: %m"); + } + + if (!m->handoff_timestamp_event_source) { + r = sd_event_add_io(m->event, &m->handoff_timestamp_event_source, m->handoff_timestamp_fds[0], EPOLLIN, manager_dispatch_handoff_timestamp_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate handoff timestamp event source: %m"); + + r = sd_event_source_set_priority(m->handoff_timestamp_event_source, EVENT_PRIORITY_HANDOFF_TIMESTAMP); + if (r < 0) + return log_error_errno(r, "Failed to set priority of handoff timestamp event source: %m"); + + (void) sd_event_source_set_description(m->handoff_timestamp_event_source, "handoff-timestamp"); + } + + return 0; +} + static unsigned manager_dispatch_cleanup_queue(Manager *m) { Unit *u; unsigned n = 0; @@ -1664,12 +1713,14 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->jobs_in_progress_event_source); 
sd_event_source_unref(m->run_queue_event_source); sd_event_source_unref(m->user_lookup_event_source); + sd_event_source_unref(m->handoff_timestamp_event_source); sd_event_source_unref(m->memory_pressure_event_source); safe_close(m->signal_fd); safe_close(m->notify_fd); safe_close(m->cgroups_agent_fd); safe_close_pair(m->user_lookup_fds); + safe_close_pair(m->handoff_timestamp_fds); manager_close_ask_password(m); @@ -1679,7 +1730,7 @@ Manager* manager_free(Manager *m) { free(m->notify_socket); - lookup_paths_free(&m->lookup_paths); + lookup_paths_done(&m->lookup_paths); strv_free(m->transient_environment); strv_free(m->client_environment); @@ -1691,8 +1742,10 @@ Manager* manager_free(Manager *m) { unit_defaults_done(&m->defaults); - assert(hashmap_isempty(m->units_requiring_mounts_for)); - hashmap_free(m->units_requiring_mounts_for); + FOREACH_ARRAY(map, m->units_needing_mounts_for, _UNIT_MOUNT_DEPENDENCY_TYPE_MAX) { + assert(hashmap_isempty(*map)); + hashmap_free(*map); + } hashmap_free(m->uid_refs); hashmap_free(m->gid_refs); @@ -1708,7 +1761,7 @@ Manager* manager_free(Manager *m) { m->fw_ctx = fw_ctx_free(m->fw_ctx); #if BPF_FRAMEWORK - lsm_bpf_destroy(m->restrict_fs); + bpf_restrict_fs_destroy(m->restrict_fs); #endif safe_close(m->executor_fd); @@ -1802,7 +1855,7 @@ static void manager_distribute_fds(Manager *m, FDSet *fds) { HASHMAP_FOREACH(u, m->units) { - if (fdset_size(fds) <= 0) + if (fdset_isempty(fds)) break; if (!UNIT_VTABLE(u)->distribute_fds) @@ -1973,6 +2026,20 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo return log_error_errno(r, "Deserialization failed: %m"); } + if (m->previous_objective >= 0) { + if (IN_SET(m->previous_objective, MANAGER_REEXECUTE, MANAGER_SOFT_REBOOT, MANAGER_SWITCH_ROOT)) + log_debug("Launching as effect of a '%s' operation.", + manager_objective_to_string(m->previous_objective)); + else + log_warning("Got unexpected previous objective '%s', ignoring.", + 
manager_objective_to_string(m->previous_objective)); + } + + /* If we are in a new soft-reboot iteration bump the counter now before starting units, so + * that they can reliably read it. We get the previous objective from serialized state. */ + if (m->previous_objective == MANAGER_SOFT_REBOOT) + m->soft_reboots_count++; + /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass * some file descriptors to us pre-initialized. This enables socket-based activation of entire * containers. */ @@ -1994,6 +2061,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo /* This shouldn't fail, except if things are really broken. */ return r; + r = manager_setup_handoff_timestamp_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + /* Connect to the bus if we are good for it */ manager_setup_bus(m); @@ -2203,8 +2275,8 @@ static int manager_dispatch_target_deps_queue(Manager *m) { if (n_targets < 0) return n_targets; - for (int i = 0; i < n_targets; i++) { - r = unit_add_default_target_dependency(u, targets[i]); + FOREACH_ARRAY(i, targets, n_targets) { + r = unit_add_default_target_dependency(u, *i); if (r < 0) return r; } @@ -2303,7 +2375,7 @@ int manager_load_unit_prepare( Unit *unit = manager_get_unit(m, name); if (unit) { - /* The time-based cache allows to start new units without daemon-reload, + /* The time-based cache allows new units to be started without daemon-reload, * but if they are already referenced (because of dependencies or ordering) * then we have to force a load of the fragment. 
As an optimization, check * first if anything in the usual paths was modified since the last time @@ -2403,7 +2475,7 @@ void manager_clear_jobs(Manager *m) { job_finish_and_invalidate(j, JOB_CANCELED, false, false); } -void manager_unwatch_pidref(Manager *m, PidRef *pid) { +void manager_unwatch_pidref(Manager *m, const PidRef *pid) { assert(m); for (;;) { @@ -2586,22 +2658,70 @@ static void manager_invoke_notify_message( UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds); else if (DEBUG_LOGGING) { - _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL; + _cleanup_free_ char *joined = strv_join(tags, ", "); + char buf[CELLESCAPE_DEFAULT_LENGTH]; + + log_unit_debug(u, "Got notification message from unexpected unit type, ignoring: %s", + joined ? cellescape(buf, sizeof(buf), joined) : "(null)"); + } +} + +static int manager_get_units_for_pidref(Manager *m, const PidRef *pidref, Unit ***ret_units) { + /* Determine array of every unit that is interested in the specified process */ + + assert(m); + assert(pidref_is_set(pidref)); - buf = strv_join(tags, ", "); - if (buf) - x = ellipsize(buf, 20, 90); - if (x) - y = cescape(x); + Unit *u1, *u2, **array; + u1 = manager_get_unit_by_pidref_cgroup(m, pidref); + u2 = hashmap_get(m->watch_pids, pidref); + array = hashmap_get(m->watch_pids_more, pidref); + + size_t n = 0; + if (u1) + n++; + if (u2) + n++; + if (array) + for (size_t j = 0; array[j]; j++) + n++; + + assert(n <= INT_MAX); /* Make sure we can reasonably return the counter as "int" */ + + if (ret_units) { + _cleanup_free_ Unit **units = NULL; + + if (n > 0) { + units = new(Unit*, n + 1); + if (!units) + return -ENOMEM; + + /* We return a dense array, and put the "main" unit first, i.e. unit in whose cgroup + * the process currently is. Note that we do not bother with filtering duplicates + * here. 
*/ + + size_t i = 0; + if (u1) + units[i++] = u1; + if (u2) + units[i++] = u2; + if (array) + for (size_t j = 0; array[j]; j++) + units[i++] = array[j]; + assert(i == n); + + units[i] = NULL; /* end array in an extra NULL */ + } - log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y)); + *ret_units = TAKE_PTR(units); } + + return (int) n; } static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { - - _cleanup_fdset_free_ FDSet *fds = NULL; Manager *m = ASSERT_PTR(userdata); + _cleanup_fdset_free_ FDSet *fds = NULL; char buf[NOTIFY_BUFFER_MAX+1]; struct iovec iovec = { .iov_base = buf, @@ -2618,12 +2738,9 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t struct cmsghdr *cmsg; struct ucred *ucred = NULL; - _cleanup_free_ Unit **array_copy = NULL; _cleanup_strv_free_ char **tags = NULL; - Unit *u1, *u2, **array; int r, *fd_array = NULL; size_t n_fds = 0; - bool found = false; ssize_t n; assert(m->notify_fd == fd); @@ -2711,39 +2828,22 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t PidRef pidref = PIDREF_MAKE_FROM_PID(ucred->pid); /* Notify every unit that might be interested, which might be multiple. */ - u1 = manager_get_unit_by_pidref_cgroup(m, &pidref); - u2 = hashmap_get(m->watch_pids, &pidref); - array = hashmap_get(m->watch_pids_more, &pidref); - if (array) { - size_t k = 0; + _cleanup_free_ Unit **array = NULL; - while (array[k]) - k++; - - array_copy = newdup(Unit*, array, k+1); - if (!array_copy) - log_oom(); - } - /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle - * duplicate units make sure we only invoke each unit's handler once. 
*/ - if (u1) { - manager_invoke_notify_message(m, u1, ucred, tags, fds); - found = true; - } - if (u2) { - manager_invoke_notify_message(m, u2, ucred, tags, fds); - found = true; + int n_array = manager_get_units_for_pidref(m, &pidref, &array); + if (n_array < 0) { + log_warning_errno(n_array, "Failed to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid); + return 0; } - if (array_copy) - for (size_t i = 0; array_copy[i]; i++) { - manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds); - found = true; - } - - if (!found) - log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid); + if (n_array == 0) + log_debug("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid); + else + /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle + * duplicate units – making sure we only invoke each unit's handler once. */ + FOREACH_ARRAY(u, array, n_array) + manager_invoke_notify_message(m, *u, ucred, tags, fds); - if (fdset_size(fds) > 0) + if (!fdset_isempty(fds)) log_warning("Got extra auxiliary fds with notification message, closing them."); return 0; @@ -2792,10 +2892,7 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { goto turn_off; if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) { - _cleanup_free_ Unit **array_copy = NULL; _cleanup_free_ char *name = NULL; - Unit *u1, *u2, **array; - (void) pid_get_comm(si.si_pid, &name); log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)", @@ -2813,41 +2910,27 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { * pidfd here any more even if we wanted (since the process just exited). */ PidRef pidref = PIDREF_MAKE_FROM_PID(si.si_pid); - /* And now figure out the unit this belongs to, it might be multiple... 
*/ - u1 = manager_get_unit_by_pidref_cgroup(m, &pidref); - u2 = hashmap_get(m->watch_pids, &pidref); - array = hashmap_get(m->watch_pids_more, &pidref); - if (array) { - size_t n = 0; - - /* Count how many entries the array has */ - while (array[n]) - n++; - - /* Make a copy of the array so that we don't trip up on the array changing beneath us */ - array_copy = newdup(Unit*, array, n+1); - if (!array_copy) - log_oom(); - } - - /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but - * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for - * each iteration. */ - if (u1) { - /* We check for oom condition, in case we got SIGCHLD before the oom notification. - * We only do this for the cgroup the PID belonged to. */ - (void) unit_check_oom(u1); + /* And now figure out the units this belongs to, there might be multiple... */ + _cleanup_free_ Unit **array = NULL; + int n_array = manager_get_units_for_pidref(m, &pidref, &array); + if (n_array < 0) + log_warning_errno(n_array, "Failed to get units for process " PID_FMT ", ignoring: %m", si.si_pid); + else if (n_array == 0) + log_debug("Got SIGCHLD for process " PID_FMT " we weren't interested in, ignoring.", si.si_pid); + else { + /* We check for an OOM condition, in case we got SIGCHLD before the OOM notification. + * We only do this for the cgroup the PID belonged to, which, if known, is the first entry in the array. */ + (void) unit_check_oom(array[0]); /* We check if systemd-oomd performed a kill so that we log and notify appropriately */ - (void) unit_check_oomd_kill(u1); + (void) unit_check_oomd_kill(array[0]); - manager_invoke_sigchld_event(m, u1, &si); + /* Finally, execute them all. Note that the array might contain duplicates, but that's fine, + * manager_invoke_sigchld_event() will ensure we only invoke the handlers once for each + * iteration.
*/ + FOREACH_ARRAY(u, array, n_array) + manager_invoke_sigchld_event(m, *u, &si); } - if (u2) - manager_invoke_sigchld_event(m, u2, &si); - if (array_copy) - for (size_t i = 0; array_copy[i]; i++) - manager_invoke_sigchld_event(m, array_copy[i], &si); } /* And now, we actually reap the zombie. */ @@ -2878,8 +2961,8 @@ static void manager_start_special(Manager *m, const char *name, JobMode mode) { log_info("Activating special unit %s...", s); - sd_notifyf(false, - "STATUS=Activating special unit %s...", s); + (void) sd_notifyf(/* unset_environment= */ false, + "STATUS=Activating special unit %s...", s); m->status_ready = false; } @@ -2986,7 +3069,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t r = manager_get_dump_string(m, /* patterns= */ NULL, &dump); if (r < 0) { - log_warning_errno(errno, "Failed to acquire manager dump: %m"); + log_warning_errno(r, "Failed to acquire manager dump: %m"); break; } @@ -3008,9 +3091,9 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t const char *target; JobMode mode; } target_table[] = { - [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE }, - [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE }, - [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE }, + [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE }, + [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE }, + [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE }, [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY }, [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY }, [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY }, @@ -3077,7 +3160,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t r = manager_get_dump_jobs_string(m, /* patterns= */ NULL, " ", &dump_jobs); if (r < 0) { - log_warning_errno(errno, "Failed to acquire manager jobs dump: %m"); + log_warning_errno(r, "Failed to acquire manager jobs dump: %m"); break; } @@ -3371,16 +3454,18 @@ void manager_send_unit_audit(Manager *m, Unit *u, int 
type, bool success) { const char *msg; int audit_fd, r; + assert(m); + assert(u); + if (!MANAGER_IS_SYSTEM(m)) return; - audit_fd = get_audit_fd(); - if (audit_fd < 0) + /* Don't generate audit events if the service was already started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) return; - /* Don't generate audit events if the service was already - * started and we're just deserializing */ - if (MANAGER_IS_RELOADING(m)) + audit_fd = get_audit_fd(); + if (audit_fd < 0) return; r = unit_name_to_prefix_and_instance(u->id, &p); @@ -3399,21 +3484,22 @@ void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) { log_warning_errno(errno, "Failed to send audit message, ignoring: %m"); } #endif - } void manager_send_unit_plymouth(Manager *m, Unit *u) { _cleanup_free_ char *message = NULL; int c, r; - /* Don't generate plymouth events if the service was already - * started and we're just deserializing */ - if (MANAGER_IS_RELOADING(m)) - return; + assert(m); + assert(u); if (!MANAGER_IS_SYSTEM(m)) return; + /* Don't generate plymouth events if the service was already started and we're just deserializing */ + if (MANAGER_IS_RELOADING(m)) + return; + if (detect_container() > 0) return; @@ -3431,6 +3517,27 @@ void manager_send_unit_plymouth(Manager *m, Unit *u) { "Failed to communicate with plymouth: %m"); } +void manager_send_unit_supervisor(Manager *m, Unit *u, bool active) { + assert(m); + assert(u); + + /* Notify a "supervisor" process about our progress, i.e. a container manager, hypervisor, or + * surrounding service manager. */ + + if (MANAGER_IS_RELOADING(m)) + return; + + if (!UNIT_VTABLE(u)->notify_supervisor) + return; + + if (in_initrd()) /* Only send these once we left the initrd */ + return; + + (void) sd_notifyf(/* unset_environment= */ false, + active ? 
"X_SYSTEMD_UNIT_ACTIVE=%s" : "X_SYSTEMD_UNIT_INACTIVE=%s", + u->id); +} + usec_t manager_get_watchdog(Manager *m, WatchdogType t) { assert(m); @@ -3566,7 +3673,7 @@ int manager_reload(Manager *m) { manager_clear_jobs_and_units(m); lookup_paths_flush_generator(&m->lookup_paths); - lookup_paths_free(&m->lookup_paths); + lookup_paths_done(&m->lookup_paths); exec_shared_runtime_vacuum(m); dynamic_user_vacuum(m, false); m->uid_refs = hashmap_free(m->uid_refs); @@ -3601,6 +3708,7 @@ int manager_reload(Manager *m) { (void) manager_setup_notify(m); (void) manager_setup_cgroups_agent(m); (void) manager_setup_user_lookup_fd(m); + (void) manager_setup_handoff_timestamp_fd(m); /* Third, fire things up! */ manager_coldplug(m); @@ -3645,8 +3753,6 @@ bool manager_unit_inactive_or_pending(Manager *m, const char *name) { } static void log_taint_string(Manager *m) { - _cleanup_free_ char *taint = NULL; - assert(m); if (MANAGER_IS_USER(m) || m->taint_logged) @@ -3654,7 +3760,7 @@ static void log_taint_string(Manager *m) { m->taint_logged = true; /* only check for taint once */ - taint = manager_taint_string(m); + _cleanup_free_ char *taint = taint_string(); if (isempty(taint)) return; @@ -3670,7 +3776,19 @@ static void manager_notify_finished(Manager *m) { if (MANAGER_IS_TEST_RUN(m)) return; - if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) { + if (MANAGER_IS_SYSTEM(m) && m->soft_reboots_count > 0) { + /* The soft-reboot case, where we only report data for the last reboot */ + firmware_usec = loader_usec = initrd_usec = kernel_usec = 0; + total_usec = userspace_usec = usec_sub_unsigned(m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic, + m->timestamps[MANAGER_TIMESTAMP_SHUTDOWN_START].monotonic); + + log_struct(LOG_INFO, + "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR, + "USERSPACE_USEC="USEC_FMT, userspace_usec, + LOG_MESSAGE("Soft-reboot finished in %s, counter is now at %u.", + FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC), + m->soft_reboots_count)); + } else if 
(MANAGER_IS_SYSTEM(m) && detect_container() <= 0) { char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")] = {}; char *p = buf; @@ -3740,7 +3858,7 @@ static void manager_notify_finished(Manager *m) { log_taint_string(m); } -static void user_manager_send_ready(Manager *m) { +static void manager_send_ready_user_scope(Manager *m) { int r; assert(m); @@ -3749,7 +3867,7 @@ static void user_manager_send_ready(Manager *m) { if (!MANAGER_IS_USER(m) || m->ready_sent) return; - r = sd_notify(false, + r = sd_notify(/* unset_environment= */ false, "READY=1\n" "STATUS=Reached " SPECIAL_BASIC_TARGET "."); if (r < 0) @@ -3759,14 +3877,19 @@ static void user_manager_send_ready(Manager *m) { m->status_ready = false; } -static void manager_send_ready(Manager *m) { +static void manager_send_ready_system_scope(Manager *m) { int r; + assert(m); + + if (!MANAGER_IS_SYSTEM(m)) + return; + + /* Skip the notification if nothing changed. */ if (m->ready_sent && m->status_ready) - /* Skip the notification if nothing changed. 
*/ return; - r = sd_notify(false, + r = sd_notify(/* unset_environment= */ false, "READY=1\n" "STATUS=Ready."); if (r < 0) @@ -3790,7 +3913,7 @@ static void manager_check_basic_target(Manager *m) { return; /* For user managers, send out READY=1 as soon as we reach basic.target */ - user_manager_send_ready(m); + manager_send_ready_user_scope(m); /* Log the taint string as soon as we reach basic.target */ log_taint_string(m); @@ -3808,7 +3931,7 @@ void manager_check_finished(Manager *m) { manager_check_basic_target(m); - if (hashmap_size(m->jobs) > 0) { + if (!hashmap_isempty(m->jobs)) { if (m->jobs_in_progress_event_source) /* Ignore any failure, this is only for feedback */ (void) sd_event_source_set_time(m->jobs_in_progress_event_source, @@ -3821,7 +3944,7 @@ void manager_check_finished(Manager *m) { if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10) m->jobs = hashmap_free(m->jobs); - manager_send_ready(m); + manager_send_ready_system_scope(m); /* Notify Type=idle units that we are done now */ manager_close_idle_pipe(m); @@ -3851,9 +3974,7 @@ void manager_send_reloading(Manager *m) { assert(m); /* Let whoever invoked us know that we are now reloading */ - (void) sd_notifyf(/* unset= */ false, - "RELOADING=1\n" - "MONOTONIC_USEC=" USEC_FMT "\n", now(CLOCK_MONOTONIC)); + (void) notify_reloading_full(/* status = */ NULL); /* And ensure that we'll send READY=1 again as soon as we are ready again */ m->ready_sent = false; @@ -3878,8 +3999,8 @@ static int manager_run_environment_generators(Manager *m) { _cleanup_strv_free_ char **paths = NULL; void* args[] = { [STDOUT_GENERATE] = &tmp, - [STDOUT_COLLECT] = &tmp, - [STDOUT_CONSUME] = &m->transient_environment, + [STDOUT_COLLECT] = &tmp, + [STDOUT_CONSUME] = &m->transient_environment, }; int r; @@ -4040,7 +4161,7 @@ static int manager_run_generators(Manager *m) { /* On some systems /tmp/ doesn't exist, and on some other systems we cannot create it at all. 
Avoid * trying to mount a private tmpfs on it as there's no one size fits all. */ - if (is_dir("/tmp", /* follow= */ false) > 0) + if (is_dir("/tmp", /* follow= */ false) > 0 && !MANAGER_IS_TEST_RUN(m)) flags |= FORK_PRIVATE_TMP; r = safe_fork("(sd-gens)", flags, NULL); @@ -4373,7 +4494,7 @@ void manager_override_show_status(Manager *m, ShowStatus mode, const char *reaso set_show_status_marker(show_status_on(mode)); } -const char *manager_get_confirm_spawn(Manager *m) { +const char* manager_get_confirm_spawn(Manager *m) { static int last_errno = 0; struct stat st; int r; @@ -4478,14 +4599,15 @@ void manager_status_printf(Manager *m, StatusType type, const char *status, cons va_end(ap); } -Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) { +Set* manager_get_units_needing_mounts_for(Manager *m, const char *path, UnitMountDependencyType t) { assert(m); assert(path); + assert(t >= 0 && t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX); if (path_equal(path, "/")) path = ""; - return hashmap_get(m->units_requiring_mounts_for, path); + return hashmap_get(m->units_needing_mounts_for[t], path); } int manager_update_failed_units(Manager *m, Unit *u, bool failed) { @@ -4542,7 +4664,7 @@ ManagerState manager_state(Manager *m) { } /* Are there any failed units? 
If so, we are in degraded mode */ - if (set_size(m->failed_units) > 0) + if (!set_isempty(m->failed_units)) return MANAGER_DEGRADED; return MANAGER_RUNNING; @@ -4701,20 +4823,19 @@ static void manager_vacuum(Manager *m) { exec_shared_runtime_vacuum(m); } -int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { +static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { struct buffer { uid_t uid; gid_t gid; char unit_name[UNIT_NAME_MAX+1]; } _packed_ buffer; - Manager *m = userdata; + Manager *m = ASSERT_PTR(userdata); ssize_t l; size_t n; Unit *u; - assert_se(source); - assert_se(m); + assert(source); /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that @@ -4763,76 +4884,71 @@ int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t re return 0; } -static int short_uid_range(const char *path) { - _cleanup_(uid_range_freep) UidRange *p = NULL; - int r; - - assert(path); - - /* Taint systemd if we the UID range assigned to this environment doesn't at least cover 0…65534, - * i.e. from root to nobody. */ - - r = uid_range_load_userns(&p, path); - if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) - return false; - if (r < 0) - return log_debug_errno(r, "Failed to load %s: %m", path); - - return !uid_range_covers(p, 0, 65535); -} - -char* manager_taint_string(const Manager *m) { - /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at - * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the - * configuration phase. 
*/ - - assert(m); - - const char* stage[12] = {}; - size_t n = 0; - - _cleanup_free_ char *usrbin = NULL; - if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin")) - stage[n++] = "unmerged-usr"; +static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + usec_t ts[2] = {}; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + struct msghdr msghdr = { + .msg_iov = &IOVEC_MAKE(ts, sizeof(ts)), + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + ssize_t n; - if (access("/proc/cgroups", F_OK) < 0) - stage[n++] = "cgroups-missing"; + assert(source); - if (cg_all_unified() == 0) - stage[n++] = "cgroupsv1"; + n = recvmsg_safe(m->handoff_timestamp_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; /* Spurious wakeup, try again */ + if (n == -EXFULL) { + log_warning("Got message with truncated control, ignoring."); + return 0; + } + if (n < 0) + return log_error_errno(n, "Failed to receive handoff timestamp message: %m"); - if (clock_is_localtime(NULL) > 0) - stage[n++] = "local-hwclock"; + if (msghdr.msg_flags & MSG_TRUNC) { + log_warning("Got truncated handoff timestamp message, ignoring."); + return 0; + } + if (n != sizeof(ts)) { + log_warning("Got handoff timestamp message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(ts)); + return 0; + } - if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0) - stage[n++] = "support-ended"; + struct ucred *ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received notify message without valid credentials. 
Ignoring."); + return 0; + } - _cleanup_free_ char *destination = NULL; - if (readlink_malloc("/var/run", &destination) < 0 || - !PATH_IN_SET(destination, "../run", "/run")) - stage[n++] = "var-run-bad"; + log_debug("Got handoff timestamp event for PID " PID_FMT ".", ucred->pid); - _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL; - if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 && - !streq(overflowuid, "65534")) - stage[n++] = "overflowuid-not-65534"; - if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 && - !streq(overflowgid, "65534")) - stage[n++] = "overflowgid-not-65534"; + _cleanup_free_ Unit **units = NULL; + int n_units = manager_get_units_for_pidref(m, &PIDREF_MAKE_FROM_PID(ucred->pid), &units); + if (n_units < 0) { + log_warning_errno(n_units, "Unable to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid); + return 0; + } + if (n_units == 0) { + log_debug("Got handoff timestamp for process " PID_FMT " we are not interested in, ignoring.", ucred->pid); + return 0; + } - struct utsname uts; - assert_se(uname(&uts) >= 0); - if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0) - stage[n++] = "old-kernel"; + dual_timestamp dt = { + .realtime = ts[0], + .monotonic = ts[1], + }; - if (short_uid_range("/proc/self/uid_map") > 0) - stage[n++] = "short-uid-range"; - if (short_uid_range("/proc/self/gid_map") > 0) - stage[n++] = "short-gid-range"; + FOREACH_ARRAY(u, units, n_units) { + if (!UNIT_VTABLE(*u)->notify_handoff_timestamp) + continue; - assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */ + UNIT_VTABLE(*u)->notify_handoff_timestamp(*u, ucred, &dt); + } - return strv_join((char**) stage, ":"); + return 0; } void manager_ref_console(Manager *m) { @@ -4988,14 +5104,13 @@ LogTarget manager_get_executor_log_target(Manager *m) { assert(m); /* If journald is not available tell sd-executor to go to kmsg, as it might be starting journald */ + if 
(!MANAGER_IS_TEST_RUN(m) && !manager_journal_is_running(m)) + return LOG_TARGET_KMSG; - if (manager_journal_is_running(m)) - return log_get_target(); - - return LOG_TARGET_KMSG; + return log_get_target(); } -static const char *const manager_state_table[_MANAGER_STATE_MAX] = { +static const char* const manager_state_table[_MANAGER_STATE_MAX] = { [MANAGER_INITIALIZING] = "initializing", [MANAGER_STARTING] = "starting", [MANAGER_RUNNING] = "running", @@ -5006,7 +5121,22 @@ static const char *const manager_state_table[_MANAGER_STATE_MAX] = { DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState); -static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { +static const char* const manager_objective_table[_MANAGER_OBJECTIVE_MAX] = { + [MANAGER_OK] = "ok", + [MANAGER_EXIT] = "exit", + [MANAGER_RELOAD] = "reload", + [MANAGER_REEXECUTE] = "reexecute", + [MANAGER_REBOOT] = "reboot", + [MANAGER_SOFT_REBOOT] = "soft-reboot", + [MANAGER_POWEROFF] = "poweroff", + [MANAGER_HALT] = "halt", + [MANAGER_KEXEC] = "kexec", + [MANAGER_SWITCH_ROOT] = "switch-root", +}; + +DEFINE_STRING_TABLE_LOOKUP(manager_objective, ManagerObjective); + +static const char* const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { [MANAGER_TIMESTAMP_FIRMWARE] = "firmware", [MANAGER_TIMESTAMP_LOADER] = "loader", [MANAGER_TIMESTAMP_KERNEL] = "kernel", @@ -5026,6 +5156,7 @@ static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = { [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish", [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start", [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish", + [MANAGER_TIMESTAMP_SHUTDOWN_START] = "shutdown-start", }; DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp); diff --git a/src/core/manager.h b/src/core/manager.h index d96eb7b..0641b27 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -120,6 +120,9 @@ typedef enum ManagerTimestamp { 
MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH, MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START, MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH, + + MANAGER_TIMESTAMP_SHUTDOWN_START, + _MANAGER_TIMESTAMP_MAX, _MANAGER_TIMESTAMP_INVALID = -EINVAL, } ManagerTimestamp; @@ -137,6 +140,7 @@ typedef enum WatchdogType { #include "path-lookup.h" #include "show-status.h" #include "unit-name.h" +#include "unit.h" typedef enum ManagerTestRunFlags { MANAGER_TEST_NORMAL = 0, /* run normally */ @@ -282,6 +286,9 @@ struct Manager { int user_lookup_fds[2]; sd_event_source *user_lookup_event_source; + int handoff_timestamp_fds[2]; + sd_event_source *handoff_timestamp_event_source; + RuntimeScope runtime_scope; LookupPaths lookup_paths; @@ -375,6 +382,8 @@ struct Manager { bool etc_localtime_accessible; ManagerObjective objective; + /* Objective as it was before serialization, mostly to detect soft-reboots */ + ManagerObjective previous_objective; /* Flags */ bool dispatching_load_queue; @@ -438,10 +447,9 @@ struct Manager { /* This is true before and after switching root. */ bool switching_root; - /* This maps all possible path prefixes to the units needing - * them. It's a hashmap with a path string as key and a Set as - * value where Unit objects are contained. */ - Hashmap *units_requiring_mounts_for; + /* These map all possible path prefixes to the units needing them. They are hashmaps with a path + * string as key, and a Set as value where Unit objects are contained. 
*/ + Hashmap *units_needing_mounts_for[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX]; /* Used for processing polkit authorization responses */ Hashmap *polkit_registry; @@ -488,8 +496,8 @@ struct Manager { /* Reference to RestrictFileSystems= BPF program */ struct restrict_fs_bpf *restrict_fs; - /* Allow users to configure a rate limit for Reload() operations */ - RateLimit reload_ratelimit; + /* Allow users to configure a rate limit for Reload()/Reexecute() operations */ + RateLimit reload_reexec_ratelimit; /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */ RateLimit dump_ratelimit; @@ -501,6 +509,8 @@ struct Manager { /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have * serialization/deserialization compatibility issues during upgrades. */ int executor_fd; + + unsigned soft_reboots_count; }; static inline usec_t manager_default_timeout_abort_usec(Manager *m) { @@ -550,7 +560,7 @@ int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error void manager_clear_jobs(Manager *m); -void manager_unwatch_pidref(Manager *m, PidRef *pid); +void manager_unwatch_pidref(Manager *m, const PidRef *pid); unsigned manager_dispatch_load_queue(Manager *m); @@ -575,6 +585,7 @@ void manager_reset_failed(Manager *m); void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success); void manager_send_unit_plymouth(Manager *m, Unit *u); +void manager_send_unit_supervisor(Manager *m, Unit *u, bool active); bool manager_unit_inactive_or_pending(Manager *m, const char *name); @@ -596,7 +607,7 @@ double manager_get_progress(Manager *m); void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) 
_printf_(4,5); -Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path); +Set* manager_get_units_needing_mounts_for(Manager *m, const char *path, UnitMountDependencyType t); ManagerState manager_state(Manager *m); @@ -608,8 +619,6 @@ int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc); void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now); int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc); -char* manager_taint_string(const Manager *m); - void manager_ref_console(Manager *m); void manager_unref_console(Manager *m); @@ -619,13 +628,16 @@ void manager_restore_original_log_level(Manager *m); void manager_override_log_target(Manager *m, LogTarget target); void manager_restore_original_log_target(Manager *m); -const char *manager_state_to_string(ManagerState m) _const_; +const char* manager_get_confirm_spawn(Manager *m); +void manager_disable_confirm_spawn(void); + +const char* manager_state_to_string(ManagerState m) _const_; ManagerState manager_state_from_string(const char *s) _pure_; -const char *manager_get_confirm_spawn(Manager *m); -void manager_disable_confirm_spawn(void); +const char* manager_objective_to_string(ManagerObjective m) _const_; +ManagerObjective manager_objective_from_string(const char *s) _pure_; -const char *manager_timestamp_to_string(ManagerTimestamp m) _const_; +const char* manager_timestamp_to_string(ManagerTimestamp m) _const_; ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_; ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s); @@ -644,3 +656,26 @@ OOMPolicy oom_policy_from_string(const char *s) _pure_; void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope); void unit_defaults_done(UnitDefaults *defaults); + +enum { + /* most important … */ + EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10, + EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10, + EVENT_PRIORITY_CGROUP_AGENT = 
SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */ + EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */ + EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8, + EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7, + EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6, + EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5, + EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4, + EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3, + EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2, + EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1, + EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1, + EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL, + EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE, + EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1, + EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2, + /* … to least important */ +}; diff --git a/src/core/meson.build b/src/core/meson.build index 7701d3d..7a2012a 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -7,7 +7,8 @@ libcore_sources = files( 'bpf-devices.c', 'bpf-firewall.c', 'bpf-foreign.c', - 'bpf-lsm.c', + 'bpf-restrict-fs.c', + 'bpf-restrict-ifaces.c', 'bpf-socket-bind.c', 'cgroup.c', 'core-varlink.c', @@ -51,7 +52,6 @@ libcore_sources = files( 'mount.c', 'namespace.c', 'path.c', - 'restrict-ifaces.c', 'scope.c', 'selinux-access.c', 'selinux-setup.c', @@ -61,6 +61,7 @@ libcore_sources = files( 'smack-setup.c', 'socket.c', 'swap.c', + 'taint.c', 'target.c', 'timer.c', 'transaction.c', @@ -125,7 +126,7 @@ libcore = shared_library( libaudit, libblkid, libdl, - libkmod, + libkmod_cflags, libm, libmount, libpam, diff --git a/src/core/mount.c b/src/core/mount.c index 3c4971c..ebafcaf 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -39,18 +39,18 @@ #define RETRY_UMOUNT_MAX 32 static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = { - [MOUNT_DEAD] = UNIT_INACTIVE, - [MOUNT_MOUNTING] = UNIT_ACTIVATING, - [MOUNT_MOUNTING_DONE] = 
UNIT_ACTIVATING, - [MOUNT_MOUNTED] = UNIT_ACTIVE, - [MOUNT_REMOUNTING] = UNIT_RELOADING, - [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING, + [MOUNT_DEAD] = UNIT_INACTIVE, + [MOUNT_MOUNTING] = UNIT_ACTIVATING, + [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING, + [MOUNT_MOUNTED] = UNIT_ACTIVE, + [MOUNT_REMOUNTING] = UNIT_RELOADING, + [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING, [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING, [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING, [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING, [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING, - [MOUNT_FAILED] = UNIT_FAILED, - [MOUNT_CLEANING] = UNIT_MAINTENANCE, + [MOUNT_FAILED] = UNIT_FAILED, + [MOUNT_CLEANING] = UNIT_MAINTENANCE, }; static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); @@ -171,24 +171,9 @@ static bool mount_propagate_stop(Mount *m) { * otherwise let's not bother. */ } -static bool mount_needs_quota(const MountParameters *p) { - assert(p); - - if (p->fstype && !fstype_needs_quota(p->fstype)) - return false; - - if (mount_is_bind(p)) - return false; - - return fstab_test_option(p->options, - "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0"); -} - static void mount_init(Unit *u) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); - assert(m); - assert(u); assert(u->load_state == UNIT_STUB); m->timeout_usec = u->manager->defaults.timeout_start_usec; @@ -218,12 +203,7 @@ static int mount_arm_timer(Mount *m, bool relative, usec_t usec) { static void mount_unwatch_control_pid(Mount *m) { assert(m); - - if (!pidref_is_set(&m->control_pid)) - return; - - unit_unwatch_pidref(UNIT(m), &m->control_pid); - pidref_done(&m->control_pid); + unit_unwatch_pidref_done(UNIT(m), &m->control_pid); } static void mount_parameters_done(MountParameters *p) { @@ -235,9 +215,7 @@ static void mount_parameters_done(MountParameters *p) { } static void mount_done(Unit *u) { - Mount *m = MOUNT(u); - - assert(m); + Mount *m = ASSERT_PTR(MOUNT(u)); m->where = mfree(m->where); @@ 
-245,6 +223,7 @@ static void mount_done(Unit *u) { mount_parameters_done(&m->parameters_fragment); m->exec_runtime = exec_runtime_free(m->exec_runtime); + exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); m->control_command = NULL; @@ -262,6 +241,8 @@ static int update_parameters_proc_self_mountinfo( MountParameters *p; int r, q, w; + assert(m); + p = &m->parameters_proc_self_mountinfo; r = free_and_strdup(&p->what, what); @@ -281,8 +262,6 @@ static int update_parameters_proc_self_mountinfo( static int mount_add_mount_dependencies(Mount *m) { MountParameters *pm; - Unit *other; - Set *s; int r; assert(m); @@ -296,7 +275,7 @@ static int mount_add_mount_dependencies(Mount *m) { if (r < 0) return r; - r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT); + r = unit_add_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES); if (r < 0) return r; } @@ -308,30 +287,43 @@ static int mount_add_mount_dependencies(Mount *m) { path_is_absolute(pm->what) && (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) { - r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES); if (r < 0) return r; } /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */ - s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where); - SET_FOREACH(other, s) { - - if (other->load_state != UNIT_LOADED) - continue; - - if (other == UNIT(m)) - continue; - - r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH); - if (r < 0) - return r; - - if (UNIT(m)->fragment_path) { - /* If we have fragment configuration, then make this dependency required */ - r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH); + for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) { + Unit *other; + Set *s = 
manager_get_units_needing_mounts_for(UNIT(m)->manager, m->where, t); + + SET_FOREACH(other, s) { + if (other->load_state != UNIT_LOADED) + continue; + + if (other == UNIT(m)) + continue; + + r = unit_add_dependency( + other, + UNIT_AFTER, + UNIT(m), + /* add_reference= */ true, + UNIT_DEPENDENCY_PATH); if (r < 0) return r; + + if (UNIT(m)->fragment_path) { + /* If we have fragment configuration, then make this dependency required/wanted */ + r = unit_add_dependency( + other, + unit_mount_dependency_type_to_dependency_type(t), + UNIT(m), + /* add_reference= */ true, + UNIT_DEPENDENCY_PATH); + if (r < 0) + return r; + } } } @@ -413,39 +405,9 @@ static int mount_add_device_dependencies(Mount *m) { return 0; } -static int mount_add_quota_dependencies(Mount *m) { - MountParameters *p; - int r; - - assert(m); - - if (!MANAGER_IS_SYSTEM(UNIT(m)->manager)) - return 0; - - p = get_mount_parameters_fragment(m); - if (!p) - return 0; - - if (!mount_needs_quota(p)) - return 0; - - r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE, - /* add_reference= */ true, UNIT_DEPENDENCY_FILE); - if (r < 0) - return r; - - r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE, - /* add_reference= */true, UNIT_DEPENDENCY_FILE); - if (r < 0) - return r; - - return 0; -} - static bool mount_is_extrinsic(Unit *u) { + Mount *m = ASSERT_PTR(MOUNT(u)); MountParameters *p; - Mount *m = MOUNT(u); - assert(m); /* Returns true for all units that are "magic" and should be excluded from the usual * start-up and shutdown dependencies. 
We call them "extrinsic" here, as they are generally @@ -501,10 +463,7 @@ static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, after = SPECIAL_LOCAL_FS_PRE_TARGET; before = SPECIAL_INITRD_USR_FS_TARGET; - } else if (mount_is_credentials(m)) - after = before = NULL; - - else if (mount_is_network(p)) { + } else if (mount_is_network(p)) { after = SPECIAL_REMOTE_FS_PRE_TARGET; before = SPECIAL_REMOTE_FS_TARGET; @@ -645,6 +604,9 @@ static int mount_add_non_exec_dependencies(Mount *m) { if (!m->where) return 0; + if (mount_is_credentials(m)) + UNIT(m)->default_dependencies = false; + /* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies * resulting from the ExecContext and such. */ @@ -656,10 +618,6 @@ static int mount_add_non_exec_dependencies(Mount *m) { if (r < 0) return r; - r = mount_add_quota_dependencies(m); - if (r < 0) - return r; - r = mount_add_default_dependencies(m); if (r < 0) return r; @@ -668,11 +626,9 @@ static int mount_add_non_exec_dependencies(Mount *m) { } static int mount_add_extras(Mount *m) { - Unit *u = UNIT(m); + Unit *u = UNIT(ASSERT_PTR(m)); int r; - assert(m); - /* Note: this call might be called after we already have been loaded once (and even when it has already been * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready * to run with an already set up unit. 
*/ @@ -717,7 +673,7 @@ static int mount_add_extras(Mount *m) { } static void mount_load_root_mount(Unit *u) { - assert(u); + Mount *m = ASSERT_PTR(MOUNT(u)); if (!unit_has_name(u, SPECIAL_ROOT_MOUNT)) return; @@ -726,37 +682,35 @@ static void mount_load_root_mount(Unit *u) { u->default_dependencies = false; /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */ - MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL; - MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL; + m->exec_context.std_output = EXEC_OUTPUT_NULL; + m->exec_context.std_input = EXEC_INPUT_NULL; if (!u->description) u->description = strdup("Root Mount"); } static int mount_load(Unit *u) { - Mount *m = MOUNT(u); - int r, q = 0; + Mount *m = ASSERT_PTR(MOUNT(u)); + int r; - assert(m); - assert(u); assert(u->load_state == UNIT_STUB); mount_load_root_mount(u); - bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual; - r = unit_load_fragment_and_dropin(u, !fragment_optional); + bool from_kernel = m->from_proc_self_mountinfo || u->perpetual; + + r = unit_load_fragment_and_dropin(u, /* fragment_required = */ !from_kernel); /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus * we need to update the state in our unit to track it. After all, consider that we don't allow changing the * 'slice' field for a unit once it is active. 
*/ - if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual) - q = mount_add_extras(m); + if (u->load_state == UNIT_LOADED || from_kernel) + RET_GATHER(r, mount_add_extras(m)); if (r < 0) return r; - if (q < 0) - return q; + if (u->load_state != UNIT_LOADED) return 0; @@ -765,6 +719,7 @@ static int mount_load(Unit *u) { static void mount_set_state(Mount *m, MountState state) { MountState old_state; + assert(m); if (m->state != state) @@ -787,10 +742,9 @@ static void mount_set_state(Mount *m, MountState state) { } static int mount_coldplug(Unit *u) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); int r; - assert(m); assert(m->state == MOUNT_DEAD); if (m->deserialized_state == m->state) @@ -809,17 +763,17 @@ static int mount_coldplug(Unit *u) { return r; } - if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED)) + if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED)) { (void) unit_setup_exec_runtime(u); + (void) unit_setup_cgroup_runtime(u); + } mount_set_state(m, m->deserialized_state); return 0; } static void mount_catchup(Unit *u) { - Mount *m = MOUNT(ASSERT_PTR(u)); - - assert(m); + Mount *m = ASSERT_PTR(MOUNT(u)); /* Adjust the deserialized state. See comments in mount_process_proc_self_mountinfo(). 
*/ if (m->from_proc_self_mountinfo) @@ -854,12 +808,15 @@ static void mount_catchup(Unit *u) { } static void mount_dump(Unit *u, FILE *f, const char *prefix) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); MountParameters *p; + const char *prefix2; - assert(m); assert(f); + prefix = strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + p = get_mount_parameters(m); fprintf(f, @@ -904,14 +861,22 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) { exec_context_dump(&m->exec_context, f, prefix); kill_context_dump(&m->kill_context, f, prefix); cgroup_context_dump(UNIT(m), f, prefix); + + for (MountExecCommand c = 0; c < _MOUNT_EXEC_COMMAND_MAX; c++) { + if (!m->exec_command[c].argv) + continue; + + fprintf(f, "%s%s %s:\n", + prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), mount_exec_command_to_string(c)); + + exec_command_dump(m->exec_command + c, f, prefix2); + } } static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) { - _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - pid_t pid; int r; assert(m); @@ -936,11 +901,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) { &exec_params, m->exec_runtime, &m->cgroup_context, - &pid); - if (r < 0) - return r; - - r = pidref_set_pid(&pidref, pid); + &pidref); if (r < 0) return r; @@ -1025,13 +986,7 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) { if (m->result == MOUNT_SUCCESS) m->result = f; - r = unit_kill_context( - UNIT(m), - &m->kill_context, - state_to_kill_operation(state), - /* main_pid= */ NULL, - &m->control_pid, - /* main_pid_alien= */ false); + r = unit_kill_context(UNIT(m), state_to_kill_operation(state)); if (r < 0) { log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m"); goto fail; @@ -1166,9 +1121,9 @@ static int mount_set_mount_command(Mount *m, ExecCommand 
*c, const MountParamete } static void mount_enter_mounting(Mount *m) { - int r; MountParameters *p; bool source_is_dir = true; + int r; assert(m); @@ -1192,6 +1147,34 @@ static void mount_enter_mounting(Mount *m) { if (r < 0 && r != -EEXIST) log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where); + /* If we are asked to create an OverlayFS, create the upper/work directories if they are missing */ + if (p && streq_ptr(p->fstype, "overlay")) { + _cleanup_strv_free_ char **dirs = NULL; + + r = fstab_filter_options( + p->options, + "upperdir\0workdir\0", + /* ret_namefound= */ NULL, + /* ret_value= */ NULL, + &dirs, + /* ret_filtered= */ NULL); + if (r < 0) + log_unit_warning_errno( + UNIT(m), + r, + "Failed to determine upper directory for OverlayFS, ignoring: %m"); + else + STRV_FOREACH(d, dirs) { + r = mkdir_p_label(*d, m->directory_mode); + if (r < 0 && r != -EEXIST) + log_unit_warning_errno( + UNIT(m), + r, + "Failed to create overlay directory '%s', ignoring: %m", + *d); + } + } + if (source_is_dir) unit_warn_if_dir_nonempty(UNIT(m), m->where); unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start); @@ -1249,8 +1232,8 @@ static void mount_set_reload_result(Mount *m, MountResult result) { } static void mount_enter_remounting(Mount *m) { - int r; MountParameters *p; + int r; assert(m); @@ -1312,15 +1295,15 @@ static void mount_cycle_clear(Mount *m) { m->result = MOUNT_SUCCESS; m->reload_result = MOUNT_SUCCESS; exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX); - UNIT(m)->reset_accounting = true; + + if (m->cgroup_runtime) + m->cgroup_runtime->reset_accounting = true; } static int mount_start(Unit *u) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); int r; - assert(m); - /* We cannot fulfill this request right now, try again later * please! 
*/ if (IN_SET(m->state, @@ -1347,9 +1330,7 @@ static int mount_start(Unit *u) { } static int mount_stop(Unit *u) { - Mount *m = MOUNT(u); - - assert(m); + Mount *m = ASSERT_PTR(MOUNT(u)); /* When we directly call umount() for a path, then the state of the corresponding mount unit may be * outdated. Let's re-read mountinfo now and update the state. */ @@ -1401,9 +1382,8 @@ static int mount_stop(Unit *u) { } static int mount_reload(Unit *u) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); - assert(m); assert(m->state == MOUNT_MOUNTED); mount_enter_remounting(m); @@ -1412,9 +1392,8 @@ static int mount_reload(Unit *u) { } static int mount_serialize(Unit *u, FILE *f, FDSet *fds) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); - assert(m); assert(f); assert(fds); @@ -1431,11 +1410,9 @@ static int mount_serialize(Unit *u, FILE *f, FDSet *fds) { } static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); int r; - assert(m); - assert(u); assert(key); assert(value); assert(fds); @@ -1495,21 +1472,19 @@ static int mount_deserialize_item(Unit *u, const char *key, const char *value, F } static UnitActiveState mount_active_state(Unit *u) { - assert(u); + Mount *m = ASSERT_PTR(MOUNT(u)); - return state_translation_table[MOUNT(u)->state]; + return state_translation_table[m->state]; } static const char *mount_sub_state_to_string(Unit *u) { - assert(u); + Mount *m = ASSERT_PTR(MOUNT(u)); - return mount_state_to_string(MOUNT(u)->state); + return mount_state_to_string(m->state); } static bool mount_may_gc(Unit *u) { - Mount *m = MOUNT(u); - - assert(m); + Mount *m = ASSERT_PTR(MOUNT(u)); if (m->from_proc_self_mountinfo) return false; @@ -1518,10 +1493,9 @@ static bool mount_may_gc(Unit *u) { } static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); MountResult f; - assert(m); assert(pid >= 0); 
if (pid != m->control_pid.pid) @@ -1653,9 +1627,8 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) { } static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { - Mount *m = MOUNT(userdata); + Mount *m = ASSERT_PTR(MOUNT(userdata)); - assert(m); assert(m->timer_event_source == source); switch (m->state) { @@ -1738,6 +1711,7 @@ static int mount_setup_new_unit( Unit **ret) { _cleanup_(unit_freep) Unit *u = NULL; + Mount *mnt; int r; assert(m); @@ -1749,24 +1723,26 @@ static int mount_setup_new_unit( if (r < 0) return r; + mnt = ASSERT_PTR(MOUNT(u)); + r = free_and_strdup(&u->source_path, "/proc/self/mountinfo"); if (r < 0) return r; - r = free_and_strdup(&MOUNT(u)->where, where); + r = free_and_strdup(&mnt->where, where); if (r < 0) return r; - r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + r = update_parameters_proc_self_mountinfo(mnt, what, options, fstype); if (r < 0) return r; /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the * time we load the unit file for it (and thus add in extra deps right after) we know what source to * attributes the deps to. */ - MOUNT(u)->from_proc_self_mountinfo = true; + mnt->from_proc_self_mountinfo = true; - r = mount_add_non_exec_dependencies(MOUNT(u)); + r = mount_add_non_exec_dependencies(mnt); if (r < 0) return r; @@ -1787,14 +1763,16 @@ static int mount_setup_existing_unit( const char *fstype, MountProcFlags *ret_flags) { + Mount *m = ASSERT_PTR(MOUNT(u)); int r; assert(u); + assert(where); assert(ret_flags); - if (!MOUNT(u)->where) { - MOUNT(u)->where = strdup(where); - if (!MOUNT(u)->where) + if (!m->where) { + m->where = strdup(where); + if (!m->where) return -ENOMEM; } @@ -1802,10 +1780,9 @@ static int mount_setup_existing_unit( * for the current unit. 
Note that the flags field is reset on each iteration of reading * /proc/self/mountinfo, hence we know for sure anything already set here is from the current * iteration and thus worthy of taking into account. */ - MountProcFlags flags = - MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED; + MountProcFlags flags = m->proc_flags | MOUNT_PROC_IS_MOUNTED; - r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype); + r = update_parameters_proc_self_mountinfo(m, what, options, fstype); if (r < 0) return r; if (r > 0) @@ -1818,12 +1795,12 @@ static int mount_setup_existing_unit( * from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is * reached when we wait for the mount to appear we hence can assume that if we are in it, we are * actually seeing it established for the first time. */ - if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING) + if (!m->from_proc_self_mountinfo || m->state == MOUNT_MOUNTING) flags |= MOUNT_PROC_JUST_MOUNTED; - MOUNT(u)->from_proc_self_mountinfo = true; + m->from_proc_self_mountinfo = true; - if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + if (UNIT_IS_LOAD_ERROR(u->load_state)) { /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */ u->load_state = UNIT_LOADED; @@ -1835,7 +1812,7 @@ static int mount_setup_existing_unit( if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) { /* If things changed, then make sure that all deps are regenerated. Let's * first remove all automatic deps, and then add in the new ones. 
*/ - r = mount_add_non_exec_dependencies(MOUNT(u)); + r = mount_add_non_exec_dependencies(m); if (r < 0) return r; } @@ -1950,14 +1927,27 @@ static void mount_shutdown(Manager *m) { m->mount_monitor = NULL; } +static void mount_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Mount *m = ASSERT_PTR(MOUNT(u)); + + assert(ucred); + assert(ts); + + if (m->control_pid.pid == ucred->pid && m->control_command) { + exec_status_handoff(&m->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int mount_get_timeout(Unit *u, usec_t *timeout) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); usec_t t; int r; - assert(m); - assert(u); - if (!m->timer_event_source) return 0; @@ -2063,7 +2053,7 @@ static void mount_enumerate(Manager *m) { goto fail; } - r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10); + r = sd_event_source_set_priority(m->mount_event_source, EVENT_PRIORITY_MOUNT_TABLE); if (r < 0) { log_error_errno(r, "Failed to adjust mount watch priority: %m"); goto fail; @@ -2330,19 +2320,15 @@ fail: } static int mount_can_clean(Unit *u, ExecCleanMask *ret) { - Mount *m = MOUNT(u); - - assert(m); + Mount *m = ASSERT_PTR(MOUNT(u)); return exec_context_get_clean_mask(&m->exec_context, ret); } static int mount_can_start(Unit *u) { - Mount *m = MOUNT(u); + Mount *m = ASSERT_PTR(MOUNT(u)); int r; - assert(m); - r = unit_test_start_limit(u); if (r < 0) { mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT, /* flush_result = */ false); @@ -2440,6 +2426,7 @@ const UnitVTable mount_vtable = { .cgroup_context_offset = offsetof(Mount, cgroup_context), .kill_context_offset = offsetof(Mount, kill_context), .exec_runtime_offset = offsetof(Mount, exec_runtime), + .cgroup_runtime_offset = offsetof(Mount, cgroup_runtime), .sections = "Unit\0" @@ -2482,6 +2469,8 @@ const UnitVTable mount_vtable = { .reset_failed = mount_reset_failed, + .notify_handoff_timestamp = 
mount_handoff_timestamp, + .control_pid = mount_control_pid, .bus_set_property = bus_mount_set_property, diff --git a/src/core/mount.h b/src/core/mount.h index 6712c16..a029dc8 100644 --- a/src/core/mount.h +++ b/src/core/mount.h @@ -79,6 +79,7 @@ struct Mount { CGroupContext cgroup_context; ExecRuntime *exec_runtime; + CGroupRuntime *cgroup_runtime; MountState state, deserialized_state; diff --git a/src/core/namespace.c b/src/core/namespace.c index 88681aa..6c0dc94 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -47,6 +47,7 @@ #include "tmpfile-util.h" #include "umask-util.h" #include "user-util.h" +#include "vpick.h" #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) @@ -500,9 +501,24 @@ static int append_extensions( /* First, prepare a mount for each image, but these won't be visible to the unit, instead * they will be mounted in our propagate directory, and used as a source for the overlay. */ for (size_t i = 0; i < n; i++) { + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; _cleanup_free_ char *mount_point = NULL; const MountImage *m = mount_images + i; + r = path_pick(/* toplevel_path= */ NULL, + /* toplevel_fd= */ AT_FDCWD, + m->source, + &pick_filter_image_raw, + PICK_ARCHITECTURE|PICK_TRIES, + &result); + if (r < 0) + return r; + if (!result.path) + return log_debug_errno( + SYNTHETIC_ERRNO(ENOENT), + "No matching entry in .v/ directory %s found.", + m->source); + if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0) return -ENOMEM; @@ -524,7 +540,7 @@ static int append_extensions( .path_malloc = TAKE_PTR(mount_point), .image_options_const = m->mount_options, .ignore = m->ignore_enoent, - .source_const = m->source, + .source_malloc = TAKE_PTR(result.path), .mode = MOUNT_EXTENSION_IMAGE, .has_prefix = true, }; @@ -534,7 +550,8 @@ static int append_extensions( * Bind mount them in the same location as the ExtensionImages, so that we * can check that they are valid trees (extension-release.d). 
*/ STRV_FOREACH(extension_directory, extension_directories) { - _cleanup_free_ char *mount_point = NULL, *source = NULL; + _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL; + _cleanup_free_ char *mount_point = NULL; const char *e = *extension_directory; bool ignore_enoent = false; @@ -551,9 +568,19 @@ static int append_extensions( if (startswith(e, "+")) e++; - source = strdup(e); - if (!source) - return -ENOMEM; + r = path_pick(/* toplevel_path= */ NULL, + /* toplevel_fd= */ AT_FDCWD, + e, + &pick_filter_image_dir, + PICK_ARCHITECTURE|PICK_TRIES, + &result); + if (r < 0) + return r; + if (!result.path) + return log_debug_errno( + SYNTHETIC_ERRNO(ENOENT), + "No matching entry in .v/ directory %s found.", + e); for (size_t j = 0; hierarchies && hierarchies[j]; ++j) { char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]); @@ -571,7 +598,7 @@ static int append_extensions( *me = (MountEntry) { .path_malloc = TAKE_PTR(mount_point), - .source_malloc = TAKE_PTR(source), + .source_malloc = TAKE_PTR(result.path), .mode = MOUNT_EXTENSION_DIRECTORY, .ignore = ignore_enoent, .has_prefix = true, @@ -626,8 +653,7 @@ static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); ro = flags & MS_RDONLY; - if (ro) - flags ^= MS_RDONLY; + flags &= ~MS_RDONLY; MountEntry *me = mount_list_extend(ml); if (!me) @@ -876,42 +902,41 @@ static void drop_outside_root(MountList *ml, const char *root_directory) { ml->n_mounts = t - ml->mounts; } -static int clone_device_node( - const char *d, - const char *temporary_mount, - bool *make_devnode) { - +static int clone_device_node(const char *node, const char *temporary_mount, bool *make_devnode) { _cleanup_free_ char *sl = NULL; - const char *dn, *bn, *t; + const char *dn, *bn; struct stat st; int r; - if (stat(d, &st) < 0) { + assert(node); + assert(path_is_absolute(node)); + assert(temporary_mount); + assert(make_devnode); + + 
if (stat(node, &st) < 0) { if (errno == ENOENT) { - log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d); + log_debug_errno(errno, "Device node '%s' to clone does not exist.", node); return -ENXIO; } - return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d); + return log_debug_errno(errno, "Failed to stat() device node '%s' to clone: %m", node); } - if (!S_ISBLK(st.st_mode) && - !S_ISCHR(st.st_mode)) - return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), - "Device node '%s' to clone is not a device node, ignoring.", - d); + r = stat_verify_device_node(&st); + if (r < 0) + return log_debug_errno(r, "Cannot clone device node '%s': %m", node); - dn = strjoina(temporary_mount, d); + dn = strjoina(temporary_mount, node); /* First, try to create device node properly */ if (*make_devnode) { - mac_selinux_create_file_prepare(d, st.st_mode); + mac_selinux_create_file_prepare(node, st.st_mode); r = mknod(dn, st.st_mode, st.st_rdev); mac_selinux_create_file_clear(); if (r >= 0) goto add_symlink; if (errno != EPERM) - return log_debug_errno(errno, "mknod failed for %s: %m", d); + return log_debug_errno(errno, "Failed to mknod '%s': %m", node); /* This didn't work, let's not try this again for the next iterations. */ *make_devnode = false; @@ -921,17 +946,17 @@ static int clone_device_node( * Do not prepare device-node SELinux label (see issue 13762) */ r = mknod(dn, S_IFREG, 0); if (r < 0 && errno != EEXIST) - return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d); + return log_debug_errno(errno, "Failed to mknod dummy device node for '%s': %m", node); /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. 
*/ - r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL); + r = mount_nofollow_verbose(LOG_DEBUG, node, dn, NULL, MS_BIND, NULL); if (r < 0) return r; add_symlink: - bn = path_startswith(d, "/dev/"); + bn = path_startswith(node, "/dev/"); if (!bn) return 0; @@ -944,14 +969,27 @@ add_symlink: (void) mkdir_parents(sl, 0755); - t = strjoina("../", bn); + const char *t = strjoina("../", bn); if (symlink(t, sl) < 0) log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl); return 0; } -static char *settle_runtime_dir(RuntimeScope scope) { +static int bind_mount_device_dir(const char *temporary_mount, const char *dir) { + const char *t; + + assert(temporary_mount); + assert(dir); + assert(path_is_absolute(dir)); + + t = strjoina(temporary_mount, dir); + + (void) mkdir(t, 0755); + return mount_nofollow_verbose(LOG_DEBUG, dir, t, NULL, MS_BIND, NULL); +} + +static char* settle_runtime_dir(RuntimeScope scope) { char *runtime_dir; if (scope != RUNTIME_SCOPE_USER) @@ -992,8 +1030,8 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) { "/dev/urandom\0" "/dev/tty\0"; - _cleanup_free_ char *temporary_mount = NULL; - const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL; + _cleanup_(rmdir_and_freep) char *temporary_mount = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *dev = NULL; bool can_mknod = true; int r; @@ -1003,67 +1041,56 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) { if (r < 0) return r; - dev = strjoina(temporary_mount, "/dev"); + dev = path_join(temporary_mount, "dev"); + if (!dev) + return -ENOMEM; + (void) mkdir(dev, 0755); r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV); if (r < 0) - goto fail; + return r; r = label_fix_full(AT_FDCWD, dev, "/dev", 0); - if (r < 0) { - log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev); - goto fail; 
- } + if (r < 0) + return log_debug_errno(r, "Failed to fix label of '%s' as /dev/: %m", dev); - devpts = strjoina(temporary_mount, "/dev/pts"); - (void) mkdir(devpts, 0755); - r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL); + r = bind_mount_device_dir(temporary_mount, "/dev/pts"); if (r < 0) - goto fail; + return r; /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx. * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible. * Thus, in that case make a clone. * In nspawn and other containers it will be a symlink, in that case make it a symlink. */ r = is_symlink("/dev/ptmx"); - if (r < 0) { - log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m"); - goto fail; - } else if (r > 0) { - devptmx = strjoina(temporary_mount, "/dev/ptmx"); - if (symlink("pts/ptmx", devptmx) < 0) { - r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx); - goto fail; - } + if (r < 0) + return log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m"); + if (r > 0) { + const char *devptmx = strjoina(temporary_mount, "/dev/ptmx"); + if (symlink("pts/ptmx", devptmx) < 0) + return log_debug_errno(errno, "Failed to create symlink '%s' to pts/ptmx: %m", devptmx); } else { r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod); if (r < 0) - goto fail; + return r; } - devshm = strjoina(temporary_mount, "/dev/shm"); - (void) mkdir(devshm, 0755); - r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL); + r = bind_mount_device_dir(temporary_mount, "/dev/shm"); if (r < 0) - goto fail; - - devmqueue = strjoina(temporary_mount, "/dev/mqueue"); - (void) mkdir(devmqueue, 0755); - (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL); + return r; - devhugepages = strjoina(temporary_mount, "/dev/hugepages"); - (void) mkdir(devhugepages, 0755); - (void) 
mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL); + FOREACH_STRING(d, "/dev/mqueue", "/dev/hugepages") + (void) bind_mount_device_dir(temporary_mount, d); - devlog = strjoina(temporary_mount, "/dev/log"); + const char *devlog = strjoina(temporary_mount, "/dev/log"); if (symlink("/run/systemd/journal/dev-log", devlog) < 0) - log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog); + log_debug_errno(errno, "Failed to create symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog); NULSTR_FOREACH(d, devnodes) { r = clone_device_node(d, temporary_mount, &can_mknod); /* ENXIO means the *source* is not a device file, skip creation in that case */ if (r < 0 && r != -ENXIO) - goto fail; + return r; } r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID); @@ -1081,31 +1108,10 @@ static int mount_private_dev(MountEntry *m, RuntimeScope scope) { r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL); if (r < 0) - goto fail; - - (void) rmdir(dev); - (void) rmdir(temporary_mount); + return r; + dev = rmdir_and_free(dev); /* Mount is successfully moved, do not umount() */ return 1; - -fail: - if (devpts) - (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW); - - if (devshm) - (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW); - - if (devhugepages) - (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW); - - if (devmqueue) - (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW); - - (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW); - (void) rmdir(dev); - (void) rmdir(temporary_mount); - - return r; } static int mount_bind_dev(const MountEntry *m) { @@ -1118,7 +1124,7 @@ static int mount_bind_dev(const MountEntry *m) { (void) mkdir_p_label(mount_entry_path(m), 0755); - r = path_is_mount_point(mount_entry_path(m), NULL, 0); + r = path_is_mount_point(mount_entry_path(m)); if (r < 0) return log_debug_errno(r, 
"Unable to determine whether /dev is already mounted: %m"); if (r > 0) /* make this a NOP if /dev is already a mount point */ @@ -1138,7 +1144,7 @@ static int mount_bind_sysfs(const MountEntry *m) { (void) mkdir_p_label(mount_entry_path(m), 0755); - r = path_is_mount_point(mount_entry_path(m), NULL, 0); + r = path_is_mount_point(mount_entry_path(m)); if (r < 0) return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); if (r > 0) /* make this a NOP if /sys is already a mount point */ @@ -1185,7 +1191,7 @@ static int mount_private_apivfs( /* When we do not have enough privileges to mount a new instance, fall back to use an * existing mount. */ - r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0); + r = path_is_mount_point(entry_path); if (r < 0) return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path); if (r > 0) @@ -1300,7 +1306,7 @@ static int mount_run(const MountEntry *m) { assert(m); - r = path_is_mount_point(mount_entry_path(m), NULL, 0); + r = path_is_mount_point(mount_entry_path(m)); if (r < 0 && r != -ENOENT) return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m"); if (r > 0) /* make this a NOP if /run is already a mount point */ @@ -1354,7 +1360,7 @@ static int mount_image( if (r < 0) return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); if (isempty(host_os_release_id)) - return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory)); } r = verity_dissect_and_mount( @@ -1448,6 +1454,8 @@ static int follow_symlink( _cleanup_free_ char *target = NULL; int r; + assert(m); + /* Let's chase symlinks, but only one step at a time. 
That's because depending where the symlink points we * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the @@ -1469,7 +1477,7 @@ static int follow_symlink( mount_entry_consume_prefix(m, TAKE_PTR(target)); - m->n_followed ++; + m->n_followed++; return 0; } @@ -1524,7 +1532,7 @@ static int apply_one_mount( r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible); if (r < 0) return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), - "File type not supported for inaccessible mounts. Note that symlinks are not allowed"); + "File type not supported for inaccessible mounts. Note that symlinks are not allowed."); what = inaccessible; break; } @@ -1534,7 +1542,7 @@ static int apply_one_mount( case MOUNT_READ_WRITE_IMPLICIT: case MOUNT_EXEC: case MOUNT_NOEXEC: - r = path_is_mount_point(mount_entry_path(m), root_directory, 0); + r = path_is_mount_point_full(mount_entry_path(m), root_directory, /* flags = */ 0); if (r == -ENOENT && m->ignore) return 0; if (r < 0) @@ -1575,7 +1583,7 @@ static int apply_one_mount( if (r < 0) return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); if (isempty(host_os_release_id)) - return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory)); + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s'.", empty_to_root(root_directory)); r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release); if (r == -ENOENT && m->ignore) @@ -1588,13 +1596,13 @@ static int apply_one_mount( host_os_release_id, host_os_release_version_id, host_os_release_level, - /* host_extension_scope */ NULL, /* 
Leave empty, we need to accept both system and portable */ + /* host_extension_scope = */ NULL, /* Leave empty, we need to accept both system and portable */ extension_release, class); - if (r == 0) - return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name); if (r < 0) return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's.", extension_name); _fallthrough_; } @@ -2049,9 +2057,9 @@ static bool root_read_only( } static bool home_read_only( - char** read_only_paths, - char** inaccessible_paths, - char** empty_directories, + char * const *read_only_paths, + char * const *inaccessible_paths, + char * const *empty_directories, const BindMount *bind_mounts, size_t n_bind_mounts, const TemporaryFileSystem *temporary_filesystems, @@ -2070,13 +2078,13 @@ static bool home_read_only( prefixed_path_strv_contains(empty_directories, "/home")) return true; - for (size_t i = 0; i < n_temporary_filesystems; i++) - if (path_equal(temporary_filesystems[i].path, "/home")) + FOREACH_ARRAY(i, temporary_filesystems, n_temporary_filesystems) + if (path_equal(i->path, "/home")) return true; /* If /home is overmounted with some dir from the host it's not writable. 
*/ - for (size_t i = 0; i < n_bind_mounts; i++) - if (path_equal(bind_mounts[i].destination, "/home")) + FOREACH_ARRAY(i, bind_mounts, n_bind_mounts) + if (path_equal(i->destination, "/home")) return true; return false; @@ -2088,6 +2096,7 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_strv_free_ char **hierarchies = NULL; _cleanup_(mount_list_done) MountList ml = {}; + _cleanup_close_ int userns_fd = -EBADF; bool require_prefix = false; const char *root; DissectImageFlags dissect_image_flags = @@ -2099,7 +2108,8 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { DISSECT_IMAGE_USR_NO_ROOT | DISSECT_IMAGE_GROWFS | DISSECT_IMAGE_ADD_PARTITION_DEVICES | - DISSECT_IMAGE_PIN_PARTITION_DEVICES; + DISSECT_IMAGE_PIN_PARTITION_DEVICES | + DISSECT_IMAGE_ALLOW_USERSPACE_VERITY; int r; assert(p); @@ -2123,40 +2133,57 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path); - r = loop_device_make_by_path( - p->root_image, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, - /* sector_size= */ UINT32_MAX, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, - LOCK_SH, - &loop_device); - if (r < 0) - return log_debug_errno(r, "Failed to create loop device for root image: %m"); - - r = dissect_loop_device( - loop_device, - p->verity, - p->root_image_options, - p->root_image_policy, - dissect_image_flags, - &dissected_image); - if (r < 0) - return log_debug_errno(r, "Failed to dissect image: %m"); + if (p->runtime_scope == RUNTIME_SCOPE_SYSTEM) { + /* In system mode we mount directly */ - r = dissected_image_load_verity_sig_partition( - dissected_image, - loop_device->fd, - p->verity); - if (r < 0) - return r; + r = loop_device_make_by_path( + p->root_image, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop_device); + if (r < 0) + return log_debug_errno(r, "Failed to create loop device for root image: %m"); + + r = dissect_loop_device( + loop_device, + p->verity, + p->root_image_options, + p->root_image_policy, + dissect_image_flags, + &dissected_image); + if (r < 0) + return log_debug_errno(r, "Failed to dissect image: %m"); - r = dissected_image_decrypt( - dissected_image, - NULL, - p->verity, - dissect_image_flags); - if (r < 0) - return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + r = dissected_image_load_verity_sig_partition( + dissected_image, + loop_device->fd, + p->verity); + if (r < 0) + return r; + + r = dissected_image_decrypt( + dissected_image, + NULL, + p->verity, + dissect_image_flags); + if (r < 0) + return log_debug_errno(r, "Failed to decrypt dissected image: %m"); + } else { + userns_fd = namespace_open_by_type(NAMESPACE_USER); + if (userns_fd < 0) + return log_debug_errno(userns_fd, "Failed to open our own user namespace: %m"); + + r = mountfsd_mount_image( + p->root_image, + userns_fd, + p->root_image_policy, + 
dissect_image_flags, + &dissected_image); + if (r < 0) + return r; + } } if (p->root_directory) @@ -2520,16 +2547,18 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { root, /* uid_shift= */ UID_INVALID, /* uid_range= */ UID_INVALID, - /* userns_fd= */ -EBADF, + userns_fd, dissect_image_flags); if (r < 0) return log_debug_errno(r, "Failed to mount root image: %m"); /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device * if it likes. */ - r = loop_device_flock(loop_device, LOCK_UN); - if (r < 0) - return log_debug_errno(r, "Failed to release lock on loopback block device: %m"); + if (loop_device) { + r = loop_device_flock(loop_device, LOCK_UN); + if (r < 0) + return log_debug_errno(r, "Failed to release lock on loopback block device: %m"); + } r = dissected_image_relinquish(dissected_image); if (r < 0) @@ -2538,7 +2567,7 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { } else if (p->root_directory) { /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. 
*/ - r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW); + r = path_is_mount_point_full(root, /* root = */ NULL, AT_SYMLINK_FOLLOW); if (r < 0) return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root); if (r == 0) { @@ -2595,9 +2624,9 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { void bind_mount_free_many(BindMount *b, size_t n) { assert(b || n == 0); - for (size_t i = 0; i < n; i++) { - free(b[i].source); - free(b[i].destination); + FOREACH_ARRAY(i, b, n) { + free(i->source); + free(i->destination); } free(b); @@ -2625,7 +2654,7 @@ int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) { *b = c; - c[(*n) ++] = (BindMount) { + c[(*n)++] = (BindMount) { .source = TAKE_PTR(s), .destination = TAKE_PTR(d), .read_only = item->read_only, @@ -2694,7 +2723,7 @@ int mount_image_add(MountImage **m, size_t *n, const MountImage *item) { *m = c; - c[(*n) ++] = (MountImage) { + c[(*n)++] = (MountImage) { .source = TAKE_PTR(s), .destination = TAKE_PTR(d), .mount_options = TAKE_PTR(options), @@ -2745,7 +2774,7 @@ int temporary_filesystem_add( *t = c; - c[(*n) ++] = (TemporaryFileSystem) { + c[(*n)++] = (TemporaryFileSystem) { .path = TAKE_PTR(p), .options = TAKE_PTR(o), }; diff --git a/src/core/path.c b/src/core/path.c index ef00c20..fdb6ca4 100644 --- a/src/core/path.c +++ b/src/core/path.c @@ -90,7 +90,7 @@ int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) { /* If this is a symlink watch both the symlink inode and where it points to. If the inode is * not a symlink both calls will install the same watch, which is redundant and doesn't * hurt. 
*/ - for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) { + for (int follow_symlink = 0; follow_symlink < 2; follow_symlink++) { uint32_t f = flags; SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink); @@ -249,6 +249,8 @@ static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_no static void path_spec_mkdir(PathSpec *s, mode_t mode) { int r; + assert(s); + if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB)) return; @@ -260,6 +262,10 @@ static void path_spec_mkdir(PathSpec *s, mode_t mode) { static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) { const char *type; + assert(s); + assert(f); + assert(prefix); + assert_se(type = path_type_to_string(s->type)); fprintf(f, "%s%s: %s\n", prefix, type, s->path); } @@ -272,9 +278,8 @@ void path_spec_done(PathSpec *s) { } static void path_init(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); - assert(u); assert(u->load_state == UNIT_STUB); p->directory_mode = 0755; @@ -295,9 +300,7 @@ void path_free_specs(Path *p) { } static void path_done(Unit *u) { - Path *p = PATH(u); - - assert(p); + Path *p = ASSERT_PTR(PATH(u)); p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source); path_free_specs(p); @@ -309,7 +312,7 @@ static int path_add_mount_dependencies(Path *p) { assert(p); LIST_FOREACH(spec, s, p->specs) { - r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES); if (r < 0) return r; } @@ -389,10 +392,9 @@ static int path_add_extras(Path *p) { } static int path_load(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); int r; - assert(u); assert(u->load_state == UNIT_STUB); r = unit_load_fragment_and_dropin(u, true); @@ -410,11 +412,11 @@ static int path_load(Unit *u) { } static void path_dump(Unit *u, FILE *f, const char *prefix) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); Unit *trigger; - assert(p); 
assert(f); + assert(prefix); trigger = UNIT_TRIGGER(u); @@ -461,6 +463,7 @@ static int path_watch(Path *p) { static void path_set_state(Path *p, PathState state) { PathState old_state; + assert(p); if (p->state != state) @@ -481,9 +484,8 @@ static void path_set_state(Path *p, PathState state) { static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify); static int path_coldplug(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); - assert(p); assert(p->state == PATH_DEAD); if (p->deserialized_state != p->state) { @@ -625,10 +627,9 @@ static void path_mkdir(Path *p) { } static int path_start(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); int r; - assert(p); assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED)); r = unit_test_trigger_loaded(u); @@ -648,9 +649,8 @@ static int path_start(Unit *u) { } static int path_stop(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); - assert(p); assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING)); path_enter_dead(p, PATH_SUCCESS); @@ -658,9 +658,8 @@ static int path_stop(Unit *u) { } static int path_serialize(Unit *u, FILE *f, FDSet *fds) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); - assert(u); assert(f); assert(fds); @@ -688,9 +687,8 @@ static int path_serialize(Unit *u, FILE *f, FDSet *fds) { } static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); - assert(u); assert(key); assert(value); assert(fds); @@ -755,28 +753,24 @@ static int path_deserialize_item(Unit *u, const char *key, const char *value, FD } static UnitActiveState path_active_state(Unit *u) { - assert(u); + Path *p = ASSERT_PTR(PATH(u)); - return state_translation_table[PATH(u)->state]; + return state_translation_table[p->state]; } static const char *path_sub_state_to_string(Unit *u) { - assert(u); + Path *p = ASSERT_PTR(PATH(u)); - return path_state_to_string(PATH(u)->state); + return 
path_state_to_string(p->state); } static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { - PathSpec *s = userdata, *found = NULL; - Path *p; + PathSpec *s = ASSERT_PTR(userdata), *found = NULL; + Path *p = ASSERT_PTR(PATH(s->unit)); int changed; - assert(s); - assert(s->unit); assert(fd >= 0); - p = PATH(s->unit); - if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING)) return 0; @@ -827,10 +821,9 @@ static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) { } static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); int r; - assert(u); assert(other); /* Invoked whenever the unit we trigger changes state or gains or loses a job */ @@ -897,9 +890,7 @@ static void path_trigger_notify(Unit *u, Unit *other) { } static void path_reset_failed(Unit *u) { - Path *p = PATH(u); - - assert(p); + Path *p = ASSERT_PTR(PATH(u)); if (p->state == PATH_FAILED) path_set_state(p, PATH_DEAD); @@ -908,11 +899,9 @@ static void path_reset_failed(Unit *u) { } static int path_can_start(Unit *u) { - Path *p = PATH(u); + Path *p = ASSERT_PTR(PATH(u)); int r; - assert(p); - r = unit_test_start_limit(u); if (r < 0) { path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT); @@ -961,13 +950,11 @@ static int activation_details_path_deserialize(const char *key, const char *valu } static int activation_details_path_append_env(ActivationDetails *details, char ***strv) { - ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details); + ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details)); char *s; int r; - assert(details); assert(strv); - assert(p); if (isempty(p->trigger_path_filename)) return 0; @@ -984,21 +971,15 @@ static int activation_details_path_append_env(ActivationDetails *details, char * } static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) { - ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details); + 
ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details)); int r; - assert(details); assert(strv); - assert(p); if (isempty(p->trigger_path_filename)) return 0; - r = strv_extend(strv, "trigger_path"); - if (r < 0) - return r; - - r = strv_extend(strv, p->trigger_path_filename); + r = strv_extend_many(strv, "trigger_path", p->trigger_path_filename); if (r < 0) return r; diff --git a/src/core/restrict-ifaces.c b/src/core/restrict-ifaces.c deleted file mode 100644 index 4dd8656..0000000 --- a/src/core/restrict-ifaces.c +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ - -#include "fd-util.h" -#include "restrict-ifaces.h" -#include "netlink-util.h" - -#if BPF_FRAMEWORK -/* libbpf, clang and llc compile time dependencies are satisfied */ - -#include "bpf-dlopen.h" -#include "bpf-link.h" -#include "bpf-util.h" -#include "bpf/restrict_ifaces/restrict-ifaces-skel.h" - -static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) { - restrict_ifaces_bpf__destroy(obj); - return NULL; -} - -DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free); - -static int prepare_restrict_ifaces_bpf( - Unit* u, - bool is_allow_list, - const Set *restrict_network_interfaces, - struct restrict_ifaces_bpf **ret_object) { - - _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; - _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; - char *iface; - int r, map_fd; - - assert(ret_object); - - obj = restrict_ifaces_bpf__open(); - if (!obj) - return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m"); - - r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u)); - if (r != 0) - return log_unit_full_errno(u, u ? 
LOG_ERR : LOG_WARNING, r, - "restrict-interfaces: Failed to resize BPF map '%s': %m", - sym_bpf_map__name(obj->maps.sd_restrictif)); - - obj->rodata->is_allow_list = is_allow_list; - - r = restrict_ifaces_bpf__load(obj); - if (r != 0) - return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m"); - - map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif); - - SET_FOREACH(iface, restrict_network_interfaces) { - uint8_t dummy = 0; - int ifindex; - - ifindex = rtnl_resolve_interface(&rtnl, iface); - if (ifindex < 0) { - log_unit_warning_errno(u, ifindex, - "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m", - iface); - continue; - } - - if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY)) - return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, - "restrict-interfaces: Failed to update BPF map '%s' fd: %m", - sym_bpf_map__name(obj->maps.sd_restrictif)); - } - - *ret_object = TAKE_PTR(obj); - return 0; -} - -int restrict_network_interfaces_supported(void) { - _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; - static int supported = -1; - int r; - - if (supported >= 0) - return supported; - - if (!cgroup_bpf_supported()) - return (supported = false); - - if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) { - log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported"); - return (supported = false); - } - - r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj); - if (r < 0) { - log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m"); - return (supported = false); - } - - return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i)); -} - -static int restrict_network_interfaces_install_impl(Unit *u) { - _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL; - _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; - 
_cleanup_free_ char *cgroup_path = NULL; - _cleanup_close_ int cgroup_fd = -EBADF; - CGroupContext *cc; - int r; - - cc = unit_get_cgroup_context(u); - if (!cc) - return 0; - - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); - if (r < 0) - return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m"); - - if (!cc->restrict_network_interfaces) - return 0; - - r = prepare_restrict_ifaces_bpf(u, - cc->restrict_network_interfaces_is_allow_list, - cc->restrict_network_interfaces, - &obj); - if (r < 0) - return r; - - cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0); - if (cgroup_fd < 0) - return -errno; - - ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd); - r = sym_libbpf_get_error(ingress_link); - if (r != 0) - return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m"); - - egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd); - r = sym_libbpf_get_error(egress_link); - if (r != 0) - return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m"); - - u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link); - u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link); - - return 0; -} - -int restrict_network_interfaces_install(Unit *u) { - int r = restrict_network_interfaces_install_impl(u); - fdset_close(u->initial_restric_ifaces_link_fds); - return r; -} - -int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) { - int r; - - assert(u); - - r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link); - if (r < 0) - return r; - - return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link); -} - -int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) { - int r; - - assert(u); - - if (!u->initial_restric_ifaces_link_fds) { - 
u->initial_restric_ifaces_link_fds = fdset_new(); - if (!u->initial_restric_ifaces_link_fds) - return log_oom(); - } - - r = fdset_put(u->initial_restric_ifaces_link_fds, fd); - if (r < 0) - return log_unit_error_errno(u, r, - "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd); - - return 0; -} - -#else /* ! BPF_FRAMEWORK */ -int restrict_network_interfaces_supported(void) { - return 0; -} - -int restrict_network_interfaces_install(Unit *u) { - return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), - "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m"); -} - -int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) { - return 0; -} - -int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) { - return 0; -} -#endif diff --git a/src/core/restrict-ifaces.h b/src/core/restrict-ifaces.h deleted file mode 100644 index 6e7a824..0000000 --- a/src/core/restrict-ifaces.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1-or-later */ -#pragma once - -#include "fdset.h" -#include "unit.h" - -typedef struct Unit Unit; - -int restrict_network_interfaces_supported(void); -int restrict_network_interfaces_install(Unit *u); - -int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds); - -/* Add BPF link fd created before daemon-reload or daemon-reexec. - * FDs will be closed at the end of restrict_network_interfaces_install. 
*/ -int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd); diff --git a/src/core/scope.c b/src/core/scope.c index 2841280..cfa2aeb 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -23,21 +23,20 @@ #include "user-util.h" static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { - [SCOPE_DEAD] = UNIT_INACTIVE, - [SCOPE_START_CHOWN] = UNIT_ACTIVATING, - [SCOPE_RUNNING] = UNIT_ACTIVE, - [SCOPE_ABANDONED] = UNIT_ACTIVE, + [SCOPE_DEAD] = UNIT_INACTIVE, + [SCOPE_START_CHOWN] = UNIT_ACTIVATING, + [SCOPE_RUNNING] = UNIT_ACTIVE, + [SCOPE_ABANDONED] = UNIT_ACTIVE, [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING, [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING, - [SCOPE_FAILED] = UNIT_FAILED, + [SCOPE_FAILED] = UNIT_FAILED, }; static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); static void scope_init(Unit *u) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); - assert(u); assert(u->load_state == UNIT_STUB); s->runtime_max_usec = USEC_INFINITY; @@ -48,9 +47,7 @@ static void scope_init(Unit *u) { } static void scope_done(Unit *u) { - Scope *s = SCOPE(u); - - assert(u); + Scope *s = ASSERT_PTR(SCOPE(u)); s->controller = mfree(s->controller); s->controller_track = sd_bus_track_unref(s->controller_track); @@ -84,6 +81,7 @@ static int scope_arm_timer(Scope *s, bool relative, usec_t usec) { static void scope_set_state(Scope *s, ScopeState state) { ScopeState old_state; + assert(s); if (s->state != state) @@ -101,7 +99,8 @@ static void scope_set_state(Scope *s, ScopeState state) { } if (state != old_state) - log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); + log_unit_debug(UNIT(s), "Changed %s -> %s", + scope_state_to_string(old_state), scope_state_to_string(state)); unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); } @@ -181,10 +180,9 @@ static int scope_add_extras(Scope *s) { } static 
int scope_load(Unit *u) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); int r; - assert(s); assert(u->load_state == UNIT_STUB); if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) @@ -227,10 +225,9 @@ static usec_t scope_coldplug_timeout(Scope *s) { } static int scope_coldplug(Unit *u) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); int r; - assert(s); assert(s->state == SCOPE_DEAD); if (s->deserialized_state == s->state) @@ -260,10 +257,10 @@ static int scope_coldplug(Unit *u) { } static void scope_dump(Unit *u, FILE *f, const char *prefix) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); - assert(s); assert(f); + assert(prefix); fprintf(f, "%sScope State: %s\n" @@ -277,7 +274,7 @@ static void scope_dump(Unit *u, FILE *f, const char *prefix) { prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC), prefix, oom_policy_to_string(s->oom_policy)); - cgroup_context_dump(UNIT(s), f, prefix); + cgroup_context_dump(u, f, prefix); kill_context_dump(&s->kill_context, f, prefix); } @@ -317,13 +314,9 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { else { r = unit_kill_context( UNIT(s), - &s->kill_context, state != SCOPE_STOP_SIGTERM ? KILL_KILL : s->was_abandoned ? 
KILL_TERMINATE_AND_LOG : - KILL_TERMINATE, - /* main_pid= */ NULL, - /* control_pid= */ NULL, - /* main_pid_alien= */ false); + KILL_TERMINATE); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); goto fail; @@ -350,13 +343,15 @@ fail: } static int scope_enter_start_chown(Scope *s) { + Unit *u = UNIT(ASSERT_PTR(s)); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - Unit *u = UNIT(s); int r; - assert(s); assert(s->user); + if (!s->cgroup_runtime) + return -EINVAL; + r = scope_arm_timer(s, /* relative= */ true, u->manager->defaults.timeout_start_usec); if (r < 0) return r; @@ -389,7 +384,7 @@ static int scope_enter_start_chown(Scope *s) { } } - r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid); + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, s->cgroup_runtime->cgroup_path, uid, gid); if (r < 0) { log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m"); _exit(EXIT_CGROUP); @@ -411,11 +406,9 @@ fail: } static int scope_enter_running(Scope *s) { - Unit *u = UNIT(s); + Unit *u = UNIT(ASSERT_PTR(s)); int r; - assert(s); - (void) bus_scope_track_controller(s); r = unit_acquire_invocation_id(u); @@ -458,9 +451,7 @@ fail: } static int scope_start(Unit *u) { - Scope *s = SCOPE(u); - - assert(s); + Scope *s = ASSERT_PTR(SCOPE(u)); if (unit_has_name(u, SPECIAL_INIT_SCOPE)) return -EPERM; @@ -489,9 +480,7 @@ static int scope_start(Unit *u) { } static int scope_stop(Unit *u) { - Scope *s = SCOPE(u); - - assert(s); + Scope *s = ASSERT_PTR(SCOPE(u)); if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) return 0; @@ -503,9 +492,7 @@ static int scope_stop(Unit *u) { } static void scope_reset_failed(Unit *u) { - Scope *s = SCOPE(u); - - assert(s); + Scope *s = ASSERT_PTR(SCOPE(u)); if (s->state == SCOPE_FAILED) scope_set_state(s, SCOPE_DEAD); @@ -514,7 +501,7 @@ static void scope_reset_failed(Unit *u) { } static int scope_get_timeout(Unit *u, usec_t *timeout) { - Scope *s = SCOPE(u); + Scope *s = 
ASSERT_PTR(SCOPE(u)); usec_t t; int r; @@ -532,10 +519,9 @@ static int scope_get_timeout(Unit *u, usec_t *timeout) { } static int scope_serialize(Unit *u, FILE *f, FDSet *fds) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); PidRef *pid; - assert(s); assert(f); assert(fds); @@ -552,10 +538,9 @@ static int scope_serialize(Unit *u, FILE *f, FDSet *fds) { } static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); int r; - assert(u); assert(key); assert(value); assert(fds); @@ -600,8 +585,7 @@ static int scope_deserialize_item(Unit *u, const char *key, const char *value, F } static void scope_notify_cgroup_empty_event(Unit *u) { - Scope *s = SCOPE(u); - assert(u); + Scope *s = ASSERT_PTR(SCOPE(u)); log_unit_debug(u, "cgroup is empty"); @@ -610,7 +594,7 @@ static void scope_notify_cgroup_empty_event(Unit *u) { } static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) { - Scope *s = SCOPE(u); + Scope *s = ASSERT_PTR(SCOPE(u)); if (managed_oom) log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd."); @@ -642,9 +626,7 @@ static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) { } static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { - Scope *s = SCOPE(u); - - assert(s); + Scope *s = ASSERT_PTR(SCOPE(u)); if (s->state == SCOPE_START_CHOWN) { if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL)) @@ -662,9 +644,8 @@ static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { } static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { - Scope *s = SCOPE(userdata); + Scope *s = ASSERT_PTR(SCOPE(userdata)); - assert(s); assert(s->timer_event_source == source); switch (s->state) { @@ -726,15 +707,15 @@ int scope_abandon(Scope *s) { } static UnitActiveState scope_active_state(Unit *u) { - assert(u); + Scope *s = ASSERT_PTR(SCOPE(u)); - return 
state_translation_table[SCOPE(u)->state]; + return state_translation_table[s->state]; } static const char *scope_sub_state_to_string(Unit *u) { - assert(u); + Scope *s = ASSERT_PTR(SCOPE(u)); - return scope_state_to_string(SCOPE(u)->state); + return scope_state_to_string(s->state); } static void scope_enumerate_perpetual(Manager *m) { @@ -782,6 +763,7 @@ const UnitVTable scope_vtable = { .object_size = sizeof(Scope), .cgroup_context_offset = offsetof(Scope, cgroup_context), .kill_context_offset = offsetof(Scope, kill_context), + .cgroup_runtime_offset = offsetof(Scope, cgroup_runtime), .sections = "Unit\0" @@ -806,8 +788,7 @@ const UnitVTable scope_vtable = { .start = scope_start, .stop = scope_stop, - .freeze = unit_freeze_vtable_common, - .thaw = unit_thaw_vtable_common, + .freezer_action = unit_cgroup_freezer_action, .get_timeout = scope_get_timeout, diff --git a/src/core/scope.h b/src/core/scope.h index c9574a3..1090431 100644 --- a/src/core/scope.h +++ b/src/core/scope.h @@ -21,6 +21,7 @@ struct Scope { CGroupContext cgroup_context; KillContext kill_context; + CGroupRuntime *cgroup_runtime; ScopeState state, deserialized_state; ScopeResult result; diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c index 62181a6..a67a520 100644 --- a/src/core/selinux-access.c +++ b/src/core/selinux-access.c @@ -193,7 +193,6 @@ int mac_selinux_access_check_internal( assert(message); assert(permission); assert(function); - assert(error); r = access_init(error); if (r <= 0) @@ -248,7 +247,7 @@ int mac_selinux_access_check_internal( tclass = "system"; } - sd_bus_creds_get_cmdline(creds, &cmdline); + (void) sd_bus_creds_get_cmdline(creds, &cmdline); cl = strv_join(cmdline, " "); struct audit_info audit_info = { @@ -268,7 +267,7 @@ int mac_selinux_access_check_internal( log_full_errno_zerook(LOG_DEBUG, r, "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m", - scon, acon, tclass, permission, enforce ? 
"enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl))); + scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), empty_to_na(cl)); return enforce ? r : 0; } diff --git a/src/core/service.c b/src/core/service.c index ffe92d2..8ec27c4 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -24,6 +24,7 @@ #include "fd-util.h" #include "fileio.h" #include "format-util.h" +#include "io-util.h" #include "load-dropin.h" #include "load-fragment.h" #include "log.h" @@ -34,6 +35,7 @@ #include "path-util.h" #include "process-util.h" #include "random-util.h" +#include "selinux-util.h" #include "serialize.h" #include "service.h" #include "signal-util.h" @@ -49,61 +51,61 @@ #define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__) static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = { - [SERVICE_DEAD] = UNIT_INACTIVE, - [SERVICE_CONDITION] = UNIT_ACTIVATING, - [SERVICE_START_PRE] = UNIT_ACTIVATING, - [SERVICE_START] = UNIT_ACTIVATING, - [SERVICE_START_POST] = UNIT_ACTIVATING, - [SERVICE_RUNNING] = UNIT_ACTIVE, - [SERVICE_EXITED] = UNIT_ACTIVE, - [SERVICE_RELOAD] = UNIT_RELOADING, - [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, - [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, - [SERVICE_STOP] = UNIT_DEACTIVATING, - [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, - [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, - [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, - [SERVICE_STOP_POST] = UNIT_DEACTIVATING, - [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, - [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, - [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, - [SERVICE_FAILED] = UNIT_FAILED, - [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVATING, + [SERVICE_START_PRE] = UNIT_ACTIVATING, + [SERVICE_START] = UNIT_ACTIVATING, + [SERVICE_START_POST] = UNIT_ACTIVATING, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + 
[SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, + [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, - [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, - [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, - [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, - [SERVICE_CLEANING] = UNIT_MAINTENANCE, + [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, }; /* For Type=idle we never want to delay any other jobs, hence we * consider idle jobs active as soon as we start working on them */ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = { - [SERVICE_DEAD] = UNIT_INACTIVE, - [SERVICE_CONDITION] = UNIT_ACTIVE, - [SERVICE_START_PRE] = UNIT_ACTIVE, - [SERVICE_START] = UNIT_ACTIVE, - [SERVICE_START_POST] = UNIT_ACTIVE, - [SERVICE_RUNNING] = UNIT_ACTIVE, - [SERVICE_EXITED] = UNIT_ACTIVE, - [SERVICE_RELOAD] = UNIT_RELOADING, - [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, - [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, - [SERVICE_STOP] = UNIT_DEACTIVATING, - [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, - [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, - [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, - [SERVICE_STOP_POST] = UNIT_DEACTIVATING, - [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, - [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, - [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, - [SERVICE_FAILED] = UNIT_FAILED, - 
[SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, + [SERVICE_DEAD] = UNIT_INACTIVE, + [SERVICE_CONDITION] = UNIT_ACTIVE, + [SERVICE_START_PRE] = UNIT_ACTIVE, + [SERVICE_START] = UNIT_ACTIVE, + [SERVICE_START_POST] = UNIT_ACTIVE, + [SERVICE_RUNNING] = UNIT_ACTIVE, + [SERVICE_EXITED] = UNIT_ACTIVE, + [SERVICE_RELOAD] = UNIT_RELOADING, + [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING, + [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING, + [SERVICE_STOP] = UNIT_DEACTIVATING, + [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_STOP_POST] = UNIT_DEACTIVATING, + [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SERVICE_FAILED] = UNIT_FAILED, + [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE, [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED, - [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, - [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, - [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, - [SERVICE_CLEANING] = UNIT_MAINTENANCE, + [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE, + [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING, + [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING, + [SERVICE_CLEANING] = UNIT_MAINTENANCE, }; static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); @@ -114,6 +116,25 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev static void service_enter_signal(Service *s, ServiceState state, ServiceResult f); static void service_enter_reload_by_notify(Service *s); +static bool SERVICE_STATE_WITH_MAIN_PROCESS(ServiceState state) { + return IN_SET(state, + SERVICE_START, SERVICE_START_POST, + SERVICE_RUNNING, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, 
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL); +} + +static bool SERVICE_STATE_WITH_CONTROL_PROCESS(ServiceState state) { + return IN_SET(state, + SERVICE_CONDITION, + SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, + SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, + SERVICE_CLEANING); +} + static void service_init(Unit *u) { Service *s = SERVICE(u); @@ -151,25 +172,17 @@ static void service_init(Unit *u) { static void service_unwatch_control_pid(Service *s) { assert(s); - - if (!pidref_is_set(&s->control_pid)) - return; - - unit_unwatch_pidref(UNIT(s), &s->control_pid); - pidref_done(&s->control_pid); + unit_unwatch_pidref_done(UNIT(s), &s->control_pid); } static void service_unwatch_main_pid(Service *s) { assert(s); - - if (!pidref_is_set(&s->main_pid)) - return; - - unit_unwatch_pidref(UNIT(s), &s->main_pid); - pidref_done(&s->main_pid); + unit_unwatch_pidref_done(UNIT(s), &s->main_pid); } static void service_unwatch_pid_file(Service *s) { + assert(s); + if (!s->pid_file_pathspec) return; @@ -179,42 +192,41 @@ static void service_unwatch_pid_file(Service *s) { s->pid_file_pathspec = mfree(s->pid_file_pathspec); } -static int service_set_main_pidref(Service *s, PidRef *pidref) { +static int service_set_main_pidref(Service *s, PidRef pidref_consume, const dual_timestamp *start_timestamp) { + _cleanup_(pidref_done) PidRef pidref = pidref_consume; int r; assert(s); - /* Takes ownership of the specified pidref on success, but not on failure. */ + /* Takes ownership of the specified pidref on both success and failure. 
*/ - if (!pidref_is_set(pidref)) + if (!pidref_is_set(&pidref)) return -ESRCH; - if (pidref->pid <= 1) + if (pidref.pid <= 1) return -EINVAL; - if (pidref_is_self(pidref)) + if (pidref_is_self(&pidref)) return -EINVAL; - if (pidref_equal(&s->main_pid, pidref) && s->main_pid_known) { - pidref_done(pidref); + if (s->main_pid_known && pidref_equal(&s->main_pid, &pidref)) return 0; - } - if (!pidref_equal(&s->main_pid, pidref)) { + if (!pidref_equal(&s->main_pid, &pidref)) { service_unwatch_main_pid(s); - exec_status_start(&s->main_exec_status, pidref->pid); + exec_status_start(&s->main_exec_status, pidref.pid, start_timestamp); } - s->main_pid = TAKE_PIDREF(*pidref); + s->main_pid = TAKE_PIDREF(pidref); s->main_pid_known = true; r = pidref_is_my_child(&s->main_pid); if (r < 0) log_unit_warning_errno(UNIT(s), r, "Can't determine if process "PID_FMT" is our child, assuming it is not: %m", s->main_pid.pid); - else if (r == 0) + else if (r == 0) // FIXME: Supervise through pidfd here log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", s->main_pid.pid); - s->main_pid_alien = r <= 0; + return 0; } @@ -290,7 +302,7 @@ static void service_start_watchdog(Service *s) { /* Let's process everything else which might be a sign * of living before we consider a service died. 
*/ - r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE); + r = sd_event_source_set_priority(s->watchdog_event_source, EVENT_PRIORITY_SERVICE_WATCHDOG); } if (r < 0) log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m"); @@ -429,7 +441,7 @@ static void service_release_fd_store(Service *s) { static void service_release_stdio_fd(Service *s) { assert(s); - if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stdout_fd < 0) + if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0) return; log_unit_debug(UNIT(s), "Releasing stdin/stdout/stderr file descriptors."); @@ -438,10 +450,9 @@ static void service_release_stdio_fd(Service *s) { s->stdout_fd = asynchronous_close(s->stdout_fd); s->stderr_fd = asynchronous_close(s->stderr_fd); } -static void service_done(Unit *u) { - Service *s = SERVICE(u); - assert(s); +static void service_done(Unit *u) { + Service *s = ASSERT_PTR(SERVICE(u)); open_file_free_many(&s->open_files); @@ -449,6 +460,7 @@ static void service_done(Unit *u) { s->status_text = mfree(s->status_text); s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX); s->control_command = NULL; s->main_command = NULL; @@ -511,7 +523,8 @@ static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do if (fstat(fd, &st) < 0) return -errno; - log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino); + log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, + DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino); if (s->n_fd_store >= s->n_fd_store_max) /* Our store is full. 
Use this errno rather than E[NM]FILE to distinguish from the case @@ -545,17 +558,16 @@ static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fs->fd, 0, on_fd_store_io, fs); if (r < 0 && r != -EPERM) /* EPERM indicates fds that aren't pollable, which is OK */ return r; - else if (r >= 0) + if (r >= 0) (void) sd_event_source_set_description(fs->event_source, "service-fd-store"); } + log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname); + fs->service = s; - LIST_PREPEND(fd_store, s->fd_store, fs); + LIST_PREPEND(fd_store, s->fd_store, TAKE_PTR(fs)); s->n_fd_store++; - log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname); - - TAKE_PTR(fs); return 1; /* fd newly stored */ } @@ -654,9 +666,6 @@ static int service_verify(Service *s) { if (s->type == SERVICE_ONESHOT && IN_SET(s->restart, SERVICE_RESTART_ALWAYS, SERVICE_RESTART_ON_SUCCESS)) return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing."); - if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status)) - return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing."); - if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP) return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. 
Refusing."); @@ -856,7 +865,7 @@ static int service_add_extras(Service *s) { } static int service_load(Unit *u) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); int r; r = unit_load_fragment_and_dropin(u, true); @@ -901,21 +910,19 @@ static void service_dump_fdstore(Service *s, FILE *f, const char *prefix) { "%s%s '%s' (type=%s; dev=" DEVNUM_FORMAT_STR "; inode=%" PRIu64 "; rdev=" DEVNUM_FORMAT_STR "; path=%s; access=%s)\n", prefix, i == s->fd_store ? "File Descriptor Store Entry:" : " ", i->fdname, - inode_type_to_string(st.st_mode), + strna(inode_type_to_string(st.st_mode)), DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino, DEVNUM_FORMAT_VAL(st.st_rdev), strna(path), - accmode_to_string(flags)); + strna(accmode_to_string(flags))); } } static void service_dump(Unit *u, FILE *f, const char *prefix) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); const char *prefix2; - assert(s); - prefix = strempty(prefix); prefix2 = strjoina(prefix, "\t"); @@ -1016,8 +1023,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) { if (!s->exec_command[c]) continue; - fprintf(f, "%s-> %s:\n", - prefix, service_exec_command_to_string(c)); + fprintf(f, "%s%s %s:\n", + prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), service_exec_command_to_string(c)); exec_command_dump_list(s->exec_command[c], f, prefix2); } @@ -1159,7 +1166,7 @@ static int service_load_pid_file(Service *s, bool may_warn) { } else log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pidref.pid); - r = service_set_main_pidref(s, &pidref); + r = service_set_main_pidref(s, TAKE_PIDREF(pidref), /* start_timestamp = */ NULL); if (r < 0) return r; @@ -1189,7 +1196,7 @@ static void service_search_main_pid(Service *s) { return; log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid.pid); - if (service_set_main_pidref(s, &pid) < 0) + if (service_set_main_pidref(s, TAKE_PIDREF(pid), /* start_timestamp = */ NULL) < 0) return; r = unit_watch_pidref(UNIT(s), &s->main_pid, /* 
exclusive= */ false); @@ -1224,22 +1231,12 @@ static void service_set_state(Service *s, ServiceState state) { SERVICE_CLEANING)) s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); - if (!IN_SET(state, - SERVICE_START, SERVICE_START_POST, - SERVICE_RUNNING, - SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, - SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, - SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { + if (!SERVICE_STATE_WITH_MAIN_PROCESS(state)) { service_unwatch_main_pid(s); s->main_command = NULL; } - if (!IN_SET(state, - SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, - SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, - SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, - SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, - SERVICE_CLEANING)) { + if (!SERVICE_STATE_WITH_CONTROL_PROCESS(state)) { service_unwatch_control_pid(s); s->control_command = NULL; s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; @@ -1326,12 +1323,7 @@ static int service_coldplug(Unit *u) { if (pidref_is_set(&s->main_pid) && pidref_is_unwaited(&s->main_pid) > 0 && - (IN_SET(s->deserialized_state, - SERVICE_START, SERVICE_START_POST, - SERVICE_RUNNING, - SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, - SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, - SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) { + SERVICE_STATE_WITH_MAIN_PROCESS(s->deserialized_state)) { r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); if (r < 0) return r; @@ -1339,12 +1331,7 @@ static int service_coldplug(Unit *u) { if (pidref_is_set(&s->control_pid) && pidref_is_unwaited(&s->control_pid) > 0 && - IN_SET(s->deserialized_state, - SERVICE_CONDITION, SERVICE_START_PRE, 
SERVICE_START, SERVICE_START_POST, - SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, - SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, - SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, - SERVICE_CLEANING)) { + SERVICE_STATE_WITH_CONTROL_PROCESS(s->deserialized_state)) { r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false); if (r < 0) return r; @@ -1357,6 +1344,7 @@ static int service_coldplug(Unit *u) { SERVICE_DEAD_RESOURCES_PINNED)) { (void) unit_enqueue_rewatch_pids(u); (void) unit_setup_exec_runtime(u); + (void) unit_setup_cgroup_runtime(u); } if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) @@ -1418,13 +1406,12 @@ static int service_collect_fds( UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_TRIGGERED_BY) { _cleanup_free_ int *cfds = NULL; - Socket *sock; int cn_fds; - - if (u->type != UNIT_SOCKET) - continue; + Socket *sock; sock = SOCKET(u); + if (!sock) + continue; cn_fds = socket_collect_fds(sock, &cfds); if (cn_fds < 0) @@ -1436,18 +1423,8 @@ static int service_collect_fds( if (!rfds) { rfds = TAKE_PTR(cfds); rn_socket_fds = cn_fds; - } else { - int *t; - - t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int)); - if (!t) - return -ENOMEM; - - memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int)); - - rfds = t; - rn_socket_fds += cn_fds; - } + } else if (!GREEDY_REALLOC_APPEND(rfds, rn_socket_fds, cfds, cn_fds)) + return -ENOMEM; r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds); if (r < 0) @@ -1510,9 +1487,10 @@ static int service_allocate_exec_fd_event_source( if (r < 0) return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); - /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + /* This is a bit higher priority than SIGCHLD, to make sure we don't confuse the case 
"failed to + * start" from the case "succeeded to start, but failed immediately after". */ - r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + r = sd_event_source_set_priority(source, EVENT_PRIORITY_EXEC_FD); if (r < 0) return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); @@ -1602,12 +1580,52 @@ static Service *service_get_triggering_service(Service *s) { return NULL; } +static ExecFlags service_exec_flags(ServiceExecCommand command_id, ExecFlags cred_flag) { + /* All service main/control processes honor sandboxing and namespacing options (except those + explicitly excluded in service_spawn()) */ + ExecFlags flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT; + + assert(command_id >= 0); + assert(command_id < _SERVICE_EXEC_COMMAND_MAX); + assert((cred_flag & ~(EXEC_SETUP_CREDENTIALS_FRESH|EXEC_SETUP_CREDENTIALS)) == 0); + assert((cred_flag != 0) == (command_id == SERVICE_EXEC_START)); + + /* Control processes spawned before main process also get tty access */ + if (IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START)) + flags |= EXEC_APPLY_TTY_STDIN; + + /* All start phases get access to credentials. ExecStartPre= gets a new credential store upon + * every invocation, so that updating credential files through it works. When the first main process + * starts, passed creds become stable. Also see 'cred_flag'. 
*/ + if (command_id == SERVICE_EXEC_START_PRE) + flags |= EXEC_SETUP_CREDENTIALS_FRESH; + if (command_id == SERVICE_EXEC_START_POST) + flags |= EXEC_SETUP_CREDENTIALS; + + if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START)) + flags |= EXEC_SETENV_MONITOR_RESULT; + + if (command_id == SERVICE_EXEC_START) + return flags|cred_flag|EXEC_PASS_FDS|EXEC_SET_WATCHDOG; + + flags |= EXEC_IS_CONTROL; + + /* Put control processes spawned later than main process under .control sub-cgroup if appropriate */ + if (!IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE)) + flags |= EXEC_CONTROL_CGROUP; + + if (IN_SET(command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST)) + flags |= EXEC_SETENV_RESULT; + + return flags; +} + static int service_spawn_internal( const char *caller, Service *s, ExecCommand *c, - usec_t timeout, ExecFlags flags, + usec_t timeout, PidRef *ret_pid) { _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags); @@ -1615,7 +1633,6 @@ static int service_spawn_internal( _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL; _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; size_t n_env = 0; - pid_t pid; int r; assert(caller); @@ -1631,7 +1648,7 @@ static int service_spawn_internal( assert(!s->exec_fd_event_source); - if (flags & EXEC_IS_CONTROL) { + if (FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL)) { /* If this is a control process, mask the permissions/chroot application if this is requested. 
*/ if (s->permissions_start_only) exec_params.flags &= ~EXEC_APPLY_SANDBOXING; @@ -1639,7 +1656,7 @@ static int service_spawn_internal( exec_params.flags &= ~EXEC_APPLY_CHROOT; } - if ((flags & EXEC_PASS_FDS) || + if (FLAGS_SET(exec_params.flags, EXEC_PASS_FDS) || s->exec_context.std_input == EXEC_INPUT_SOCKET || s->exec_context.std_output == EXEC_OUTPUT_SOCKET || s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { @@ -1654,10 +1671,12 @@ static int service_spawn_internal( exec_params.open_files = s->open_files; + exec_params.flags |= EXEC_PASS_FDS; + log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); } - if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); if (r < 0) return r; @@ -1671,7 +1690,7 @@ static int service_spawn_internal( if (!our_env) return -ENOMEM; - if (service_exec_needs_notify_socket(s, flags)) { + if (service_exec_needs_notify_socket(s, exec_params.flags)) { if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0) return -ENOMEM; @@ -1730,10 +1749,10 @@ static int service_spawn_internal( Service *env_source = NULL; const char *monitor_prefix; - if (flags & EXEC_SETENV_RESULT) { + if (FLAGS_SET(exec_params.flags, EXEC_SETENV_RESULT)) { env_source = s; monitor_prefix = ""; - } else if (flags & EXEC_SETENV_MONITOR_RESULT) { + } else if (FLAGS_SET(exec_params.flags, EXEC_SETENV_MONITOR_RESULT)) { env_source = service_get_triggering_service(s); monitor_prefix = "MONITOR_"; } @@ -1751,18 +1770,15 @@ static int service_spawn_internal( r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status); else r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status)); - if (r < 0) return -ENOMEM; } 
if (env_source != s) { - if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) { - r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR, - monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id)); - if (r < 0) + if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) + if (asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR, + monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id)) < 0) return -ENOMEM; - } if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0) return -ENOMEM; @@ -1806,17 +1822,13 @@ static int service_spawn_internal( &exec_params, s->exec_runtime, &s->cgroup_context, - &pid); + &pidref); if (r < 0) return r; s->exec_fd_event_source = TAKE_PTR(exec_fd_source); s->exec_fd_hot = false; - r = pidref_set_pid(&pidref, pid); - if (r < 0) - return r; - r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true); if (r < 0) return r; @@ -1864,10 +1876,10 @@ static int cgroup_good(Service *s) { /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't * figure it out */ - if (!UNIT(s)->cgroup_path) + if (!s->cgroup_runtime || !s->cgroup_runtime->cgroup_path) return 0; - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path); + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, s->cgroup_runtime->cgroup_path); if (r < 0) return r; @@ -1876,6 +1888,7 @@ static int cgroup_good(Service *s) { static bool service_shall_restart(Service *s, const char **reason) { assert(s); + assert(reason); /* Don't restart after manual stops */ if (s->forbid_restart) { @@ -1891,6 +1904,13 @@ static bool service_shall_restart(Service *s, const char **reason) { /* Restart if the exit code/status are configured as restart triggers */ if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) { + /* Don't allow Type=oneshot services to restart on success. 
Note that Restart=always/on-success + * is already rejected in service_verify. */ + if (s->type == SERVICE_ONESHOT && s->result == SERVICE_SUCCESS) { + *reason = "service type and exit status"; + return false; + } + *reason = "forced by exit status"; return true; } @@ -1962,7 +1982,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) } else if (s->result == SERVICE_SKIP_CONDITION) { unit_log_skip(UNIT(s), service_result_to_string(s->result)); end_state = service_determine_dead_state(s); - restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART; + restart_state = _SERVICE_STATE_INVALID; /* Never restart if skipped due to condition failure */ } else { unit_log_failure(UNIT(s), service_result_to_string(s->result)); end_state = SERVICE_FAILED; @@ -1984,8 +2004,10 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) if (allow_restart) { usec_t restart_usec_next; + assert(restart_state >= 0 && restart_state < _SERVICE_STATE_MAX); + /* We make two state changes here: one that maps to the high-level UNIT_INACTIVE/UNIT_FAILED - * state (i.e. a state indicating deactivation), and then one that that maps to the + * state (i.e. a state indicating deactivation), and then one that maps to the * high-level UNIT_STARTING state (i.e. a state indicating activation). We do this so that * external software can watch the state changes and see all service failures, even if they * are only transitionary and followed by an automatic restart. 
We have fine-grained @@ -1999,8 +2021,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) r = service_arm_timer(s, /* relative= */ true, restart_usec_next); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to install restart timer: %m"); - service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false); - return; + return service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false); } log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0)); @@ -2064,8 +2085,8 @@ static void service_enter_stop_post(Service *s, ServiceResult f) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_stop_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m"); @@ -2118,13 +2139,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f (void) unit_enqueue_rewatch_pids(UNIT(s)); kill_operation = state_to_kill_operation(s, state); - r = unit_kill_context( - UNIT(s), - &s->kill_context, - kill_operation, - &s->main_pid, - &s->control_pid, - s->main_pid_alien); + r = unit_kill_context(UNIT(s), kill_operation); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); goto fail; @@ -2193,8 +2208,8 @@ static void service_enter_stop(Service *s, ServiceResult f) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_stop_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP, &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop' task: %m"); @@ -2209,6 +2224,7 @@ static void service_enter_stop(Service *s, ServiceResult f) { static bool 
service_good(Service *s) { int main_pid_ok; + assert(s); if (s->type == SERVICE_DBUS && !s->bus_name_good) @@ -2265,6 +2281,7 @@ static void service_enter_running(Service *s, ServiceResult f) { static void service_enter_start_post(Service *s) { int r; + assert(s); service_unwatch_control_pid(s); @@ -2277,8 +2294,8 @@ static void service_enter_start_post(Service *s) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_start_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m"); @@ -2387,43 +2404,44 @@ static void service_enter_start(Service *s) { r = service_spawn(s, c, + service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS_FRESH), timeout, - EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT, &pidref); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start' task: %m"); goto fail; } - if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) { - /* For simple services we immediately start - * the START_POST binaries. */ + assert(pidref.pid == c->exec_status.pid); - (void) service_set_main_pidref(s, &pidref); - service_enter_start_post(s); - - } else if (s->type == SERVICE_FORKING) { + switch (s->type) { - /* For forking services we wait until the start - * process exited. */ + case SERVICE_SIMPLE: + case SERVICE_IDLE: + /* For simple services we immediately start the START_POST binaries. */ + (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &c->exec_status.start_timestamp); + return service_enter_start_post(s); + case SERVICE_FORKING: + /* For forking services we wait until the start process exited. 
*/ pidref_done(&s->control_pid); s->control_pid = TAKE_PIDREF(pidref); - service_set_state(s, SERVICE_START); - - } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC)) { + return service_set_state(s, SERVICE_START); + + case SERVICE_ONESHOT: /* For oneshot services we wait until the start process exited, too, but it is our main process. */ + case SERVICE_EXEC: + case SERVICE_DBUS: + case SERVICE_NOTIFY: + case SERVICE_NOTIFY_RELOAD: + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear + * on the bus. 'notify' and 'exec' services wait for readiness notification and EOF + * on exec_fd, respectively. */ + (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &c->exec_status.start_timestamp); + return service_set_state(s, SERVICE_START); - /* For oneshot services we wait until the start process exited, too, but it is our main process. */ - - /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the - * bus. 'notify' and 'exec' services are similar. 
*/ - - (void) service_set_main_pidref(s, &pidref); - service_set_state(s, SERVICE_START); - } else + default: assert_not_reached(); - - return; + } fail: service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES); @@ -2447,8 +2465,8 @@ static void service_enter_start_pre(Service *s) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_start_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS, &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m"); @@ -2484,10 +2502,9 @@ static void service_enter_condition(Service *s) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_start_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, &s->control_pid); - if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'exec-condition' task: %m"); goto fail; @@ -2527,11 +2544,9 @@ static void service_enter_restart(Service *s) { /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't * fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset * the counter explicitly however via the usual "systemctl reset-failure" logic. 
*/ - s->n_restarts ++; + s->n_restarts++; s->flush_n_restarts = false; - s->notify_access_override = _NOTIFY_ACCESS_INVALID; - log_unit_struct(UNIT(s), LOG_INFO, "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR, LOG_UNIT_INVOCATION_ID(UNIT(s)), @@ -2595,8 +2610,8 @@ static void service_enter_reload(Service *s) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), s->timeout_start_usec, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP, &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'reload' task: %m"); @@ -2651,13 +2666,8 @@ static void service_run_next_control(Service *s) { r = service_spawn(s, s->control_command, + service_exec_flags(s->control_command_id, /* cred_flag = */ 0), timeout, - EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL| - (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD) ? EXEC_WRITE_CREDENTIALS : 0)| - (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)| - (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)| - (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)| - (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? 
EXEC_CONTROL_CGROUP : 0), &s->control_pid); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn next control task: %m"); @@ -2688,8 +2698,8 @@ static void service_run_next_main(Service *s) { r = service_spawn(s, s->main_command, + service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS), s->timeout_start_usec, - EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS, &pidref); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to spawn next main task: %m"); @@ -2697,7 +2707,7 @@ static void service_run_next_main(Service *s) { return; } - (void) service_set_main_pidref(s, &pidref); + (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), &s->main_command->exec_status.start_timestamp); } static int service_start(Unit *u) { @@ -2755,16 +2765,16 @@ static int service_start(Unit *u) { s->flush_n_restarts = false; } - u->reset_accounting = true; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt) + crt->reset_accounting = true; service_enter_condition(s); return 1; } static int service_stop(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); /* Don't create restart jobs from manual stops. 
*/ s->forbid_restart = true; @@ -2821,9 +2831,7 @@ static int service_stop(Unit *u) { } static int service_reload(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED)); @@ -2832,9 +2840,7 @@ static int service_reload(Unit *u) { } static bool service_can_reload(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); return s->exec_command[SERVICE_EXEC_RELOAD] || s->type == SERVICE_NOTIFY_RELOAD; @@ -2858,14 +2864,13 @@ static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const } static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) { + Service *s = ASSERT_PTR(SERVICE(u)); _cleanup_free_ char *args = NULL, *p = NULL; - Service *s = SERVICE(u); const char *type, *key; ServiceExecCommand id; size_t length = 0; unsigned idx; - assert(s); assert(f); if (!command) @@ -2927,10 +2932,9 @@ static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *c } static int service_serialize(Unit *u, FILE *f, FDSet *fds) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); int r; - assert(u); assert(f); assert(fds); @@ -2996,13 +3000,14 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { if (!c) return log_oom(); - (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll); + (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %s", copy, c, one_zero(fs->do_poll)); } if (s->main_exec_status.pid > 0) { (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-handoff", &s->main_exec_status.handoff_timestamp); if 
(dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); @@ -3033,14 +3038,14 @@ int service_deserialize_exec_command( const char *key, const char *value) { - Service *s = SERVICE(u); - int r; - unsigned idx = 0, i; - bool control, found = false, last = false; - ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID; + Service *s = ASSERT_PTR(SERVICE(u)); ExecCommand *command = NULL; + ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID; _cleanup_free_ char *path = NULL; _cleanup_strv_free_ char **argv = NULL; + unsigned idx = 0, i; + bool control, found = false, last = false; + int r; enum ExecCommandState { STATE_EXEC_COMMAND_TYPE, @@ -3051,7 +3056,6 @@ int service_deserialize_exec_command( _STATE_EXEC_COMMAND_INVALID = -EINVAL, } state; - assert(s); assert(key); assert(value); @@ -3096,7 +3100,7 @@ int service_deserialize_exec_command( case STATE_EXEC_COMMAND_ARGS: r = strv_extend(&argv, arg); if (r < 0) - return -ENOMEM; + return r; break; default: assert_not_reached(); @@ -3139,10 +3143,9 @@ int service_deserialize_exec_command( } static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); int r; - assert(u); assert(key); assert(value); assert(fds); @@ -3179,10 +3182,10 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, (void) deserialize_pidref(fds, value, &s->control_pid); } else if (streq(key, "main-pid")) { - _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + PidRef pidref; if (!pidref_is_set(&s->main_pid) && deserialize_pidref(fds, value, &pidref) >= 0) - (void) service_set_main_pidref(s, &pidref); + (void) service_set_main_pidref(s, pidref, /* start_timestamp = */ NULL); } else if (streq(key, "main-pid-known")) { int b; @@ -3239,9 +3242,9 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, 
_cleanup_close_ int fd = -EBADF; int do_poll; - r = extract_first_word(&value, &fdv, NULL, 0); - if (r <= 0) { - log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value); + r = extract_many_words(&value, " ", EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE, &fdv, &fdn, &fdp); + if (r < 2 || r > 3) { + log_unit_debug(u, "Failed to deserialize fd-store-fd, ignoring: %s", value); return 0; } @@ -3249,24 +3252,17 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, if (fd < 0) return 0; - r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE); - if (r <= 0) { - log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value); - return 0; - } - - r = extract_first_word(&value, &fdp, NULL, 0); - if (r == 0) { - /* If the value is not present, we assume the default */ - do_poll = 1; - } else if (r < 0 || (r = safe_atoi(fdp, &do_poll)) < 0) { - log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\", ignoring: %m", value); + do_poll = r == 3 ? 
parse_boolean(fdp) : true; + if (do_poll < 0) { + log_unit_debug_errno(u, do_poll, + "Failed to deserialize fd-store-fd do_poll, ignoring: %s", fdp); return 0; } r = service_add_fd_store(s, fd, fdn, do_poll); if (r < 0) { - log_unit_debug_errno(u, r, "Failed to store deserialized fd %i, ignoring: %m", fd); + log_unit_debug_errno(u, r, + "Failed to store deserialized fd '%s', ignoring: %m", fdn); return 0; } @@ -3296,6 +3292,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); else if (streq(key, "main-exec-status-exit")) deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); + else if (streq(key, "main-exec-status-handoff")) + deserialize_dual_timestamp(value, &s->main_exec_status.handoff_timestamp); else if (streq(key, "notify-access-override")) { NotifyAccess notify_access; @@ -3383,13 +3381,12 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, } static UnitActiveState service_active_state(Unit *u) { + Service *s = ASSERT_PTR(SERVICE(u)); const UnitActiveState *table; - assert(u); - - table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; + table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table; - return table[SERVICE(u)->state]; + return table[s->state]; } static const char *service_sub_state_to_string(Unit *u) { @@ -3399,9 +3396,7 @@ static const char *service_sub_state_to_string(Unit *u) { } static bool service_may_gc(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); /* Never clean up services that still have a process around, even if the service is formally dead. 
Note that * unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they @@ -3422,6 +3417,7 @@ static bool service_may_gc(Unit *u) { static int service_retry_pid_file(Service *s) { int r; + assert(s); assert(s->pid_file); assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); @@ -3438,6 +3434,8 @@ static int service_retry_pid_file(Service *s) { static int service_watch_pid_file(Service *s) { int r; + assert(s); + log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); @@ -3457,6 +3455,7 @@ static int service_watch_pid_file(Service *s) { static int service_demand_pid_file(Service *s) { _cleanup_free_ PathSpec *ps = NULL; + assert(s); assert(s->pid_file); assert(!s->pid_file_pathspec); @@ -3485,11 +3484,8 @@ static int service_demand_pid_file(Service *s) { static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { PathSpec *p = ASSERT_PTR(userdata); - Service *s; + Service *s = ASSERT_PTR(SERVICE(p->unit)); - s = SERVICE(p->unit); - - assert(s); assert(fd >= 0); assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST)); assert(s->pid_file_pathspec); @@ -3515,20 +3511,19 @@ fail: } static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { - Service *s = SERVICE(userdata); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(userdata)); log_unit_debug(UNIT(s), "got exec-fd event"); /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() - * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically - * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the - * parent. 
We need to be careful however, as there are other reasons that we might cause the child's side of - * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless - * the child signalled us first that it is about to call the execve(). It does so by sending us a simple - * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it - * sends a zero byte we'll ignore POLLHUP on the fd again. */ + * successfully for it. We implement this through a pipe() towards the child, which the kernel + * automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on + * the pipe in the parent. We need to be careful however, as there are other reasons that we might + * cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that + * we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the + * execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child + * with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on + * the fd again. */ for (;;) { uint8_t x; @@ -3541,8 +3536,7 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); } - if (n == 0) { /* EOF → the event we are waiting for */ - + if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? 
*/ @@ -3561,16 +3555,13 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev /* A byte was read → this turns on/off the exec fd logic */ assert(n == sizeof(x)); + s->exec_fd_hot = x; } - - return 0; } static void service_notify_cgroup_empty_event(Unit *u) { - Service *s = SERVICE(u); - - assert(u); + Service *s = ASSERT_PTR(SERVICE(u)); log_unit_debug(u, "Control group is empty."); @@ -3647,7 +3638,7 @@ static void service_notify_cgroup_empty_event(Unit *u) { } static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); if (managed_oom) log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd."); @@ -3702,12 +3693,12 @@ static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) { } static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { + Service *s = ASSERT_PTR(SERVICE(u)); bool notify_dbus = true; - Service *s = SERVICE(u); ServiceResult f; ExitClean clean_mode; + int r; - assert(s); assert(pid >= 0); /* Oneshot services and non-SERVICE_EXEC_START commands should not be @@ -3918,7 +3909,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { s->control_command->command_next && f == SERVICE_SUCCESS) { - /* There is another command to * execute, so let's do that. */ + /* There is another command to execute, so let's do that. */ log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state)); service_run_next_control(s); @@ -3959,7 +3950,6 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { if (s->pid_file) { bool has_start_post; - int r; /* Let's try to load the pid file here if we can. 
* The PID file might actually be created by a START_POST @@ -3986,8 +3976,6 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { } if (s->pid_file) { - int r; - r = service_load_pid_file(s, true); if (r < 0) { r = service_demand_pid_file(s); @@ -4076,9 +4064,8 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { } static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { - Service *s = SERVICE(userdata); + Service *s = ASSERT_PTR(SERVICE(userdata)); - assert(s); assert(source == s->timer_event_source); switch (s->state) { @@ -4275,10 +4262,9 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us } static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) { - Service *s = SERVICE(userdata); + Service *s = ASSERT_PTR(SERVICE(userdata)); usec_t watchdog_usec; - assert(s); assert(source == s->watchdog_event_source); watchdog_usec = service_get_watchdog_usec(s); @@ -4295,35 +4281,49 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void return 0; } -static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) { +static void service_force_watchdog(Service *s) { assert(s); + if (!UNIT(s)->manager->service_watchdogs) + return; + + log_unit_error(UNIT(s), "Watchdog request (last status: %s)!", + s->status_text ?: ""); + + service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); +} + +static bool service_notify_message_authorized(Service *s, pid_t pid) { + assert(s); + assert(pid_is_valid(pid)); + NotifyAccess notify_access = service_get_notify_access(s); if (notify_access == NOTIFY_NONE) { - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid); + /* Warn level only if no notifications are expected */ + log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled", pid); 
return false; } if (notify_access == NOTIFY_MAIN && pid != s->main_pid.pid) { if (pidref_is_set(&s->main_pid)) - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); else - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid); return false; } if (notify_access == NOTIFY_EXEC && pid != s->main_pid.pid && pid != s->control_pid.pid) { if (pidref_is_set(&s->main_pid) && pidref_is_set(&s->control_pid)) - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT, - pid, s->main_pid.pid, s->control_pid.pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT, + pid, s->main_pid.pid, s->control_pid.pid); else if (pidref_is_set(&s->main_pid)) - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid); else if (pidref_is_set(&s->control_pid)) - log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid); else - log_unit_warning(UNIT(s), 
"Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid); + log_unit_debug(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid); return false; } @@ -4331,44 +4331,35 @@ static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) return true; } -static void service_force_watchdog(Service *s) { - if (!UNIT(s)->manager->service_watchdogs) - return; - - log_unit_error(UNIT(s), "Watchdog request (last status: %s)!", - s->status_text ?: ""); - - service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG); -} - static void service_notify_message( Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds) { - Service *s = SERVICE(u); - bool notify_dbus = false; - usec_t monotonic_usec = USEC_INFINITY; - const char *e; + Service *s = ASSERT_PTR(SERVICE(u)); int r; - assert(u); assert(ucred); - if (!service_notify_message_authorized(s, ucred->pid, fds)) + if (!service_notify_message_authorized(s, ucred->pid)) return; if (DEBUG_LOGGING) { - _cleanup_free_ char *cc = NULL; - - cc = strv_join(tags, ", "); + _cleanup_free_ char *cc = strv_join(tags, ", "); log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, empty_to_na(cc)); } + usec_t monotonic_usec = USEC_INFINITY; + bool notify_dbus = false; + const char *e; + /* Interpret MAINPID= */ e = strv_find_startswith(tags, "MAINPID="); - if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) { + if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, + SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY, + SERVICE_STOP, SERVICE_STOP_SIGTERM)) { + _cleanup_(pidref_done) PidRef new_main_pid = PIDREF_NULL; r = pidref_set_pidstr(&new_main_pid, e); @@ -4384,10 +4375,10 @@ 
static void service_notify_message( log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid.pid); r = 1; } else - log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid); + log_unit_warning(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid); } if (r > 0) { - (void) service_set_main_pidref(s, &new_main_pid); + (void) service_set_main_pidref(s, TAKE_PIDREF(new_main_pid), /* start_timestamp = */ NULL); r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); if (r < 0) @@ -4585,11 +4576,36 @@ static void service_notify_message( unit_add_to_dbus_queue(u); } +static void service_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Service *s = ASSERT_PTR(SERVICE(u)); + + assert(ucred); + assert(ts); + + if (s->main_pid.pid == ucred->pid) { + if (s->main_command) + exec_status_handoff(&s->main_command->exec_status, ucred, ts); + + exec_status_handoff(&s->main_exec_status, ucred, ts); + } else if (s->control_pid.pid == ucred->pid && s->control_command) + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + else + return; + + unit_add_to_dbus_queue(u); +} + static int service_get_timeout(Unit *u, usec_t *timeout) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); uint64_t t; int r; + assert(timeout); + if (!s->timer_event_source) return 0; @@ -4604,7 +4620,7 @@ static int service_get_timeout(Unit *u, usec_t *timeout) { } static usec_t service_get_timeout_start_usec(Unit *u) { - Service *s = SERVICE(ASSERT_PTR(u)); + Service *s = ASSERT_PTR(SERVICE(u)); return s->timeout_start_usec; } @@ -4624,16 +4640,14 @@ static bool pick_up_pid_from_bus_name(Service *s) { } static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) { + Service *s = 
ASSERT_PTR(SERVICE(userdata)); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; const sd_bus_error *e; - Unit *u = ASSERT_PTR(userdata); uint32_t pid; - Service *s; int r; assert(reply); - s = SERVICE(u); s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot); if (!s->bus_name || !pick_up_pid_from_bus_name(s)) @@ -4658,20 +4672,17 @@ static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, s return 1; } - log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid); + log_unit_debug(UNIT(s), "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid); - (void) service_set_main_pidref(s, &pidref); + (void) service_set_main_pidref(s, TAKE_PIDREF(pidref), /* start_timestamp = */ NULL); (void) unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false); return 1; } static void service_bus_name_owner_change(Unit *u, const char *new_owner) { - - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); int r; - assert(s); - if (new_owner) log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner); else @@ -4721,7 +4732,7 @@ int service_set_socket_fd( Service *s, int fd, Socket *sock, - SocketPeer *peer, + SocketPeer *peer, /* reference to object is donated to us on success */ bool selinux_context_net) { _cleanup_free_ char *peer_text = NULL; @@ -4729,6 +4740,7 @@ int service_set_socket_fd( assert(s); assert(fd >= 0); + assert(sock); /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs * to be configured. We take ownership of the passed fd on success. 
*/ @@ -4760,12 +4772,13 @@ int service_set_socket_fd( return r; } - r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT); + r = unit_add_two_dependencies(UNIT(s), UNIT_AFTER, UNIT_TRIGGERED_BY, UNIT(sock), false, UNIT_DEPENDENCY_IMPLICIT); if (r < 0) - return r; + return log_unit_debug_errno(UNIT(s), r, + "Failed to add After=/TriggeredBy= dependencies on socket unit: %m"); s->socket_fd = fd; - s->socket_peer = socket_peer_ref(peer); + s->socket_peer = peer; s->socket_fd_selinux_context_net = selinux_context_net; unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock)); @@ -4773,9 +4786,7 @@ int service_set_socket_fd( } static void service_reset_failed(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); if (s->state == SERVICE_FAILED) service_set_state(s, service_determine_dead_state(s)); @@ -4787,8 +4798,13 @@ static void service_reset_failed(Unit *u) { s->flush_n_restarts = false; } -static PidRef* service_main_pid(Unit *u) { - return &ASSERT_PTR(SERVICE(u))->main_pid; +static PidRef* service_main_pid(Unit *u, bool *ret_is_alien) { + Service *s = ASSERT_PTR(SERVICE(u)); + + if (ret_is_alien) + *ret_is_alien = s->main_pid_alien; + + return &s->main_pid; } static PidRef* service_control_pid(Unit *u) { @@ -4796,9 +4812,7 @@ static PidRef* service_control_pid(Unit *u) { } static bool service_needs_console(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); /* We provide our own implementation of this here, instead of relying of the generic implementation * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. 
*/ @@ -4826,9 +4840,7 @@ static bool service_needs_console(Unit *u) { } static int service_exit_status(Unit *u) { - Service *s = SERVICE(u); - - assert(u); + Service *s = ASSERT_PTR(SERVICE(u)); if (s->main_exec_status.pid <= 0 || !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) @@ -4841,20 +4853,17 @@ static int service_exit_status(Unit *u) { } static const char* service_status_text(Unit *u) { - Service *s = SERVICE(u); - - assert(s); + Service *s = ASSERT_PTR(SERVICE(u)); return s->status_text; } static int service_clean(Unit *u, ExecCleanMask mask) { + Service *s = ASSERT_PTR(SERVICE(u)); _cleanup_strv_free_ char **l = NULL; bool may_clean_fdstore = false; - Service *s = SERVICE(u); int r; - assert(s); assert(mask != 0); if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED)) @@ -4910,11 +4919,10 @@ fail: } static int service_can_clean(Unit *u, ExecCleanMask *ret) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); ExecCleanMask mask = 0; int r; - assert(s); assert(ret); r = exec_context_get_clean_mask(&s->exec_context, &mask); @@ -4928,10 +4936,12 @@ static int service_can_clean(Unit *u, ExecCleanMask *ret) { return 0; } -static const char *service_finished_job(Unit *u, JobType t, JobResult result) { +static const char* service_finished_job(Unit *u, JobType t, JobResult result) { + Service *s = ASSERT_PTR(SERVICE(u)); + if (t == JOB_START && result == JOB_DONE && - SERVICE(u)->type == SERVICE_ONESHOT) + s->type == SERVICE_ONESHOT) return "Finished %s."; /* Fall back to generic */ @@ -4939,11 +4949,9 @@ static const char *service_finished_job(Unit *u, JobType t, JobResult result) { } static int service_can_start(Unit *u) { - Service *s = SERVICE(u); + Service *s = ASSERT_PTR(SERVICE(u)); int r; - assert(s); - /* Make sure we don't enter a busy loop of some kind. 
*/ r = unit_test_start_limit(u); if (r < 0) { @@ -4955,7 +4963,7 @@ static int service_can_start(Unit *u) { } static void service_release_resources(Unit *u) { - Service *s = SERVICE(ASSERT_PTR(u)); + Service *s = ASSERT_PTR(SERVICE(u)); /* Invoked by the unit state engine, whenever it realizes that unit is dead and there's no job * anymore for it, and it hence is a good idea to release resources */ @@ -4978,6 +4986,52 @@ static void service_release_resources(Unit *u) { service_set_state(s, SERVICE_DEAD); } +int service_determine_exec_selinux_label(Service *s, char **ret) { + int r; + + assert(s); + assert(ret); + + if (!mac_selinux_use()) + return -ENODATA; + + /* Returns the SELinux label used for execution of the main service binary */ + + if (s->exec_context.selinux_context) + /* Prefer the explicitly configured label if there is one */ + return strdup_to(ret, s->exec_context.selinux_context); + + if (s->exec_context.root_image || + s->exec_context.n_extension_images > 0 || + !strv_isempty(s->exec_context.extension_directories)) /* We cannot chase paths through images */ + return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(ENODATA), "Service with RootImage=, ExtensionImages= or ExtensionDirectories= set, cannot determine socket SELinux label before activation, ignoring."); + + ExecCommand *c = s->exec_command[SERVICE_EXEC_START]; + if (!c) + return -ENODATA; + + _cleanup_free_ char *path = NULL; + r = chase(c->path, s->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL); + if (r < 0) { + log_unit_debug_errno(UNIT(s), r, "Failed to resolve service binary '%s', ignoring.", c->path); + return -ENODATA; + } + + r = mac_selinux_get_create_label_from_exe(path, ret); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) { + log_unit_debug_errno(UNIT(s), r, "Reading SELinux label off binary '%s' is not supported, ignoring.", path); + return -ENODATA; + } + if (ERRNO_IS_NEG_PRIVILEGE(r)) { + log_unit_debug_errno(UNIT(s), r, "Can't read SELinux label off binary '%s', due to 
privileges, ignoring.", path); + return -ENODATA; + } + if (r < 0) + return log_unit_debug_errno(UNIT(s), r, "Failed to read SELinux label off binary '%s': %m", path); + + return 0; +} + static const char* const service_restart_table[_SERVICE_RESTART_MAX] = { [SERVICE_RESTART_NO] = "no", [SERVICE_RESTART_ON_SUCCESS] = "on-success", @@ -4992,7 +5046,7 @@ DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart); static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = { [SERVICE_RESTART_MODE_NORMAL] = "normal", - [SERVICE_RESTART_MODE_DIRECT] = "direct", + [SERVICE_RESTART_MODE_DIRECT] = "direct", }; DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode); @@ -5080,6 +5134,7 @@ const UnitVTable service_vtable = { .cgroup_context_offset = offsetof(Service, cgroup_context), .kill_context_offset = offsetof(Service, kill_context), .exec_runtime_offset = offsetof(Service, exec_runtime), + .cgroup_runtime_offset = offsetof(Service, cgroup_runtime), .sections = "Unit\0" @@ -5110,8 +5165,7 @@ const UnitVTable service_vtable = { .clean = service_clean, .can_clean = service_can_clean, - .freeze = unit_freeze_vtable_common, - .thaw = unit_thaw_vtable_common, + .freezer_action = unit_cgroup_freezer_action, .serialize = service_serialize, .deserialize_item = service_deserialize_item, @@ -5130,6 +5184,7 @@ const UnitVTable service_vtable = { .notify_cgroup_empty = service_notify_cgroup_empty_event, .notify_cgroup_oom = service_notify_cgroup_oom_event, .notify_message = service_notify_message, + .notify_handoff_timestamp = service_handoff_timestamp, .main_pid = service_main_pid, .control_pid = service_control_pid, diff --git a/src/core/service.h b/src/core/service.h index e85302e..59598f7 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -168,6 +168,8 @@ struct Service { /* Runtime data of the execution context */ ExecRuntime *exec_runtime; + CGroupRuntime *cgroup_runtime; + PidRef main_pid, control_pid; /* if we are a socket 
activated service instance, store information of the connection/peer/socket */ @@ -255,6 +257,8 @@ void service_release_socket_fd(Service *s); usec_t service_restart_usec_next(Service *s); +int service_determine_exec_selinux_label(Service *s, char **ret); + const char* service_restart_to_string(ServiceRestart i) _const_; ServiceRestart service_restart_from_string(const char *s) _pure_; diff --git a/src/core/show-status.c b/src/core/show-status.c index 5b003ba..57ad4db 100644 --- a/src/core/show-status.c +++ b/src/core/show-status.c @@ -38,13 +38,13 @@ int parse_show_status(const char *v, ShowStatus *ret) { int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) { static const char status_indent[] = " "; /* "[" STATUS "] " */ + static bool prev_ephemeral = false; static int dumb = -1; _cleanup_free_ char *s = NULL; _cleanup_close_ int fd = -EBADF; struct iovec iovec[7] = {}; int n = 0; - static bool prev_ephemeral; assert(format); @@ -75,7 +75,7 @@ int status_vprintf(const char *status, ShowStatusFlags flags, const char *format if (c <= 0) c = 80; - sl = status ? sizeof(status_indent)-1 : 0; + sl = status ? 
strlen(status_indent) : 0; emax = c - sl - 1; if (emax < 3) diff --git a/src/core/slice.c b/src/core/slice.c index fb4f23c..4e71976 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -16,8 +16,8 @@ #include "unit.h" static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = { - [SLICE_DEAD] = UNIT_INACTIVE, - [SLICE_ACTIVE] = UNIT_ACTIVE + [SLICE_DEAD] = UNIT_INACTIVE, + [SLICE_ACTIVE] = UNIT_ACTIVE, }; static void slice_init(Unit *u) { @@ -27,32 +27,29 @@ static void slice_init(Unit *u) { u->ignore_on_isolate = true; } -static void slice_set_state(Slice *t, SliceState state) { +static void slice_set_state(Slice *s, SliceState state) { SliceState old_state; - assert(t); - if (t->state != state) - bus_unit_send_pending_change_signal(UNIT(t), false); + assert(s); + + if (s->state != state) + bus_unit_send_pending_change_signal(UNIT(s), false); - old_state = t->state; - t->state = state; + old_state = s->state; + s->state = state; if (state != old_state) - log_debug("%s changed %s -> %s", - UNIT(t)->id, - slice_state_to_string(old_state), - slice_state_to_string(state)); + log_unit_debug(UNIT(s), "Changed %s -> %s", + slice_state_to_string(old_state), slice_state_to_string(state)); - unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); + unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); } static int slice_add_parent_slice(Slice *s) { - Unit *u = UNIT(s); + Unit *u = UNIT(ASSERT_PTR(s)); _cleanup_free_ char *a = NULL; int r; - assert(s); - if (UNIT_GET_SLICE(u)) return 0; @@ -151,10 +148,9 @@ static int slice_load_system_slice(Unit *u) { } static int slice_load(Unit *u) { - Slice *s = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); int r; - assert(s); assert(u->load_state == UNIT_STUB); r = slice_load_root_slice(u); @@ -196,36 +192,35 @@ static int slice_load(Unit *u) { } static int slice_coldplug(Unit *u) { - Slice *t = 
SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); - assert(t); - assert(t->state == SLICE_DEAD); + assert(s->state == SLICE_DEAD); - if (t->deserialized_state != t->state) - slice_set_state(t, t->deserialized_state); + if (s->deserialized_state != s->state) + slice_set_state(s, s->deserialized_state); return 0; } static void slice_dump(Unit *u, FILE *f, const char *prefix) { - Slice *t = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); - assert(t); + assert(s); assert(f); + assert(prefix); fprintf(f, "%sSlice State: %s\n", - prefix, slice_state_to_string(t->state)); + prefix, slice_state_to_string(s->state)); - cgroup_context_dump(UNIT(t), f, prefix); + cgroup_context_dump(u, f, prefix); } static int slice_start(Unit *u) { - Slice *t = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); int r; - assert(t); - assert(t->state == SLICE_DEAD); + assert(s->state == SLICE_DEAD); r = unit_acquire_invocation_id(u); if (r < 0) @@ -234,27 +229,25 @@ static int slice_start(Unit *u) { (void) unit_realize_cgroup(u); (void) unit_reset_accounting(u); - slice_set_state(t, SLICE_ACTIVE); + slice_set_state(s, SLICE_ACTIVE); return 1; } static int slice_stop(Unit *u) { - Slice *t = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); - assert(t); - assert(t->state == SLICE_ACTIVE); + assert(s->state == SLICE_ACTIVE); /* We do not need to destroy the cgroup explicitly, * unit_notify() will do that for us anyway. 
*/ - slice_set_state(t, SLICE_DEAD); + slice_set_state(s, SLICE_DEAD); return 1; } static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { - Slice *s = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); - assert(s); assert(f); assert(fds); @@ -264,9 +257,8 @@ static int slice_serialize(Unit *u, FILE *f, FDSet *fds) { } static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Slice *s = SLICE(u); + Slice *s = ASSERT_PTR(SLICE(u)); - assert(u); assert(key); assert(value); assert(fds); @@ -276,26 +268,26 @@ static int slice_deserialize_item(Unit *u, const char *key, const char *value, F state = slice_state_from_string(value); if (state < 0) - log_debug("Failed to parse state value %s", value); + log_unit_debug(u, "Failed to parse state: %s", value); else s->deserialized_state = state; } else - log_debug("Unknown serialization key '%s'", key); + log_unit_debug(u, "Unknown serialization key: %s", key); return 0; } static UnitActiveState slice_active_state(Unit *u) { - assert(u); + Slice *s = ASSERT_PTR(SLICE(u)); - return state_translation_table[SLICE(u)->state]; + return state_translation_table[s->state]; } static const char *slice_sub_state_to_string(Unit *u) { - assert(u); + Slice *s = ASSERT_PTR(SLICE(u)); - return slice_state_to_string(SLICE(u)->state); + return slice_state_to_string(s->state); } static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) { @@ -347,46 +339,47 @@ static void slice_enumerate_perpetual(Manager *m) { (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL); } -static bool slice_freezer_action_supported_by_children(Unit *s) { +static bool slice_can_freeze(Unit *s) { Unit *member; assert(s); - UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) { - - if (member->type == UNIT_SLICE && - !slice_freezer_action_supported_by_children(member)) + UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) + if (!unit_can_freeze(member)) return false; - - if (!UNIT_VTABLE(member)->freeze) - return 
false; - } - return true; } static int slice_freezer_action(Unit *s, FreezerAction action) { + FreezerAction child_action; Unit *member; int r; assert(s); - assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); - - if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) { + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE, + FREEZER_THAW, FREEZER_PARENT_THAW)); + + if (action == FREEZER_FREEZE && !slice_can_freeze(s)) { + /* We're intentionally only checking for FREEZER_FREEZE here and ignoring the + * _BY_PARENT variant. If we're being frozen by parent, that means someone has + * already checked if we can be frozen further up the call stack. No point to + * redo that work */ log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice"); return 0; } - UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) { - if (!member->cgroup_realized) - continue; + if (action == FREEZER_FREEZE) + child_action = FREEZER_PARENT_FREEZE; + else if (action == FREEZER_THAW) + child_action = FREEZER_PARENT_THAW; + else + child_action = action; - if (action == FREEZER_FREEZE) - r = UNIT_VTABLE(member)->freeze(member); - else if (UNIT_VTABLE(member)->thaw) - r = UNIT_VTABLE(member)->thaw(member); + UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) { + if (UNIT_VTABLE(member)->freezer_action) + r = UNIT_VTABLE(member)->freezer_action(member, child_action); else - /* Thawing is requested but no corresponding method is available, ignore. 
*/ + /* Only thawing will reach here, since freezing checks for a method in can_freeze */ r = 0; if (r < 0) return r; @@ -395,27 +388,10 @@ static int slice_freezer_action(Unit *s, FreezerAction action) { return unit_cgroup_freezer_action(s, action); } -static int slice_freeze(Unit *s) { - assert(s); - - return slice_freezer_action(s, FREEZER_FREEZE); -} - -static int slice_thaw(Unit *s) { - assert(s); - - return slice_freezer_action(s, FREEZER_THAW); -} - -static bool slice_can_freeze(Unit *s) { - assert(s); - - return slice_freezer_action_supported_by_children(s); -} - const UnitVTable slice_vtable = { .object_size = sizeof(Slice), .cgroup_context_offset = offsetof(Slice, cgroup_context), + .cgroup_runtime_offset = offsetof(Slice, cgroup_runtime), .sections = "Unit\0" @@ -436,8 +412,7 @@ const UnitVTable slice_vtable = { .start = slice_start, .stop = slice_stop, - .freeze = slice_freeze, - .thaw = slice_thaw, + .freezer_action = slice_freezer_action, .can_freeze = slice_can_freeze, .serialize = slice_serialize, diff --git a/src/core/slice.h b/src/core/slice.h index e2f9274..004349d 100644 --- a/src/core/slice.h +++ b/src/core/slice.h @@ -11,6 +11,8 @@ struct Slice { SliceState state, deserialized_state; CGroupContext cgroup_context; + + CGroupRuntime *cgroup_runtime; }; extern const UnitVTable slice_vtable; diff --git a/src/core/socket.c b/src/core/socket.c index 9adae16..41147d4 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -53,29 +53,44 @@ struct SocketPeer { Socket *socket; union sockaddr_union peer; socklen_t peer_salen; + struct ucred peer_cred; }; static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = { - [SOCKET_DEAD] = UNIT_INACTIVE, - [SOCKET_START_PRE] = UNIT_ACTIVATING, - [SOCKET_START_CHOWN] = UNIT_ACTIVATING, - [SOCKET_START_POST] = UNIT_ACTIVATING, - [SOCKET_LISTENING] = UNIT_ACTIVE, - [SOCKET_RUNNING] = UNIT_ACTIVE, - [SOCKET_STOP_PRE] = UNIT_DEACTIVATING, + [SOCKET_DEAD] = UNIT_INACTIVE, + [SOCKET_START_PRE] = 
UNIT_ACTIVATING, + [SOCKET_START_CHOWN] = UNIT_ACTIVATING, + [SOCKET_START_POST] = UNIT_ACTIVATING, + [SOCKET_LISTENING] = UNIT_ACTIVE, + [SOCKET_RUNNING] = UNIT_ACTIVE, + [SOCKET_STOP_PRE] = UNIT_DEACTIVATING, [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING, [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING, - [SOCKET_STOP_POST] = UNIT_DEACTIVATING, - [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING, - [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING, - [SOCKET_FAILED] = UNIT_FAILED, - [SOCKET_CLEANING] = UNIT_MAINTENANCE, + [SOCKET_STOP_POST] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING, + [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING, + [SOCKET_FAILED] = UNIT_FAILED, + [SOCKET_CLEANING] = UNIT_MAINTENANCE, }; static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); static void flush_ports(Socket *s); +static bool SOCKET_STATE_WITH_PROCESS(SocketState state) { + return IN_SET(state, + SOCKET_START_PRE, + SOCKET_START_CHOWN, + SOCKET_START_POST, + SOCKET_STOP_PRE, + SOCKET_STOP_PRE_SIGTERM, + SOCKET_STOP_PRE_SIGKILL, + SOCKET_STOP_POST, + SOCKET_FINAL_SIGTERM, + SOCKET_FINAL_SIGKILL, + SOCKET_CLEANING); +} + static void socket_init(Unit *u) { Socket *s = SOCKET(u); @@ -108,12 +123,7 @@ static void socket_init(Unit *u) { static void socket_unwatch_control_pid(Socket *s) { assert(s); - - if (!pidref_is_set(&s->control_pid)) - return; - - unit_unwatch_pidref(UNIT(s), &s->control_pid); - pidref_done(&s->control_pid); + unit_unwatch_pidref_done(UNIT(s), &s->control_pid); } static void socket_cleanup_fd_list(SocketPort *p) { @@ -144,11 +154,9 @@ void socket_free_ports(Socket *s) { } static void socket_done(Unit *u) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); SocketPeer *p; - assert(s); - socket_free_ports(s); while ((p = set_steal_first(s->peers_by_address))) @@ -157,6 +165,7 @@ static void socket_done(Unit *u) { 
s->peers_by_address = set_free(s->peers_by_address); s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); s->control_command = NULL; @@ -221,7 +230,7 @@ static int socket_add_mount_dependencies(Socket *s) { if (!path) continue; - r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES); if (r < 0) return r; } @@ -243,6 +252,7 @@ static int socket_add_device_dependencies(Socket *s) { static int socket_add_default_dependencies(Socket *s) { int r; + assert(s); if (!UNIT(s)->default_dependencies) @@ -263,6 +273,7 @@ static int socket_add_default_dependencies(Socket *s) { static bool socket_has_exec(Socket *s) { unsigned i; + assert(s); for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++) @@ -273,11 +284,9 @@ static bool socket_has_exec(Socket *s) { } static int socket_add_extras(Socket *s) { - Unit *u = UNIT(s); + Unit *u = UNIT(ASSERT_PTR(s)); int r; - assert(s); - /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit * in Accept=yes mode, and a lower limit for Accept=no. 
Reason: in Accept=yes mode we are invoking accept() * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly @@ -406,11 +415,13 @@ static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) { assert(s); if (s->peer.sa.sa_family == AF_INET) - siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state); + siphash24_compress_typesafe(s->peer.in.sin_addr, state); else if (s->peer.sa.sa_family == AF_INET6) - siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state); + siphash24_compress_typesafe(s->peer.in6.sin6_addr, state); else if (s->peer.sa.sa_family == AF_VSOCK) - siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state); + siphash24_compress_typesafe(s->peer.vm.svm_cid, state); + else if (s->peer.sa.sa_family == AF_UNIX) + siphash24_compress_typesafe(s->peer_cred.uid, state); else assert_not_reached(); } @@ -429,6 +440,8 @@ static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) { return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr)); case AF_VSOCK: return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid); + case AF_UNIX: + return CMP(x->peer_cred.uid, y->peer_cred.uid); } assert_not_reached(); } @@ -436,10 +449,9 @@ static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) { DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func); static int socket_load(Unit *u) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(u); assert(u->load_state == UNIT_STUB); r = unit_load_fragment_and_dropin(u, true); @@ -457,16 +469,22 @@ static int socket_load(Unit *u) { return socket_verify(s); } -static SocketPeer *socket_peer_new(void) { +static SocketPeer *socket_peer_dup(const SocketPeer *q) { SocketPeer *p; + assert(q); + p = new(SocketPeer, 1); if (!p) return NULL; *p = (SocketPeer) { .n_ref 
= 1, + .peer = q->peer, + .peer_salen = q->peer_salen, + .peer_cred = q->peer_cred, }; + return p; } @@ -483,36 +501,46 @@ DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free); int socket_acquire_peer(Socket *s, int fd, SocketPeer **ret) { _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL; - SocketPeer sa = { + SocketPeer key = { .peer_salen = sizeof(union sockaddr_union), + .peer_cred = UCRED_INVALID, }, *i; int r; - assert(fd >= 0); assert(s); + assert(fd >= 0); assert(ret); - if (getpeername(fd, &sa.peer.sa, &sa.peer_salen) < 0) + if (getpeername(fd, &key.peer.sa, &key.peer_salen) < 0) return log_unit_error_errno(UNIT(s), errno, "getpeername() failed: %m"); - if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) { + switch (key.peer.sa.sa_family) { + case AF_INET: + case AF_INET6: + case AF_VSOCK: + break; + + case AF_UNIX: + r = getpeercred(fd, &key.peer_cred); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to get peer credentials of socket: %m"); + break; + + default: *ret = NULL; return 0; } - i = set_get(s->peers_by_address, &sa); + i = set_get(s->peers_by_address, &key); if (i) { *ret = socket_peer_ref(i); return 1; } - remote = socket_peer_new(); + remote = socket_peer_dup(&key); if (!remote) return log_oom(); - remote->peer = sa.peer; - remote->peer_salen = sa.peer_salen; - r = set_ensure_put(&s->peers_by_address, &peer_address_hash_ops, remote); if (r < 0) return log_unit_error_errno(UNIT(s), r, "Failed to insert peer info into hash table: %m"); @@ -540,10 +568,9 @@ static const char* listen_lookup(int family, int type) { } static void socket_dump(Unit *u, FILE *f, const char *prefix) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); const char *prefix2, *str; - assert(s); assert(f); prefix = strempty(prefix); @@ -563,6 +590,7 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) { "%sTransparent: %s\n" "%sBroadcast: %s\n" "%sPassCredentials: %s\n" + 
"%sPassFileDescriptorsToExec: %s\n" "%sPassSecurity: %s\n" "%sPassPacketInfo: %s\n" "%sTCPCongestion: %s\n" @@ -583,6 +611,7 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) { prefix, yes_no(s->transparent), prefix, yes_no(s->broadcast), prefix, yes_no(s->pass_cred), + prefix, yes_no(s->pass_fds_to_exec), prefix, yes_no(s->pass_sec), prefix, yes_no(s->pass_pktinfo), prefix, strna(s->tcp_congestion), @@ -776,8 +805,8 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) { if (!s->exec_command[c]) continue; - fprintf(f, "%s-> %s:\n", - prefix, socket_exec_command_to_string(c)); + fprintf(f, "%s%s %s:\n", + prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), socket_exec_command_to_string(c)); exec_command_dump_list(s->exec_command[c], f, prefix2); } @@ -1274,6 +1303,9 @@ static int socket_symlink(Socket *s) { static int usbffs_write_descs(int fd, Service *s) { int r; + assert(fd >= 0); + assert(s); + if (!s->usb_function_descriptors || !s->usb_function_strings) return -EINVAL; @@ -1339,12 +1371,17 @@ clear: } int socket_load_service_unit(Socket *s, int cfd, Unit **ret) { + int r; + /* Figure out what the unit that will be used to handle the connections on the socket looks like. * * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake * instance name. 
*/ + assert(s); + assert(ret); + if (UNIT_ISSET(s->service)) { *ret = UNIT_DEREF(s->service); return 0; @@ -1355,7 +1392,6 @@ int socket_load_service_unit(Socket *s, int cfd, Unit **ret) { /* Build the instance name and load the unit */ _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL; - int r; r = unit_name_to_prefix(UNIT(s)->id, &prefix); if (r < 0) @@ -1385,50 +1421,26 @@ int socket_load_service_unit(Socket *s, int cfd, Unit **ret) { } static int socket_determine_selinux_label(Socket *s, char **ret) { + Unit *service; int r; assert(s); assert(ret); - Unit *service; - ExecCommand *c; - const char *exec_context; - _cleanup_free_ char *path = NULL; - - r = socket_load_service_unit(s, -1, &service); - if (r == -ENODATA) - goto no_label; + r = socket_load_service_unit(s, /* cfd= */ -EBADF, &service); + if (r == -ENODATA) { + *ret = NULL; + return 0; + } if (r < 0) return r; - exec_context = SERVICE(service)->exec_context.selinux_context; - if (exec_context) { - char *con; - - con = strdup(exec_context); - if (!con) - return -ENOMEM; - - *ret = TAKE_PTR(con); + r = service_determine_exec_selinux_label(SERVICE(service), ret); + if (r == -ENODATA) { + *ret = NULL; return 0; } - - c = SERVICE(service)->exec_command[SERVICE_EXEC_START]; - if (!c) - goto no_label; - - r = chase(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL); - if (r < 0) - goto no_label; - - r = mac_selinux_get_create_label_from_exe(path, ret); - if (IN_SET(r, -EPERM, -EOPNOTSUPP)) - goto no_label; return r; - -no_label: - *ret = NULL; - return 0; } static int socket_address_listen_do( @@ -1794,6 +1806,7 @@ static int socket_check_open(Socket *s) { static void socket_set_state(Socket *s, SocketState state) { SocketState old_state; + assert(s); if (s->state != state) @@ -1802,18 +1815,7 @@ static void socket_set_state(Socket *s, SocketState state) { old_state = s->state; s->state = state; - if (!IN_SET(state, - SOCKET_START_PRE, - 
SOCKET_START_CHOWN, - SOCKET_START_POST, - SOCKET_STOP_PRE, - SOCKET_STOP_PRE_SIGTERM, - SOCKET_STOP_PRE_SIGKILL, - SOCKET_STOP_POST, - SOCKET_FINAL_SIGTERM, - SOCKET_FINAL_SIGKILL, - SOCKET_CLEANING)) { - + if (!SOCKET_STATE_WITH_PROCESS(state)) { s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source); socket_unwatch_control_pid(s); s->control_command = NULL; @@ -1841,10 +1843,9 @@ static void socket_set_state(Socket *s, SocketState state) { } static int socket_coldplug(Unit *u) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(s); assert(s->state == SOCKET_DEAD); if (s->deserialized_state == s->state) @@ -1852,17 +1853,7 @@ static int socket_coldplug(Unit *u) { if (pidref_is_set(&s->control_pid) && pidref_is_unwaited(&s->control_pid) > 0 && - IN_SET(s->deserialized_state, - SOCKET_START_PRE, - SOCKET_START_CHOWN, - SOCKET_START_POST, - SOCKET_STOP_PRE, - SOCKET_STOP_PRE_SIGTERM, - SOCKET_STOP_PRE_SIGKILL, - SOCKET_STOP_POST, - SOCKET_FINAL_SIGTERM, - SOCKET_FINAL_SIGKILL, - SOCKET_CLEANING)) { + SOCKET_STATE_WITH_PROCESS(s->deserialized_state)) { r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false); if (r < 0) @@ -1911,11 +1902,9 @@ static int socket_coldplug(Unit *u) { } static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) { - _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - pid_t pid; int r; assert(s); @@ -1934,17 +1923,33 @@ static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) { if (r < 0) return r; + /* Note that ExecStartPre= command doesn't inherit any FDs. It runs before we open listen FDs. 
*/ + if (s->pass_fds_to_exec) { + _cleanup_strv_free_ char **fd_names = NULL; + _cleanup_free_ int *fds = NULL; + int n_fds; + + n_fds = socket_collect_fds(s, &fds); + if (n_fds < 0) + return n_fds; + + r = strv_extend_n(&fd_names, socket_fdname(s), n_fds); + if (r < 0) + return r; + + exec_params.flags |= EXEC_PASS_FDS; + exec_params.fds = TAKE_PTR(fds); + exec_params.fd_names = TAKE_PTR(fd_names); + exec_params.n_socket_fds = n_fds; + } + r = exec_spawn(UNIT(s), c, &s->exec_context, &exec_params, s->exec_runtime, &s->cgroup_context, - &pid); - if (r < 0) - return r; - - r = pidref_set_pid(&pidref, pid); + &pidref); if (r < 0) return r; @@ -2052,6 +2057,7 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f); static void socket_enter_stop_post(Socket *s, SocketResult f) { int r; + assert(s); if (s->result == SOCKET_SUCCESS) @@ -2094,13 +2100,7 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) { if (s->result == SOCKET_SUCCESS) s->result = f; - r = unit_kill_context( - UNIT(s), - &s->kill_context, - state_to_kill_operation(s, state), - /* main_pid= */ NULL, - &s->control_pid, - /* main_pid_alien= */ false); + r = unit_kill_context(UNIT(s), state_to_kill_operation(s, state)); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); goto fail; @@ -2134,6 +2134,7 @@ fail: static void socket_enter_stop_pre(Socket *s, SocketResult f) { int r; + assert(s); if (s->result == SOCKET_SUCCESS) @@ -2160,6 +2161,7 @@ static void socket_enter_stop_pre(Socket *s, SocketResult f) { static void socket_enter_listening(Socket *s) { int r; + assert(s); if (!s->accept && s->flush_pending) { @@ -2179,6 +2181,7 @@ static void socket_enter_listening(Socket *s) { static void socket_enter_start_post(Socket *s) { int r; + assert(s); socket_unwatch_control_pid(s); @@ -2235,6 +2238,7 @@ fail: static void socket_enter_start_pre(Socket *s) { int r; + assert(s); socket_unwatch_control_pid(s); @@ -2278,7 +2282,6 @@ 
static void socket_enter_running(Socket *s, int cfd_in) { /* Note that this call takes possession of the connection fd passed. It either has to assign it * somewhere or close it. */ _cleanup_close_ int cfd = cfd_in; - _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; int r; @@ -2315,8 +2318,8 @@ static void socket_enter_running(Socket *s, int cfd_in) { if (!pending) { if (!UNIT_ISSET(s->service)) { - r = log_unit_warning_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), - "Service to activate vanished, refusing activation."); + log_unit_warning(UNIT(s), + "Service to activate vanished, refusing activation."); goto fail; } @@ -2347,7 +2350,10 @@ static void socket_enter_running(Socket *s, int cfd_in) { if (r > 0 && p->n_ref > s->max_connections_per_source) { _cleanup_free_ char *t = NULL; - (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t); + if (p->peer.sa.sa_family == AF_UNIX) + (void) asprintf(&t, "UID " UID_FMT, p->peer_cred.uid); + else + (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, /* translate_ipv6= */ true, /* include_port= */ false, &t); log_unit_warning(UNIT(s), "Too many incoming connections (%u) from source %s, dropping connection.", @@ -2357,18 +2363,15 @@ static void socket_enter_running(Socket *s, int cfd_in) { } r = socket_load_service_unit(s, cfd, &service); - if (r < 0) { - if (ERRNO_IS_DISCONNECT(r)) - return; - - log_unit_warning_errno(UNIT(s), r, "Failed to load connection service unit: %m"); + if (ERRNO_IS_NEG_DISCONNECT(r)) + return; + if (r < 0 || UNIT_IS_LOAD_ERROR(service->load_state)) { + log_unit_warning_errno(UNIT(s), r < 0 ? 
r : service->load_error, + "Failed to load connection service unit: %m"); goto fail; } - - r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service, - false, UNIT_DEPENDENCY_IMPLICIT); - if (r < 0) { - log_unit_warning_errno(UNIT(s), r, "Failed to add Before=/Triggers= dependencies on connection unit: %m"); + if (service->load_state == UNIT_MASKED) { + log_unit_warning(UNIT(s), "Connection service unit is masked, refusing."); goto fail; } @@ -2383,7 +2386,10 @@ static void socket_enter_running(Socket *s, int cfd_in) { goto fail; } - TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */ + /* We passed ownership of the fd and socket peer to the service now. */ + TAKE_FD(cfd); + TAKE_PTR(p); + s->n_connections++; r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL); @@ -2405,13 +2411,9 @@ refuse: return; queue_error: - if (ERRNO_IS_RESOURCE(r)) - log_unit_warning(UNIT(s), "Failed to queue service startup job: %s", - bus_error_message(&error, r)); - else - log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s", - cfd >= 0 ? "template" : "non-template", - bus_error_message(&error, r)); + log_unit_warning_errno(UNIT(s), r, "Failed to queue service startup job%s: %s", + cfd >= 0 && !ERRNO_IS_RESOURCE(r) ? " (Maybe the service is missing or is a template unit?)" : "", + bus_error_message(&error, r)); fail: socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES); @@ -2444,11 +2446,9 @@ static void socket_run_next(Socket *s) { } static int socket_start(Unit *u) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(s); - /* We cannot fulfill this request right now, try again later * please! 
*/ if (IN_SET(s->state, @@ -2496,16 +2496,15 @@ static int socket_start(Unit *u) { s->result = SOCKET_SUCCESS; exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX); - u->reset_accounting = true; + if (s->cgroup_runtime) + s->cgroup_runtime->reset_accounting = true; socket_enter_start_pre(s); return 1; } static int socket_stop(Unit *u) { - Socket *s = SOCKET(u); - - assert(s); + Socket *s = ASSERT_PTR(SOCKET(u)); /* Already on it */ if (IN_SET(s->state, @@ -2540,10 +2539,9 @@ static int socket_stop(Unit *u) { } static int socket_serialize(Unit *u, FILE *f, FDSet *fds) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(u); assert(f); assert(fds); @@ -2595,10 +2593,9 @@ static int socket_serialize(Unit *u, FILE *f, FDSet *fds) { } static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(u); assert(key); assert(value); @@ -2836,9 +2833,7 @@ static int socket_deserialize_item(Unit *u, const char *key, const char *value, } static void socket_distribute_fds(Unit *u, FDSet *fds) { - Socket *s = SOCKET(u); - - assert(u); + Socket *s = ASSERT_PTR(SOCKET(u)); LIST_FOREACH(port, p, s->ports) { int fd; @@ -2860,15 +2855,15 @@ static void socket_distribute_fds(Unit *u, FDSet *fds) { } static UnitActiveState socket_active_state(Unit *u) { - assert(u); + Socket *s = ASSERT_PTR(SOCKET(u)); - return state_translation_table[SOCKET(u)->state]; + return state_translation_table[s->state]; } static const char *socket_sub_state_to_string(Unit *u) { - assert(u); + Socket *s = ASSERT_PTR(SOCKET(u)); - return socket_state_to_string(SOCKET(u)->state); + return socket_state_to_string(s->state); } int socket_port_to_address(const SocketPort *p, char **ret) { @@ -2906,7 +2901,6 @@ int socket_port_to_address(const SocketPort *p, char **ret) { } const char* socket_port_type_to_string(SocketPort *p) { - assert(p); switch (p->type) 
{ @@ -2968,9 +2962,7 @@ SocketType socket_port_type_from_string(const char *s) { } static bool socket_may_gc(Unit *u) { - Socket *s = SOCKET(u); - - assert(u); + Socket *s = ASSERT_PTR(SOCKET(u)); return s->n_connections == 0; } @@ -3108,10 +3100,9 @@ fail: } static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); SocketResult f; - assert(s); assert(pid >= 0); if (pid != s->control_pid.pid) @@ -3215,9 +3206,8 @@ static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) { } static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { - Socket *s = SOCKET(userdata); + Socket *s = ASSERT_PTR(SOCKET(userdata)); - assert(s); assert(s->timer_event_source == source); switch (s->state) { @@ -3289,12 +3279,11 @@ static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *use return 0; } -int socket_collect_fds(Socket *s, int **fds) { - size_t k = 0, n = 0; - int *rfds; +int socket_collect_fds(Socket *s, int **ret) { + size_t n = 0, k = 0; assert(s); - assert(fds); + assert(ret); /* Called from the service code for requesting our fds */ @@ -3304,25 +3293,25 @@ int socket_collect_fds(Socket *s, int **fds) { n += p->n_auxiliary_fds; } - if (n <= 0) { - *fds = NULL; + if (n == 0) { + *ret = NULL; return 0; } - rfds = new(int, n); - if (!rfds) + int *fds = new(int, n); + if (!fds) return -ENOMEM; LIST_FOREACH(port, p, s->ports) { if (p->fd >= 0) - rfds[k++] = p->fd; - for (size_t i = 0; i < p->n_auxiliary_fds; ++i) - rfds[k++] = p->auxiliary_fds[i]; + fds[k++] = p->fd; + FOREACH_ARRAY(i, p->auxiliary_fds, p->n_auxiliary_fds) + fds[k++] = *i; } assert(k == n); - *fds = rfds; + *ret = fds; return (int) n; } @@ -3353,9 +3342,8 @@ void socket_connection_unref(Socket *s) { } static void socket_trigger_notify(Unit *u, Unit *other) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); - assert(u); assert(other); /* Filter out 
invocations with bogus state */ @@ -3390,8 +3378,24 @@ static void socket_trigger_notify(Unit *u, Unit *other) { socket_set_state(s, SOCKET_RUNNING); } +static void socket_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Socket *s = ASSERT_PTR(SOCKET(u)); + + assert(ucred); + assert(ts); + + if (s->control_pid.pid == ucred->pid && s->control_command) { + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int socket_get_timeout(Unit *u, usec_t *timeout) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); usec_t t; int r; @@ -3423,11 +3427,10 @@ static PidRef *socket_control_pid(Unit *u) { } static int socket_clean(Unit *u, ExecCleanMask mask) { + Socket *s = ASSERT_PTR(SOCKET(u)); _cleanup_strv_free_ char **l = NULL; - Socket *s = SOCKET(u); int r; - assert(s); assert(mask != 0); if (s->state != SOCKET_DEAD) @@ -3467,19 +3470,15 @@ fail: } static int socket_can_clean(Unit *u, ExecCleanMask *ret) { - Socket *s = SOCKET(u); - - assert(s); + Socket *s = ASSERT_PTR(SOCKET(u)); return exec_context_get_clean_mask(&s->exec_context, ret); } static int socket_can_start(Unit *u) { - Socket *s = SOCKET(u); + Socket *s = ASSERT_PTR(SOCKET(u)); int r; - assert(s); - r = unit_test_start_limit(u); if (r < 0) { socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT); @@ -3494,7 +3493,7 @@ static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = { [SOCKET_EXEC_START_CHOWN] = "ExecStartChown", [SOCKET_EXEC_START_POST] = "ExecStartPost", [SOCKET_EXEC_STOP_PRE] = "ExecStopPre", - [SOCKET_EXEC_STOP_POST] = "ExecStopPost" + [SOCKET_EXEC_STOP_POST] = "ExecStopPost", }; DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand); @@ -3508,7 +3507,7 @@ static const char* const socket_result_table[_SOCKET_RESULT_MAX] = { [SOCKET_FAILURE_CORE_DUMP] = "core-dump", [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit", [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = 
"trigger-limit-hit", - [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit" + [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit", }; DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult); @@ -3552,6 +3551,7 @@ const UnitVTable socket_vtable = { .cgroup_context_offset = offsetof(Socket, cgroup_context), .kill_context_offset = offsetof(Socket, kill_context), .exec_runtime_offset = offsetof(Socket, exec_runtime), + .cgroup_runtime_offset = offsetof(Socket, cgroup_runtime), .sections = "Unit\0" @@ -3596,6 +3596,8 @@ const UnitVTable socket_vtable = { .reset_failed = socket_reset_failed, + .notify_handoff_timestamp = socket_handoff_timestamp, + .control_pid = socket_control_pid, .bus_set_property = bus_socket_set_property, diff --git a/src/core/socket.h b/src/core/socket.h index 0983e8c..5e3929c 100644 --- a/src/core/socket.h +++ b/src/core/socket.h @@ -92,6 +92,7 @@ struct Socket { CGroupContext cgroup_context; ExecRuntime *exec_runtime; + CGroupRuntime *cgroup_runtime; /* For Accept=no sockets refers to the one service we'll * activate. 
For Accept=yes sockets is either NULL, or filled @@ -128,6 +129,7 @@ struct Socket { bool transparent; bool broadcast; bool pass_cred; + bool pass_fds_to_exec; bool pass_sec; bool pass_pktinfo; SocketTimestamping timestamping; @@ -170,7 +172,7 @@ int socket_acquire_peer(Socket *s, int fd, SocketPeer **p); DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref); /* Called from the service code when collecting fds */ -int socket_collect_fds(Socket *s, int **fds); +int socket_collect_fds(Socket *s, int **ret); /* Called from the service code when a per-connection service ended */ void socket_connection_unref(Socket *s); diff --git a/src/core/swap.c b/src/core/swap.c index 682c2b9..c4d2ba8 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -30,15 +30,15 @@ #include "virt.h" static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = { - [SWAP_DEAD] = UNIT_INACTIVE, - [SWAP_ACTIVATING] = UNIT_ACTIVATING, - [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, - [SWAP_ACTIVE] = UNIT_ACTIVE, - [SWAP_DEACTIVATING] = UNIT_DEACTIVATING, + [SWAP_DEAD] = UNIT_INACTIVE, + [SWAP_ACTIVATING] = UNIT_ACTIVATING, + [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE, + [SWAP_ACTIVE] = UNIT_ACTIVE, + [SWAP_DEACTIVATING] = UNIT_DEACTIVATING, [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING, [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING, - [SWAP_FAILED] = UNIT_FAILED, - [SWAP_CLEANING] = UNIT_MAINTENANCE, + [SWAP_FAILED] = UNIT_FAILED, + [SWAP_CLEANING] = UNIT_MAINTENANCE, }; static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); @@ -68,9 +68,7 @@ static const char *swap_sub_state_to_string(Unit *u) { } static bool swap_may_gc(Unit *u) { - Swap *s = SWAP(u); - - assert(s); + Swap *s = ASSERT_PTR(SWAP(u)); if (s->from_proc_swaps) return false; @@ -134,10 +132,9 @@ static int swap_set_devnode(Swap *s, const char *devnode) { } static void swap_init(Unit *u) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); - assert(s); - assert(UNIT(s)->load_state == UNIT_STUB); 
+ assert(u->load_state == UNIT_STUB); s->timeout_usec = u->manager->defaults.timeout_start_usec; @@ -152,18 +149,11 @@ static void swap_init(Unit *u) { static void swap_unwatch_control_pid(Swap *s) { assert(s); - - if (!pidref_is_set(&s->control_pid)) - return; - - unit_unwatch_pidref(UNIT(s), &s->control_pid); - pidref_done(&s->control_pid); + unit_unwatch_pidref_done(UNIT(s), &s->control_pid); } static void swap_done(Unit *u) { - Swap *s = SWAP(u); - - assert(s); + Swap *s = ASSERT_PTR(SWAP(u)); swap_unset_proc_swaps(s); swap_set_devnode(s, NULL); @@ -173,6 +163,7 @@ static void swap_done(Unit *u) { s->parameters_fragment.options = mfree(s->parameters_fragment.options); s->exec_runtime = exec_runtime_free(s->exec_runtime); + exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); s->control_command = NULL; @@ -255,6 +246,7 @@ static int swap_verify(Swap *s) { _cleanup_free_ char *e = NULL; int r; + assert(s); assert(UNIT(s)->load_state == UNIT_LOADED); r = unit_name_from_path(s->what, ".swap", &e); @@ -321,7 +313,7 @@ static int swap_add_extras(Swap *s) { return r; } - r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT); + r = unit_add_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT, UNIT_MOUNT_REQUIRES); if (r < 0) return r; @@ -353,25 +345,22 @@ static int swap_add_extras(Swap *s) { } static int swap_load(Unit *u) { - Swap *s = SWAP(u); - int r, q = 0; + Swap *s = ASSERT_PTR(SWAP(u)); + int r; - assert(s); assert(u->load_state == UNIT_STUB); /* Load a .swap file */ - bool fragment_optional = s->from_proc_swaps; - r = unit_load_fragment_and_dropin(u, !fragment_optional); + r = unit_load_fragment_and_dropin(u, /* fragment_required = */ !s->from_proc_swaps); /* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is * already active. 
*/ if (u->load_state == UNIT_LOADED || s->from_proc_swaps) - q = swap_add_extras(s); + RET_GATHER(r, swap_add_extras(s)); if (r < 0) return r; - if (q < 0) - return q; + if (u->load_state != UNIT_LOADED) return 0; @@ -385,11 +374,11 @@ static int swap_setup_unit( int priority, bool set_flags) { + _cleanup_(unit_freep) Unit *new_unit = NULL; _cleanup_free_ char *e = NULL; - bool delete = false; - Unit *u = NULL; + Unit *u; + Swap *s; int r; - SwapParameters *p; assert(m); assert(what); @@ -397,70 +386,61 @@ static int swap_setup_unit( r = unit_name_from_path(what, ".swap", &e); if (r < 0) - return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m"); + return log_error_errno(r, "Failed to generate unit name from path: %m"); u = manager_get_unit(m, e); - if (u && - SWAP(u)->from_proc_swaps && - !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps)) - return log_error_errno(SYNTHETIC_ERRNO(EEXIST), - "Swap %s appeared twice with different device paths %s and %s", - e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps); - - if (!u) { - delete = true; + if (u) { + s = ASSERT_PTR(SWAP(u)); + + if (s->from_proc_swaps && + !path_equal(s->parameters_proc_swaps.what, what_proc_swaps)) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), + "Swap appeared twice with different device paths %s and %s, refusing.", + s->parameters_proc_swaps.what, what_proc_swaps); + } else { + r = unit_new_for_name(m, sizeof(Swap), e, &new_unit); + if (r < 0) + return log_warning_errno(r, "Failed to load swap unit '%s': %m", e); - r = unit_new_for_name(m, sizeof(Swap), e, &u); - if (r < 0) { - log_unit_warning_errno(u, r, "Failed to load swap unit: %m"); - goto fail; - } + u = new_unit; + s = ASSERT_PTR(SWAP(u)); - SWAP(u)->what = strdup(what); - if (!SWAP(u)->what) { - r = log_oom(); - goto fail; - } + s->what = strdup(what); + if (!s->what) + return log_oom(); unit_add_to_load_queue(u); - } else - delete = false; + } - p = &SWAP(u)->parameters_proc_swaps; 
+ SwapParameters *p = &s->parameters_proc_swaps; if (!p->what) { p->what = strdup(what_proc_swaps); - if (!p->what) { - r = log_oom(); - goto fail; - } + if (!p->what) + return log_oom(); } - /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be - * loaded. After all we can load it now, from the data in /proc/swaps. */ - if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) { + /* The unit is definitely around now, mark it as loaded if it was previously referenced but + * could not be loaded. After all we can load it now, from the data in /proc/swaps. */ + if (UNIT_IS_LOAD_ERROR(u->load_state)) { u->load_state = UNIT_LOADED; u->load_error = 0; } if (set_flags) { - SWAP(u)->is_active = true; - SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps; + s->is_active = true; + s->just_activated = !s->from_proc_swaps; } - SWAP(u)->from_proc_swaps = true; + s->from_proc_swaps = true; p->priority = priority; p->priority_set = true; unit_add_to_dbus_queue(u); - return 0; + TAKE_PTR(new_unit); -fail: - if (delete) - unit_free(u); - - return r; + return 0; } static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) { @@ -541,11 +521,10 @@ static void swap_set_state(Swap *s, SwapState state) { } static int swap_coldplug(Unit *u) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); SwapState new_state = SWAP_DEAD; int r; - assert(s); assert(s->state == SWAP_DEAD); if (s->deserialized_state != s->state) @@ -569,20 +548,25 @@ static int swap_coldplug(Unit *u) { return r; } - if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) + if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) { (void) unit_setup_exec_runtime(u); + (void) unit_setup_cgroup_runtime(u); + } swap_set_state(s, new_state); return 0; } static void swap_dump(Unit *u, FILE *f, const char *prefix) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); SwapParameters *p; + const char *prefix2; - assert(s); assert(f); + prefix = 
strempty(prefix); + prefix2 = strjoina(prefix, "\t"); + if (s->from_proc_swaps) p = &s->parameters_proc_swaps; else if (s->from_fragment) @@ -628,14 +612,23 @@ static void swap_dump(Unit *u, FILE *f, const char *prefix) { exec_context_dump(&s->exec_context, f, prefix); kill_context_dump(&s->kill_context, f, prefix); cgroup_context_dump(UNIT(s), f, prefix); + + for (SwapExecCommand c = 0; c < _SWAP_EXEC_COMMAND_MAX; c++) { + if (!s->exec_command[c].argv) + continue; + + fprintf(f, "%s%s %s:\n", + prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), swap_exec_command_to_string(c)); + + exec_command_dump(s->exec_command + c, f, prefix2); + } + } static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) { - _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT( EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN); _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; - pid_t pid; int r; assert(s); @@ -660,11 +653,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) { &exec_params, s->exec_runtime, &s->cgroup_context, - &pid); - if (r < 0) - return r; - - r = pidref_set_pid(&pidref, pid); + &pidref); if (r < 0) return r; @@ -734,13 +723,7 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) { if (s->result == SWAP_SUCCESS) s->result = f; - r = unit_kill_context( - UNIT(s), - &s->kill_context, - state_to_kill_operation(s, state), - /* main_pid= */ NULL, - &s->control_pid, - /* main_pid_alien= */ false); + r = unit_kill_context(UNIT(s), state_to_kill_operation(s, state)); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m"); goto fail; @@ -870,7 +853,9 @@ static void swap_cycle_clear(Swap *s) { s->result = SWAP_SUCCESS; exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX); - UNIT(s)->reset_accounting = true; + + if (s->cgroup_runtime) + s->cgroup_runtime->reset_accounting = true; } static int swap_start(Unit *u) { @@ -913,9 +898,7 @@ static int 
swap_start(Unit *u) { } static int swap_stop(Unit *u) { - Swap *s = SWAP(u); - - assert(s); + Swap *s = ASSERT_PTR(SWAP(u)); switch (s->state) { @@ -949,9 +932,8 @@ static int swap_stop(Unit *u) { } static int swap_serialize(Unit *u, FILE *f, FDSet *fds) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); - assert(s); assert(f); assert(fds); @@ -966,9 +948,8 @@ static int swap_serialize(Unit *u, FILE *f, FDSet *fds) { } static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); - assert(s); assert(fds); if (streq(key, "state")) { @@ -1009,10 +990,9 @@ static int swap_deserialize_item(Unit *u, const char *key, const char *value, FD } static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); SwapResult f; - assert(s); assert(pid >= 0); if (pid != s->control_pid.pid) @@ -1086,9 +1066,8 @@ static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) { } static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { - Swap *s = SWAP(userdata); + Swap *s = ASSERT_PTR(SWAP(userdata)); - assert(s); assert(s->timer_event_source == source); switch (s->state) { @@ -1261,12 +1240,10 @@ static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, v return swap_process_proc_swaps(m); } -static Unit *swap_following(Unit *u) { - Swap *s = SWAP(u); +static Unit* swap_following(Unit *u) { + Swap *s = ASSERT_PTR(SWAP(u)); Swap *first = NULL; - assert(s); - /* If the user configured the swap through /etc/fstab or * a device unit, follow that. 
*/ @@ -1298,16 +1275,15 @@ static Unit *swap_following(Unit *u) { return UNIT(first); } -static int swap_following_set(Unit *u, Set **_set) { - Swap *s = SWAP(u); +static int swap_following_set(Unit *u, Set **ret) { + Swap *s = ASSERT_PTR(SWAP(u)); _cleanup_set_free_ Set *set = NULL; int r; - assert(s); - assert(_set); + assert(ret); if (LIST_JUST_US(same_devnode, s)) { - *_set = NULL; + *ret = NULL; return 0; } @@ -1321,7 +1297,7 @@ static int swap_following_set(Unit *u, Set **_set) { return r; } - *_set = TAKE_PTR(set); + *ret = TAKE_PTR(set); return 1; } @@ -1358,7 +1334,7 @@ static void swap_enumerate(Manager *m) { /* Dispatch this before we dispatch SIGCHLD, so that * we always get the events from /proc/swaps before * the SIGCHLD of /sbin/swapon. */ - r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10); + r = sd_event_source_set_priority(m->swap_event_source, EVENT_PRIORITY_SWAP_TABLE); if (r < 0) { log_error_errno(r, "Failed to change /proc/swaps priority: %m"); goto fail; @@ -1422,28 +1398,22 @@ int swap_process_device_new(Manager *m, sd_device *dev) { int swap_process_device_remove(Manager *m, sd_device *dev) { const char *dn; - int r; Swap *s; + int r; r = sd_device_get_devname(dev, &dn); if (r < 0) return 0; - while ((s = hashmap_get(m->swaps_by_devnode, dn))) { - int q; - - q = swap_set_devnode(s, NULL); - if (q < 0) - r = q; - } + r = 0; + while ((s = hashmap_get(m->swaps_by_devnode, dn))) + RET_GATHER(r, swap_set_devnode(s, NULL)); return r; } static void swap_reset_failed(Unit *u) { - Swap *s = SWAP(u); - - assert(s); + Swap *s = ASSERT_PTR(SWAP(u)); if (s->state == SWAP_FAILED) swap_set_state(s, SWAP_DEAD); @@ -1452,14 +1422,27 @@ static void swap_reset_failed(Unit *u) { s->clean_result = SWAP_SUCCESS; } +static void swap_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Swap *s = ASSERT_PTR(SWAP(u)); + + assert(ucred); + assert(ts); + + if (s->control_pid.pid == ucred->pid && 
s->control_command) { + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int swap_get_timeout(Unit *u, usec_t *timeout) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); usec_t t; int r; - assert(s); - assert(u); - if (!s->timer_event_source) return 0; @@ -1493,11 +1476,10 @@ static PidRef* swap_control_pid(Unit *u) { } static int swap_clean(Unit *u, ExecCleanMask mask) { + Swap *s = ASSERT_PTR(SWAP(u)); _cleanup_strv_free_ char **l = NULL; - Swap *s = SWAP(u); int r; - assert(s); assert(mask != 0); if (s->state != SWAP_DEAD) @@ -1537,19 +1519,15 @@ fail: } static int swap_can_clean(Unit *u, ExecCleanMask *ret) { - Swap *s = SWAP(u); - - assert(s); + Swap *s = ASSERT_PTR(SWAP(u)); return exec_context_get_clean_mask(&s->exec_context, ret); } static int swap_can_start(Unit *u) { - Swap *s = SWAP(u); + Swap *s = ASSERT_PTR(SWAP(u)); int r; - assert(s); - r = unit_test_start_limit(u); if (r < 0) { swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT); @@ -1605,6 +1583,7 @@ const UnitVTable swap_vtable = { .cgroup_context_offset = offsetof(Swap, cgroup_context), .kill_context_offset = offsetof(Swap, kill_context), .exec_runtime_offset = offsetof(Swap, exec_runtime), + .cgroup_runtime_offset = offsetof(Swap, cgroup_runtime), .sections = "Unit\0" @@ -1645,6 +1624,8 @@ const UnitVTable swap_vtable = { .reset_failed = swap_reset_failed, + .notify_handoff_timestamp = swap_handoff_timestamp, + .control_pid = swap_control_pid, .bus_set_property = bus_swap_set_property, diff --git a/src/core/swap.h b/src/core/swap.h index ef20f0f..d9bbd37 100644 --- a/src/core/swap.h +++ b/src/core/swap.h @@ -70,6 +70,7 @@ struct Swap { CGroupContext cgroup_context; ExecRuntime *exec_runtime; + CGroupRuntime *cgroup_runtime; SwapState state, deserialized_state; diff --git a/src/core/system.conf.in b/src/core/system.conf.in index 05eb681..1c08aa4 100644 --- a/src/core/system.conf.in +++ b/src/core/system.conf.in @@ -26,7 +26,7 @@ 
#ShowStatus=yes #CrashChangeVT=no #CrashShell=no -#CrashReboot=no +#CrashAction=freeze #CtrlAltDelBurstAction=reboot-force #CPUAffinity= #NUMAPolicy=default @@ -39,6 +39,7 @@ #WatchdogDevice= #CapabilityBoundingSet= #NoNewPrivileges=no +#ProtectSystem=auto #SystemCallArchitectures= #TimerSlackNSec= #StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}} diff --git a/src/core/taint.c b/src/core/taint.c new file mode 100644 index 0000000..969b37f --- /dev/null +++ b/src/core/taint.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "clock-util.h" +#include "errno-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "log.h" +#include "os-util.h" +#include "path-util.h" +#include "strv.h" +#include "taint.h" +#include "uid-range.h" + +static int short_uid_gid_range(UIDRangeUsernsMode mode) { + _cleanup_(uid_range_freep) UIDRange *p = NULL; + int r; + + /* Taint systemd if we the UID/GID range assigned to this environment doesn't at least cover 0…65534, + * i.e. from root to nobody. */ + + r = uid_range_load_userns(/* path= */ NULL, mode, &p); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + return false; + if (r < 0) + return log_debug_errno(r, "Failed to load uid_map or gid_map: %m"); + + return !uid_range_covers(p, 0, 65535); +} + +char* taint_string(void) { + const char *stage[12] = {}; + size_t n = 0; + + /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at + * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the + * configuration phase. */ + + _cleanup_free_ char *bin = NULL, *usr_sbin = NULL, *var_run = NULL; + + if (readlink_malloc("/bin", &bin) < 0 || !PATH_IN_SET(bin, "usr/bin", "/usr/bin")) + stage[n++] = "unmerged-usr"; + + /* Note that the check is different from default_PATH(), as we want to taint on uncanonical symlinks + * too. 
*/ + if (readlink_malloc("/usr/sbin", &usr_sbin) < 0 || !PATH_IN_SET(usr_sbin, "bin", "/usr/bin")) + stage[n++] = "unmerged-bin"; + + if (readlink_malloc("/var/run", &var_run) < 0 || !PATH_IN_SET(var_run, "../run", "/run")) + stage[n++] = "var-run-bad"; + + if (cg_all_unified() == 0) + stage[n++] = "cgroupsv1"; + + if (clock_is_localtime(NULL) > 0) + stage[n++] = "local-hwclock"; + + if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0) + stage[n++] = "support-ended"; + + struct utsname uts; + assert_se(uname(&uts) >= 0); + if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0) + stage[n++] = "old-kernel"; + + _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL; + if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 && + !streq(overflowuid, "65534")) + stage[n++] = "overflowuid-not-65534"; + if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 && + !streq(overflowgid, "65534")) + stage[n++] = "overflowgid-not-65534"; + + if (short_uid_gid_range(UID_RANGE_USERNS_INSIDE) > 0) + stage[n++] = "short-uid-range"; + if (short_uid_gid_range(GID_RANGE_USERNS_INSIDE) > 0) + stage[n++] = "short-gid-range"; + + assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */ + + return strv_join((char**) stage, ":"); +} diff --git a/src/core/taint.h b/src/core/taint.h new file mode 100644 index 0000000..2e514e3 --- /dev/null +++ b/src/core/taint.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +char* taint_string(void); diff --git a/src/core/target.c b/src/core/target.c index 8f2a331..15866e9 100644 --- a/src/core/target.c +++ b/src/core/target.c @@ -11,12 +11,13 @@ #include "unit.h" static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = { - [TARGET_DEAD] = UNIT_INACTIVE, - [TARGET_ACTIVE] = UNIT_ACTIVE + [TARGET_DEAD] = UNIT_INACTIVE, + [TARGET_ACTIVE] = UNIT_ACTIVE, }; static void target_set_state(Target *t, TargetState state) { TargetState 
old_state; + assert(t); if (t->state != state) @@ -26,10 +27,8 @@ static void target_set_state(Target *t, TargetState state) { t->state = state; if (state != old_state) - log_debug("%s changed %s -> %s", - UNIT(t)->id, - target_state_to_string(old_state), - target_state_to_string(state)); + log_unit_debug(UNIT(t), "Changed %s -> %s", + target_state_to_string(old_state), target_state_to_string(state)); unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true); } @@ -56,8 +55,8 @@ static int target_add_default_dependencies(Target *t) { if (n_others < 0) return n_others; - for (int i = 0; i < n_others; i++) { - r = unit_add_default_target_dependency(others[i], UNIT(t)); + FOREACH_ARRAY(i, others, n_others) { + r = unit_add_default_target_dependency(*i, UNIT(t)); if (r < 0) return r; } @@ -70,11 +69,9 @@ static int target_add_default_dependencies(Target *t) { } static int target_load(Unit *u) { - Target *t = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); int r; - assert(t); - r = unit_load_fragment_and_dropin(u, true); if (r < 0) return r; @@ -87,9 +84,8 @@ static int target_load(Unit *u) { } static int target_coldplug(Unit *u) { - Target *t = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); - assert(t); assert(t->state == TARGET_DEAD); if (t->deserialized_state != t->state) @@ -99,10 +95,10 @@ static int target_coldplug(Unit *u) { } static void target_dump(Unit *u, FILE *f, const char *prefix) { - Target *t = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); - assert(t); assert(f); + assert(prefix); fprintf(f, "%sTarget State: %s\n", @@ -110,10 +106,9 @@ static void target_dump(Unit *u, FILE *f, const char *prefix) { } static int target_start(Unit *u) { - Target *t = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); int r; - assert(t); assert(t->state == TARGET_DEAD); r = unit_acquire_invocation_id(u); @@ -125,9 +120,8 @@ static int target_start(Unit *u) { } static int target_stop(Unit *u) { - Target *t = 
TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); - assert(t); assert(t->state == TARGET_ACTIVE); target_set_state(t, TARGET_DEAD); @@ -135,21 +129,18 @@ static int target_stop(Unit *u) { } static int target_serialize(Unit *u, FILE *f, FDSet *fds) { - Target *s = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); - assert(s); assert(f); assert(fds); - (void) serialize_item(f, "state", target_state_to_string(s->state)); + (void) serialize_item(f, "state", target_state_to_string(t->state)); return 0; } static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Target *s = TARGET(u); + Target *t = ASSERT_PTR(TARGET(u)); - assert(s); - assert(u); assert(key); assert(value); assert(fds); @@ -159,26 +150,26 @@ static int target_deserialize_item(Unit *u, const char *key, const char *value, state = target_state_from_string(value); if (state < 0) - log_debug("Failed to parse state value %s", value); + log_unit_debug(u, "Failed to parse state: %s", value); else - s->deserialized_state = state; + t->deserialized_state = state; } else - log_debug("Unknown serialization key '%s'", key); + log_unit_debug(u, "Unknown serialization key: %s", key); return 0; } static UnitActiveState target_active_state(Unit *u) { - assert(u); + Target *t = ASSERT_PTR(TARGET(u)); - return state_translation_table[TARGET(u)->state]; + return state_translation_table[t->state]; } static const char *target_sub_state_to_string(Unit *u) { - assert(u); + Target *t = ASSERT_PTR(TARGET(u)); - return target_state_to_string(TARGET(u)->state); + return target_state_to_string(t->state); } const UnitVTable target_vtable = { @@ -213,4 +204,6 @@ const UnitVTable target_vtable = { [JOB_DONE] = "Stopped target %s.", }, }, + + .notify_supervisor = true, }; diff --git a/src/core/timer.c b/src/core/timer.c index 3c41a25..d7ce473 100644 --- a/src/core/timer.c +++ b/src/core/timer.c @@ -25,19 +25,18 @@ #include "virt.h" static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = 
{ - [TIMER_DEAD] = UNIT_INACTIVE, + [TIMER_DEAD] = UNIT_INACTIVE, [TIMER_WAITING] = UNIT_ACTIVE, [TIMER_RUNNING] = UNIT_ACTIVE, [TIMER_ELAPSED] = UNIT_ACTIVE, - [TIMER_FAILED] = UNIT_FAILED + [TIMER_FAILED] = UNIT_FAILED, }; static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata); static void timer_init(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(u); assert(u->load_state == UNIT_STUB); t->next_elapse_monotonic_or_boottime = USEC_INFINITY; @@ -58,9 +57,7 @@ void timer_free_values(Timer *t) { } static void timer_done(Unit *u) { - Timer *t = TIMER(u); - - assert(t); + Timer *t = ASSERT_PTR(TIMER(u)); timer_free_values(t); @@ -141,7 +138,7 @@ static int timer_setup_persistent(Timer *t) { if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) { - r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES); if (r < 0) return r; @@ -192,19 +189,18 @@ static uint64_t timer_get_fixed_delay_hash(Timer *t) { } siphash24_init(&state, hash_key); - siphash24_compress(&machine_id, sizeof(sd_id128_t), &state); + siphash24_compress_typesafe(machine_id, &state); siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state); - siphash24_compress(&uid, sizeof(uid_t), &state); + siphash24_compress_typesafe(uid, &state); siphash24_compress_string(UNIT(t)->id, &state); return siphash24_finalize(&state); } static int timer_load(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); int r; - assert(u); assert(u->load_state == UNIT_STUB); r = unit_load_fragment_and_dropin(u, true); @@ -231,9 +227,12 @@ static int timer_load(Unit *u) { } static void timer_dump(Unit *u, FILE *f, const char *prefix) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); Unit *trigger; + assert(f); + assert(prefix); + trigger = UNIT_TRIGGER(u); fprintf(f, @@ -279,6 +278,7 @@ static void timer_dump(Unit *u, FILE 
*f, const char *prefix) { static void timer_set_state(Timer *t, TimerState state) { TimerState old_state; + assert(t); if (t->state != state) @@ -303,9 +303,8 @@ static void timer_set_state(Timer *t, TimerState state) { static void timer_enter_waiting(Timer *t, bool time_change); static int timer_coldplug(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(t); assert(t->state == TIMER_DEAD); if (t->deserialized_state == t->state) @@ -634,10 +633,9 @@ fail: } static int timer_start(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); int r; - assert(t); assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED)); r = unit_test_trigger_loaded(u); @@ -682,9 +680,8 @@ static int timer_start(Unit *u) { } static int timer_stop(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(t); assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED)); timer_enter_dead(t, TIMER_SUCCESS); @@ -692,9 +689,8 @@ static int timer_stop(Unit *u) { } static int timer_serialize(Unit *u, FILE *f, FDSet *fds) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(u); assert(f); assert(fds); @@ -711,9 +707,8 @@ static int timer_serialize(Unit *u, FILE *f, FDSet *fds) { } static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(u); assert(key); assert(value); assert(fds); @@ -747,21 +742,19 @@ static int timer_deserialize_item(Unit *u, const char *key, const char *value, F } static UnitActiveState timer_active_state(Unit *u) { - assert(u); + Timer *t = ASSERT_PTR(TIMER(u)); - return state_translation_table[TIMER(u)->state]; + return state_translation_table[t->state]; } static const char *timer_sub_state_to_string(Unit *u) { - assert(u); + Timer *t = ASSERT_PTR(TIMER(u)); - return timer_state_to_string(TIMER(u)->state); + return timer_state_to_string(t->state); } static int timer_dispatch(sd_event_source *s, 
uint64_t usec, void *userdata) { - Timer *t = TIMER(userdata); - - assert(t); + Timer *t = ASSERT_PTR(TIMER(userdata)); if (t->state != TIMER_WAITING) return 0; @@ -772,9 +765,8 @@ static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) { } static void timer_trigger_notify(Unit *u, Unit *other) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(u); assert(other); /* Filter out invocations with bogus state */ @@ -812,9 +804,7 @@ static void timer_trigger_notify(Unit *u, Unit *other) { } static void timer_reset_failed(Unit *u) { - Timer *t = TIMER(u); - - assert(t); + Timer *t = ASSERT_PTR(TIMER(u)); if (t->state == TIMER_FAILED) timer_set_state(t, TIMER_DEAD); @@ -823,11 +813,9 @@ static void timer_reset_failed(Unit *u) { } static void timer_time_change(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); usec_t ts; - assert(u); - if (t->state != TIMER_WAITING) return; @@ -849,9 +837,7 @@ static void timer_time_change(Unit *u) { } static void timer_timezone_change(Unit *u) { - Timer *t = TIMER(u); - - assert(u); + Timer *t = ASSERT_PTR(TIMER(u)); if (t->state != TIMER_WAITING) return; @@ -866,10 +852,9 @@ static void timer_timezone_change(Unit *u) { } static int timer_clean(Unit *u, ExecCleanMask mask) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); int r; - assert(t); assert(mask != 0); if (t->state != TIMER_DEAD) @@ -892,9 +877,8 @@ static int timer_clean(Unit *u, ExecCleanMask mask) { } static int timer_can_clean(Unit *u, ExecCleanMask *ret) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); - assert(t); assert(ret); *ret = t->persistent ? 
EXEC_CLEAN_STATE : 0; @@ -902,11 +886,9 @@ static int timer_can_clean(Unit *u, ExecCleanMask *ret) { } static int timer_can_start(Unit *u) { - Timer *t = TIMER(u); + Timer *t = ASSERT_PTR(TIMER(u)); int r; - assert(t); - r = unit_test_start_limit(u); if (r < 0) { timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT); @@ -917,9 +899,8 @@ static int timer_can_start(Unit *u) { } static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) { - ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details)); - assert(details); assert(f); assert(t); @@ -950,10 +931,9 @@ static int activation_details_timer_deserialize(const char *key, const char *val } static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) { - ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details)); int r; - assert(details); assert(strv); assert(t); @@ -972,10 +952,9 @@ static int activation_details_timer_append_env(ActivationDetails *details, char } static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) { - ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details); + ActivationDetailsTimer *t = ASSERT_PTR(ACTIVATION_DETAILS_TIMER(details)); int r; - assert(details); assert(strv); assert(t); @@ -1014,7 +993,7 @@ static const char* const timer_base_table[_TIMER_BASE_MAX] = { [TIMER_STARTUP] = "OnStartupSec", [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec", [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec", - [TIMER_CALENDAR] = "OnCalendar" + [TIMER_CALENDAR] = "OnCalendar", }; DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase); diff --git a/src/core/transaction.c b/src/core/transaction.c index a81c40f..ab6e699 100644 --- a/src/core/transaction.c +++ b/src/core/transaction.c @@ -446,10 +446,10 @@ static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsi 
* the graph over 'before' edges in the actual job execution order. We traverse over both unit * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job * execution ordering. */ - for (size_t d = 0; d < ELEMENTSOF(directions); d++) { + FOREACH_ELEMENT(d, directions) { Unit *u; - UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) { + UNIT_FOREACH_DEPENDENCY(u, j->unit, *d) { Job *o; /* Is there a job for this unit? */ @@ -463,7 +463,7 @@ static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsi } /* Cut traversing if the job j is not really *before* o. */ - if (job_compare(j, o, directions[d]) >= 0) + if (job_compare(j, o, *d) >= 0) continue; r = transaction_verify_order_one(tr, o, j, generation, e); @@ -964,7 +964,7 @@ int transaction_add_job_and_dependencies( if (type != JOB_STOP) { r = bus_unit_validate_load_state(unit, e); - /* The time-based cache allows to start new units without daemon-reload, but if they are + /* The time-based cache allows new units to be started without daemon-reload, but if they are * already referenced (because of dependencies or ordering) then we have to force a load of * the fragment. As an optimization, check first if anything in the usual paths was modified * since the last time the cache was loaded. 
Also check if the last time an attempt to load diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c index 9f95984..f25e2e3 100644 --- a/src/core/unit-printf.c +++ b/src/core/unit-printf.c @@ -4,6 +4,7 @@ #include "cgroup-util.h" #include "format-util.h" #include "macro.h" +#include "sd-path.h" #include "specifier.h" #include "string-util.h" #include "strv.h" @@ -86,68 +87,46 @@ static void bad_specifier(const Unit *u, char specifier) { static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) { const Unit *u = ASSERT_PTR(userdata); + CGroupRuntime *crt = unit_get_cgroup_runtime(u); bad_specifier(u, specifier); - if (u->cgroup_path) { - char *n; - - n = strdup(u->cgroup_path); - if (!n) - return -ENOMEM; - - *ret = n; - return 0; - } + if (crt && crt->cgroup_path) + return strdup_to(ret, crt->cgroup_path); return unit_default_cgroup_path(u, ret); } static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) { const Unit *u = ASSERT_PTR(userdata); - char *n; bad_specifier(u, specifier); - n = strdup(u->manager->cgroup_root); - if (!n) - return -ENOMEM; - - *ret = n; - return 0; + return strdup_to(ret, u->manager->cgroup_root); } static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) { const Unit *u = ASSERT_PTR(userdata), *slice; - char *n; bad_specifier(u, specifier); slice = UNIT_GET_SLICE(u); if (slice) { - if (slice->cgroup_path) - n = strdup(slice->cgroup_path); - else - return unit_default_cgroup_path(slice, ret); - } else - n = strdup(u->manager->cgroup_root); - if (!n) - return -ENOMEM; + CGroupRuntime *crt = unit_get_cgroup_runtime(slice); - *ret = n; - return 0; + if (crt && crt->cgroup_path) + return strdup_to(ret, crt->cgroup_path); + + return unit_default_cgroup_path(slice, ret); + } + + return strdup_to(ret, u->manager->cgroup_root); } static int 
specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) { const Unit *u = ASSERT_PTR(userdata); - char *n; - - n = strdup(u->manager->prefix[PTR_TO_UINT(data)]); - if (!n) - return -ENOMEM; - *ret = n; - return 0; + return strdup_to(ret, u->manager->prefix[PTR_TO_UINT(data)]); } static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) { @@ -164,6 +143,14 @@ static int specifier_credentials_dir(char specifier, const void *data, const cha return 0; } +static int specifier_shared_data_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) { + const Unit *u = ASSERT_PTR(userdata); + + assert(ret); + + return sd_path_lookup(MANAGER_IS_SYSTEM(u->manager) ? SD_PATH_SYSTEM_SHARED : SD_PATH_USER_SHARED, NULL, ret); +} + int unit_name_printf(const Unit *u, const char* format, char **ret) { /* * This will use the passed string as format string and replace the following specifiers (which should all be @@ -208,6 +195,7 @@ int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, * * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME) * %d: the credentials directory ($CREDENTIALS_DIRECTORY) + * %D: the shared data root (e.g. /usr/share or $XDG_DATA_HOME) * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME) * %L: the log directory root (e.g. /var/log or $XDG_STATE_HOME/log) * %S: the state directory root (e.g. 
/var/lib or $XDG_STATE_HOME) @@ -245,6 +233,7 @@ int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) }, { 'd', specifier_credentials_dir, NULL }, + { 'D', specifier_shared_data_dir, NULL }, { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) }, { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) }, { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) }, diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c index fe4221c..175e327 100644 --- a/src/core/unit-serialize.c +++ b/src/core/unit-serialize.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "bpf-restrict-ifaces.h" #include "bpf-socket-bind.h" #include "bus-util.h" #include "dbus.h" @@ -7,29 +8,11 @@ #include "fileio.h" #include "format-util.h" #include "parse-util.h" -#include "restrict-ifaces.h" #include "serialize.h" #include "string-table.h" #include "unit-serialize.h" #include "user-util.h" -static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) { - _cleanup_free_ char *s = NULL; - int r; - - assert(f); - assert(key); - - if (mask == 0) - return 0; - - r = cg_mask_to_string(mask, &s); - if (r < 0) - return log_error_errno(r, "Failed to format cgroup mask: %m"); - - return serialize_item(f, key, s); -} - /* Make sure out values fit in the bitfield. 
*/ assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8); @@ -69,40 +52,6 @@ static int deserialize_markers(Unit *u, const char *value) { } } -static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { - [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", - [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", - [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", - [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", -}; - -DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric); - -static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { - [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base", - [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base", - [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base", - [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base", -}; - -DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric); - -static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { - [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last", - [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last", - [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last", - [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last", -}; - -DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric); - -static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = { - [CGROUP_MEMORY_PEAK] = "memory-accounting-peak", - [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak", -}; - -DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric); - int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) { int r; @@ -158,48 +107,7 @@ int 
unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) { (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval); (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst); - (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base); - if (u->cpu_usage_last != NSEC_INFINITY) - (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); - - if (u->managed_oom_kill_last > 0) - (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last); - - if (u->oom_kill_last > 0) - (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last); - - for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) { - (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, u->io_accounting_base[im]); - - if (u->io_accounting_last[im] != UINT64_MAX) - (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, u->io_accounting_last[im]); - } - - for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) { - uint64_t v; - - r = unit_get_memory_accounting(u, metric, &v); - if (r >= 0) - (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v); - } - - if (u->cgroup_path) - (void) serialize_item(f, "cgroup", u->cgroup_path); - - (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized); - (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask); - (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask); - (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask); - - (void) bpf_serialize_socket_bind(u, f, fds); - - (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed); - (void) 
bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed); - (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed); - (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed); - (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed); - - (void) serialize_restrict_network_interfaces(u, f, fds); + (void) cgroup_runtime_serialize(u, f, fds); if (uid_is_valid(u->ref_uid)) (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid); @@ -214,14 +122,6 @@ int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) { bus_track_serialize(u->bus_track, f, "ref"); - for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { - uint64_t v; - - r = unit_get_ip_accounting(u, m, &v); - if (r >= 0) - (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v); - } - if (!switching_root) { if (u->job) { fputs("job\n", f); @@ -297,7 +197,6 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { for (;;) { _cleanup_free_ char *l = NULL; - ssize_t m; size_t k; char *v; @@ -380,76 +279,7 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst)) continue; - else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) || - MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, 
u->oom_kill_last)) - continue; - - else if (streq(l, "cgroup")) { - r = unit_set_cgroup_path(u, v); - if (r < 0) - log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v); - - (void) unit_watch_cgroup(u); - (void) unit_watch_cgroup_memory(u); - - continue; - - } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask)) - continue; - - else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask)) - continue; - - else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) { - int fd; - - fd = deserialize_fd(fds, v); - if (fd >= 0) - (void) bpf_socket_bind_add_initial_link_fd(u, fd); - continue; - - } else if (streq(l, "ip-bpf-ingress-installed")) { - (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed); - continue; - } else if (streq(l, "ip-bpf-egress-installed")) { - (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed); - continue; - } else if (streq(l, "bpf-device-control-installed")) { - (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed); - continue; - - } else if (streq(l, "ip-bpf-custom-ingress-installed")) { - (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed); - continue; - } else if (streq(l, "ip-bpf-custom-egress-installed")) { - (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed); - continue; - - } else if (streq(l, "restrict-ifaces-bpf-fd")) { - int fd; - - fd = deserialize_fd(fds, v); - if (fd >= 0) - (void) restrict_network_interfaces_add_initial_link_fd(u, fd); - - continue; - - } else if (streq(l, 
"ref-uid")) { + else if (streq(l, "ref-uid")) { uid_t uid; r = parse_uid(v, &uid); @@ -499,55 +329,6 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { continue; } - m = memory_accounting_metric_field_last_from_string(l); - if (m >= 0) { - uint64_t c; - - r = safe_atou64(v, &c); - if (r < 0) - log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", v); - else - u->memory_accounting_last[m] = c; - continue; - } - - /* Check if this is an IP accounting metric serialization field */ - m = ip_accounting_metric_field_from_string(l); - if (m >= 0) { - uint64_t c; - - r = safe_atou64(v, &c); - if (r < 0) - log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v); - else - u->ip_accounting_extra[m] = c; - continue; - } - - m = io_accounting_metric_field_base_from_string(l); - if (m >= 0) { - uint64_t c; - - r = safe_atou64(v, &c); - if (r < 0) - log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v); - else - u->io_accounting_base[m] = c; - continue; - } - - m = io_accounting_metric_field_last_from_string(l); - if (m >= 0) { - uint64_t c; - - r = safe_atou64(v, &c); - if (r < 0) - log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v); - else - u->io_accounting_last[m] = c; - continue; - } - r = exec_shared_runtime_deserialize_compat(u, l, v, fds); if (r < 0) { log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l); @@ -556,6 +337,13 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) { /* Returns positive if key was handled by the call */ continue; + r = cgroup_runtime_deserialize_one(u, l, v, fds); + if (r < 0) { + log_unit_warning(u, "Failed to deserialize cgroup runtime parameter '%s, ignoring.", l); + continue; + } else if (r > 0) + continue; /* was handled */ + if (UNIT_VTABLE(u)->deserialize_item) { r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds); if (r < 0) @@ -574,7 +362,9 @@ int unit_deserialize_state(Unit *u, FILE *f, FDSet 
*fds) { /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings * applied after we are done. For that we invalidate anything already realized, so that we can * realize it again. */ - if (u->cgroup_realized) { + CGroupRuntime *crt; + crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_realized) { unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); unit_invalidate_cgroup_bpf(u); } @@ -661,8 +451,8 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { prefix2 = strjoina(prefix, "\t"); fprintf(f, - "%s-> Unit %s:\n", - prefix, u->id); + "%s%s Unit %s:\n", + prefix, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), u->id); SET_FOREACH(t, u->aliases) fprintf(f, "%s\tAlias: %s\n", prefix, t); @@ -707,23 +497,25 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { } if (UNIT_HAS_CGROUP_CONTEXT(u)) { + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + fprintf(f, "%s\tSlice: %s\n" "%s\tCGroup: %s\n" "%s\tCGroup realized: %s\n", prefix, strna(unit_slice_name(u)), - prefix, strna(u->cgroup_path), - prefix, yes_no(u->cgroup_realized)); + prefix, strna(crt ? crt->cgroup_path : NULL), + prefix, yes_no(crt ? 
crt->cgroup_realized : false)); - if (u->cgroup_realized_mask != 0) { + if (crt && crt->cgroup_realized_mask != 0) { _cleanup_free_ char *s = NULL; - (void) cg_mask_to_string(u->cgroup_realized_mask, &s); + (void) cg_mask_to_string(crt->cgroup_realized_mask, &s); fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s)); } - if (u->cgroup_enabled_mask != 0) { + if (crt && crt->cgroup_enabled_mask != 0) { _cleanup_free_ char *s = NULL; - (void) cg_mask_to_string(u->cgroup_enabled_mask, &s); + (void) cg_mask_to_string(crt->cgroup_enabled_mask, &s); fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s)); } @@ -831,21 +623,26 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { } } - if (!hashmap_isempty(u->requires_mounts_for)) { - UnitDependencyInfo di; - const char *path; + for (UnitMountDependencyType type = 0; type < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; type++) + if (!hashmap_isempty(u->mounts_for[type])) { + UnitDependencyInfo di; + const char *path; - HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { - bool space = false; + HASHMAP_FOREACH_KEY(di.data, path, u->mounts_for[type]) { + bool space = false; - fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path); + fprintf(f, + "%s\t%s: %s (", + prefix, + unit_mount_dependency_type_to_string(type), + path); - print_unit_dependency_mask(f, "origin", di.origin_mask, &space); - print_unit_dependency_mask(f, "destination", di.destination_mask, &space); + print_unit_dependency_mask(f, "origin", di.origin_mask, &space); + print_unit_dependency_mask(f, "destination", di.destination_mask, &space); - fputs(")\n", f); + fputs(")\n", f); + } } - } if (u->load_state == UNIT_LOADED) { diff --git a/src/core/unit.c b/src/core/unit.c index 2fc9f5a..2d40618 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -67,27 +67,29 @@ #endif /* Thresholds for logging at INFO level about resource consumption */ -#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC) -#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL) 
-#define MENTIONWORTHY_IP_BYTES (0ULL) +#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC) +#define MENTIONWORTHY_MEMORY_BYTES (64 * U64_MB) +#define MENTIONWORTHY_IO_BYTES (1 * U64_MB) +#define MENTIONWORTHY_IP_BYTES UINT64_C(0) -/* Thresholds for logging at INFO level about resource consumption */ -#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */ -#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */ -#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */ +/* Thresholds for logging at NOTICE level about resource consumption */ +#define NOTICEWORTHY_CPU_NSEC (10 * NSEC_PER_MINUTE) +#define NOTICEWORTHY_MEMORY_BYTES (512 * U64_MB) +#define NOTICEWORTHY_IO_BYTES (10 * U64_MB) +#define NOTICEWORTHY_IP_BYTES (128 * U64_MB) const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = { - [UNIT_SERVICE] = &service_vtable, - [UNIT_SOCKET] = &socket_vtable, - [UNIT_TARGET] = &target_vtable, - [UNIT_DEVICE] = &device_vtable, - [UNIT_MOUNT] = &mount_vtable, + [UNIT_SERVICE] = &service_vtable, + [UNIT_SOCKET] = &socket_vtable, + [UNIT_TARGET] = &target_vtable, + [UNIT_DEVICE] = &device_vtable, + [UNIT_MOUNT] = &mount_vtable, [UNIT_AUTOMOUNT] = &automount_vtable, - [UNIT_SWAP] = &swap_vtable, - [UNIT_TIMER] = &timer_vtable, - [UNIT_PATH] = &path_vtable, - [UNIT_SLICE] = &slice_vtable, - [UNIT_SCOPE] = &scope_vtable, + [UNIT_SWAP] = &swap_vtable, + [UNIT_TIMER] = &timer_vtable, + [UNIT_PATH] = &path_vtable, + [UNIT_SLICE] = &slice_vtable, + [UNIT_SCOPE] = &scope_vtable, }; Unit* unit_new(Manager *m, size_t size) { @@ -107,29 +109,13 @@ Unit* unit_new(Manager *m, size_t size) { u->unit_file_preset = -1; u->on_failure_job_mode = JOB_REPLACE; u->on_success_job_mode = JOB_FAIL; - u->cgroup_control_inotify_wd = -1; - u->cgroup_memory_inotify_wd = -1; u->job_timeout = USEC_INFINITY; u->job_running_timeout = USEC_INFINITY; u->ref_uid = UID_INVALID; u->ref_gid = GID_INVALID; - u->cpu_usage_last = NSEC_INFINITY; - - 
unit_reset_memory_accounting_last(u); - unit_reset_io_accounting_last(u); - - u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL; u->failure_action_exit_status = u->success_action_exit_status = -1; - u->ip_accounting_ingress_map_fd = -EBADF; - u->ip_accounting_egress_map_fd = -EBADF; - - u->ipv4_allow_map_fd = -EBADF; - u->ipv6_allow_map_fd = -EBADF; - u->ipv4_deny_map_fd = -EBADF; - u->ipv6_deny_map_fd = -EBADF; - u->last_section_private = -1; u->start_ratelimit = (const RateLimit) { @@ -137,7 +123,13 @@ Unit* unit_new(Manager *m, size_t size) { m->defaults.start_limit_burst, }; - u->auto_start_stop_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_SEC, .burst = 16 }; + u->auto_start_stop_ratelimit = (const RateLimit) { + .interval = 10 * USEC_PER_SEC, + .burst = 16 + }; + + unit_reset_memory_accounting_last(u); + unit_reset_io_accounting_last(u); return u; } @@ -251,12 +243,12 @@ int unit_add_name(Unit *u, const char *text) { if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) { if (!u->instance) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), - "instance is not set when adding name '%s': %m", text); + "Instance is not set when adding name '%s'.", text); r = unit_name_replace_instance(text, u->instance, &name); if (r < 0) return log_unit_debug_errno(u, r, - "failed to build instance name from '%s': %m", text); + "Failed to build instance name from '%s': %m", text); } else { name = strdup(text); if (!name) @@ -268,47 +260,47 @@ int unit_add_name(Unit *u, const char *text) { if (hashmap_contains(u->manager->units, name)) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), - "unit already exist when adding name '%s': %m", name); + "Unit already exist when adding name '%s'.", name); if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), - "name '%s' is invalid: %m", name); + "Name '%s' is invalid.", name); t = unit_name_to_type(name); if (t < 0) return log_unit_debug_errno(u, 
SYNTHETIC_ERRNO(EINVAL), - "failed to derive unit type from name '%s': %m", name); + "failed to derive unit type from name '%s'.", name); if (u->type != _UNIT_TYPE_INVALID && t != u->type) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), - "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m", + "Unit type is illegal: u->type(%d) and t(%d) for name '%s'.", u->type, t, name); r = unit_name_to_instance(name, &instance); if (r < 0) - return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name); + return log_unit_debug_errno(u, r, "Failed to extract instance from name '%s': %m", name); if (instance && !unit_type_may_template(t)) - return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name); + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "Templates are not allowed for name '%s'.", name); /* Ensure that this unit either has no instance, or that the instance matches. */ if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance)) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), - "cannot add name %s, the instances don't match (\"%s\" != \"%s\").", + "Cannot add name %s, the instances don't match (\"%s\" != \"%s\").", name, instance, u->instance); if (u->id && !unit_type_may_alias(t)) return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST), - "cannot add name %s, aliases are not allowed for %s units.", + "Cannot add name %s, aliases are not allowed for %s units.", name, unit_type_to_string(t)); if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES) - return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m"); + return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "Cannot add name, manager has too many units."); /* Add name to the global hashmap first, because that's easier to undo */ r = hashmap_put(u->manager->units, name, u); if (r < 0) - return log_unit_debug_errno(u, r, "add unit to hashmap 
failed for name '%s': %m", text); + return log_unit_debug_errno(u, r, "Add unit to hashmap failed for name '%s': %m", text); if (u->id) { r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */ @@ -475,7 +467,7 @@ bool unit_may_gc(Unit *u) { break; case COLLECT_INACTIVE_OR_FAILED: - if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED)) + if (!UNIT_IS_INACTIVE_OR_FAILED(state)) return false; break; @@ -488,16 +480,11 @@ bool unit_may_gc(Unit *u) { if (unit_success_failure_handler_has_jobs(u)) return false; - if (u->cgroup_path) { - /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay - * around. Units with active processes should never be collected. */ - - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path); - if (r < 0) - log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path)); - if (r <= 0) - return false; - } + /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay + * around. Units with active processes should never be collected. 
*/ + r = unit_cgroup_is_empty(u); + if (r <= 0 && r != -ENXIO) + return false; /* ENXIO means: currently not realized */ if (!UNIT_VTABLE(u)->may_gc) return true; @@ -689,38 +676,39 @@ static void unit_remove_transient(Unit *u) { } } -static void unit_free_requires_mounts_for(Unit *u) { +static void unit_free_mounts_for(Unit *u) { assert(u); - for (;;) { - _cleanup_free_ char *path = NULL; + for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) { + for (;;) { + _cleanup_free_ char *path = NULL; + + path = hashmap_steal_first_key(u->mounts_for[t]); + if (!path) + break; - path = hashmap_steal_first_key(u->requires_mounts_for); - if (!path) - break; - else { char s[strlen(path) + 1]; PATH_FOREACH_PREFIX_MORE(s, path) { char *y; Set *x; - x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y); + x = hashmap_get2(u->manager->units_needing_mounts_for[t], s, (void**) &y); if (!x) continue; (void) set_remove(x, u); if (set_isempty(x)) { - (void) hashmap_remove(u->manager->units_requiring_mounts_for, y); + assert_se(hashmap_remove(u->manager->units_needing_mounts_for[t], y)); free(y); set_free(x); } } } - } - u->requires_mounts_for = hashmap_free(u->requires_mounts_for); + u->mounts_for[t] = hashmap_free(u->mounts_for[t]); + } } static void unit_done(Unit *u) { @@ -769,7 +757,7 @@ Unit* unit_free(Unit *u) { u->deserialized_refs = strv_free(u->deserialized_refs); u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation); - unit_free_requires_mounts_for(u); + unit_free_mounts_for(u); SET_FOREACH(t, u->aliases) hashmap_remove_value(u->manager->units, t, u); @@ -801,12 +789,6 @@ Unit* unit_free(Unit *u) { if (u->on_console) manager_unref_console(u->manager); - fdset_free(u->initial_socket_bind_link_fds); -#if BPF_FRAMEWORK - bpf_link_free(u->ipv4_socket_bind_link); - bpf_link_free(u->ipv6_socket_bind_link); -#endif - unit_release_cgroup(u); if (!MANAGER_IS_RELOADING(u->manager)) @@ -863,16 +845,6 @@ Unit* 
unit_free(Unit *u) { bpf_firewall_close(u); - hashmap_free(u->bpf_foreign_by_key); - - bpf_program_free(u->bpf_device_control_installed); - -#if BPF_FRAMEWORK - bpf_link_free(u->restrict_ifaces_ingress_bpf_link); - bpf_link_free(u->restrict_ifaces_egress_bpf_link); -#endif - fdset_free(u->initial_restric_ifaces_link_fds); - condition_free_list(u->conditions); condition_free_list(u->asserts); @@ -902,32 +874,6 @@ FreezerState unit_freezer_state(Unit *u) { return u->freezer_state; } -int unit_freezer_state_kernel(Unit *u, FreezerState *ret) { - char *values[1] = {}; - int r; - - assert(u); - - r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", - STRV_MAKE("frozen"), values); - if (r < 0) - return r; - - r = _FREEZER_STATE_INVALID; - - if (values[0]) { - if (streq(values[0], "0")) - r = FREEZER_RUNNING; - else if (streq(values[0], "1")) - r = FREEZER_FROZEN; - } - - free(values[0]); - *ret = r; - - return 0; -} - UnitActiveState unit_active_state(Unit *u) { assert(u); @@ -1277,20 +1223,24 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { /* Unlike unit_add_dependency() or friends, this always returns 0 on success. */ - if (c->working_directory && !c->working_directory_missing_ok) { - r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE); + if (c->working_directory) { + r = unit_add_mounts_for( + u, + c->working_directory, + UNIT_DEPENDENCY_FILE, + c->working_directory_missing_ok ? 
UNIT_MOUNT_WANTS : UNIT_MOUNT_REQUIRES); if (r < 0) return r; } if (c->root_directory) { - r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); if (r < 0) return r; } if (c->root_image) { - r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); if (r < 0) return r; } @@ -1299,14 +1249,14 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { if (!u->manager->prefix[dt]) continue; - for (size_t i = 0; i < c->directories[dt].n_items; i++) { + FOREACH_ARRAY(i, c->directories[dt].items, c->directories[dt].n_items) { _cleanup_free_ char *p = NULL; - p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path); + p = path_join(u->manager->prefix[dt], i->path); if (!p) return -ENOMEM; - r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, p, UNIT_DEPENDENCY_FILE, UNIT_MOUNT_REQUIRES); if (r < 0) return r; } @@ -1326,16 +1276,11 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { } if (c->private_tmp) { - - /* FIXME: for now we make a special case for /tmp and add a weak dependency on - * tmp.mount so /tmp being masked is supported. However there's no reason to treat - * /tmp specifically and masking other mount units should be handled more - * gracefully too, see PR#16894. */ - r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, "/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); if (r < 0) return r; - r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE); + r = unit_add_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); if (r < 0) return r; @@ -1366,23 +1311,26 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { * is run first. 
*/ if (c->log_namespace) { - _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL; - - r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit); - if (r < 0) - return r; + static const struct { + const char *template; + UnitType type; + } deps[] = { + { "systemd-journald", UNIT_SOCKET, }, + { "systemd-journald-varlink", UNIT_SOCKET, }, + { "systemd-journald-sync", UNIT_SERVICE, }, + }; - r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE); - if (r < 0) - return r; + FOREACH_ELEMENT(i, deps) { + _cleanup_free_ char *unit = NULL; - r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit); - if (r < 0) - return r; + r = unit_name_build_from_type(i->template, c->log_namespace, i->type, &unit); + if (r < 0) + return r; - r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE); - if (r < 0) - return r; + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, unit, true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + } } else { r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE); if (r < 0) @@ -1515,6 +1463,7 @@ int unit_add_default_target_dependency(Unit *u, Unit *target) { static int unit_add_slice_dependencies(Unit *u) { Unit *slice; + assert(u); if (!UNIT_HAS_CGROUP_CONTEXT(u)) @@ -1526,8 +1475,12 @@ static int unit_add_slice_dependencies(Unit *u) { UnitDependencyMask mask = u->type == UNIT_SLICE ? 
UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE; slice = UNIT_GET_SLICE(u); - if (slice) + if (slice) { + if (!IN_SET(slice->freezer_state, FREEZER_RUNNING, FREEZER_THAWING)) + u->freezer_state = FREEZER_FROZEN_BY_PARENT; + return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, slice, true, mask); + } if (unit_has_name(u, SPECIAL_ROOT_SLICE)) return 0; @@ -1536,51 +1489,72 @@ static int unit_add_slice_dependencies(Unit *u) { } static int unit_add_mount_dependencies(Unit *u) { - UnitDependencyInfo di; - const char *path; bool changed = false; int r; assert(u); - HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) { - char prefix[strlen(path) + 1]; + for (UnitMountDependencyType t = 0; t < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX; ++t) { + UnitDependencyInfo di; + const char *path; - PATH_FOREACH_PREFIX_MORE(prefix, path) { - _cleanup_free_ char *p = NULL; - Unit *m; + HASHMAP_FOREACH_KEY(di.data, path, u->mounts_for[t]) { - r = unit_name_from_path(prefix, ".mount", &p); - if (r == -EINVAL) - continue; /* If the path cannot be converted to a mount unit name, then it's - * not manageable as a unit by systemd, and hence we don't need a - * dependency on it. Let's thus silently ignore the issue. */ - if (r < 0) - return r; + char prefix[strlen(ASSERT_PTR(path)) + 1]; - m = manager_get_unit(u->manager, p); - if (!m) { - /* Make sure to load the mount unit if it exists. If so the dependencies on - * this unit will be added later during the loading of the mount unit. */ - (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m); - continue; - } - if (m == u) - continue; + PATH_FOREACH_PREFIX_MORE(prefix, path) { + _cleanup_free_ char *p = NULL; + Unit *m; - if (m->load_state != UNIT_LOADED) - continue; + r = unit_name_from_path(prefix, ".mount", &p); + if (r == -EINVAL) + continue; /* If the path cannot be converted to a mount unit name, + * then it's not manageable as a unit by systemd, and + * hence we don't need a dependency on it. 
Let's thus + * silently ignore the issue. */ + if (r < 0) + return r; - r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask); - if (r < 0) - return r; - changed = changed || r > 0; + m = manager_get_unit(u->manager, p); + if (!m) { + /* Make sure to load the mount unit if it exists. If so the + * dependencies on this unit will be added later during the loading + * of the mount unit. */ + (void) manager_load_unit_prepare( + u->manager, + p, + /* path= */NULL, + /* e= */NULL, + &m); + continue; + } + if (m == u) + continue; - if (m->fragment_path) { - r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask); + if (m->load_state != UNIT_LOADED) + continue; + + r = unit_add_dependency( + u, + UNIT_AFTER, + m, + /* add_reference= */ true, + di.origin_mask); if (r < 0) return r; changed = changed || r > 0; + + if (m->fragment_path) { + r = unit_add_dependency( + u, + unit_mount_dependency_type_to_dependency_type(t), + m, + /* add_reference= */ true, + di.origin_mask); + if (r < 0) + return r; + changed = changed || r > 0; + } } } } @@ -1959,6 +1933,10 @@ int unit_start(Unit *u, ActivationDetails *details) { return unit_start(following, details); } + /* Check to make sure the unit isn't frozen */ + if (u->freezer_state != FREEZER_RUNNING) + return -EDEADLK; + /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */ if (UNIT_VTABLE(u)->can_start) { r = UNIT_VTABLE(u)->can_start(u); @@ -1975,7 +1953,6 @@ int unit_start(Unit *u, ActivationDetails *details) { * waits for a holdoff timer to elapse before it will start again. */ unit_add_to_dbus_queue(u); - unit_cgroup_freezer_action(u, FREEZER_THAW); if (!u->activation_details) /* Older details object wins */ u->activation_details = activation_details_ref(details); @@ -2010,6 +1987,7 @@ bool unit_can_isolate(Unit *u) { * -EBADR: This unit type does not support stopping. * -EALREADY: Unit is already stopped. * -EAGAIN: An operation is already in progress. 
Retry later. + * -EDEADLK: Unit is frozen */ int unit_stop(Unit *u) { UnitActiveState state; @@ -2027,11 +2005,14 @@ int unit_stop(Unit *u) { return unit_stop(following); } + /* Check to make sure the unit isn't frozen */ + if (u->freezer_state != FREEZER_RUNNING) + return -EDEADLK; + if (!UNIT_VTABLE(u)->stop) return -EBADR; unit_add_to_dbus_queue(u); - unit_cgroup_freezer_action(u, FREEZER_THAW); return UNIT_VTABLE(u)->stop(u); } @@ -2056,6 +2037,7 @@ bool unit_can_stop(Unit *u) { * -EBADR: This unit type does not support reloading. * -ENOEXEC: Unit is not started. * -EAGAIN: An operation is already in progress. Retry later. + * -EDEADLK: Unit is frozen. */ int unit_reload(Unit *u) { UnitActiveState state; @@ -2082,6 +2064,10 @@ int unit_reload(Unit *u) { return unit_reload(following); } + /* Check to make sure the unit isn't frozen */ + if (u->freezer_state != FREEZER_RUNNING) + return -EDEADLK; + unit_add_to_dbus_queue(u); if (!UNIT_VTABLE(u)->reload) { @@ -2090,8 +2076,6 @@ int unit_reload(Unit *u) { return 0; } - unit_cgroup_freezer_action(u, FREEZER_THAW); - return UNIT_VTABLE(u)->reload(u); } @@ -2238,16 +2222,16 @@ static void retroactively_start_dependencies(Unit *u) { UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) /* Requires= + BindsTo= */ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) && !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) - manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL); + (void) manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL); UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_FAIL) /* Wants= */ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) && !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other))) - manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL); + (void) manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL); UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) 
/* Conflicts= (and inverse) */ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) - manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); + (void) manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); } static void retroactively_stop_dependencies(Unit *u) { @@ -2259,7 +2243,7 @@ static void retroactively_stop_dependencies(Unit *u) { /* Pull down units which are bound to us recursively if enabled */ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) /* BoundBy= */ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) - manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); + (void) manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL); } void unit_start_on_failure( @@ -2291,7 +2275,7 @@ void unit_start_on_failure( log_unit_warning_errno( u, r, "Failed to enqueue %s job, ignoring: %s", dependency_name, bus_error_message(&error, r)); - n_jobs ++; + n_jobs++; } if (n_jobs >= 0) @@ -2318,273 +2302,179 @@ static int raise_level(int log_level, bool condition_info, bool condition_notice } static int unit_log_resources(Unit *u) { - struct iovec iovec[1 + 2 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4]; - bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false; - _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL; - int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */ - size_t n_message_parts = 0, n_iovec = 0; - char* message_parts[1 + 2 + 2 + 2 + 1], *t; - nsec_t nsec = NSEC_INFINITY; - uint64_t memory_peak = UINT64_MAX, memory_swap_peak = UINT64_MAX; - int r; - const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { - [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES", - [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS", - [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES", - 
[CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS", - }; - const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { - [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES", - [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES", - [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS", - [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS", + + static const struct { + const char *journal_field; + const char *message_suffix; + } memory_fields[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = { + [CGROUP_MEMORY_PEAK] = { "MEMORY_PEAK", "memory peak" }, + [CGROUP_MEMORY_SWAP_PEAK] = { "MEMORY_SWAP_PEAK", "memory swap peak" }, + }, ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = { "IP_METRIC_INGRESS_BYTES", "incoming IP traffic" }, + [CGROUP_IP_EGRESS_BYTES] = { "IP_METRIC_EGRESS_BYTES", "outgoing IP traffic" }, + [CGROUP_IP_INGRESS_PACKETS] = { "IP_METRIC_INGRESS_PACKETS", NULL }, + [CGROUP_IP_EGRESS_PACKETS] = { "IP_METRIC_EGRESS_PACKETS", NULL }, + }, io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IO_READ_BYTES] = { "IO_METRIC_READ_BYTES", "read from disk" }, + [CGROUP_IO_WRITE_BYTES] = { "IO_METRIC_WRITE_BYTES", "written to disk" }, + [CGROUP_IO_READ_OPERATIONS] = { "IO_METRIC_READ_OPERATIONS", NULL }, + [CGROUP_IO_WRITE_OPERATIONS] = { "IO_METRIC_WRITE_OPERATIONS", NULL }, }; + struct iovec *iovec = NULL; + size_t n_iovec = 0; + _cleanup_free_ char *message = NULL, *t = NULL; + nsec_t cpu_nsec = NSEC_INFINITY; + int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */ + assert(u); + CLEANUP_ARRAY(iovec, n_iovec, iovec_array_free); + + iovec = new(struct iovec, 1 + (_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1) + + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4); + if (!iovec) + return log_oom(); + /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource * accounting was enabled for a unit. 
It does this in two ways: a friendly human readable string with reduced * information and the complete data in structured fields. */ - (void) unit_get_cpu_usage(u, &nsec); - if (nsec != NSEC_INFINITY) { + (void) unit_get_cpu_usage(u, &cpu_nsec); + if (cpu_nsec != NSEC_INFINITY) { /* Format the CPU time for inclusion in the structured log message */ - if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) { - r = log_oom(); - goto finish; - } - iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, cpu_nsec) < 0) + return log_oom(); + iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t)); /* Format the CPU time for inclusion in the human language message string */ - t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time"); - if (!t) { - r = log_oom(); - goto finish; - } - - message_parts[n_message_parts++] = t; + if (strextendf_with_separator(&message, ", ", + "Consumed %s CPU time", + FORMAT_TIMESPAN(cpu_nsec / NSEC_PER_USEC, USEC_PER_MSEC)) < 0) + return log_oom(); log_level = raise_level(log_level, - nsec > MENTIONWORTHY_CPU_NSEC, - nsec > NOTICEWORTHY_CPU_NSEC); + cpu_nsec > MENTIONWORTHY_CPU_NSEC, + cpu_nsec > NOTICEWORTHY_CPU_NSEC); } - (void) unit_get_memory_accounting(u, CGROUP_MEMORY_PEAK, &memory_peak); - if (memory_peak != UINT64_MAX) { - /* Format peak memory for inclusion in the structured log message */ - if (asprintf(&t, "MEMORY_PEAK=%" PRIu64, memory_peak) < 0) { - r = log_oom(); - goto finish; - } - iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) { + uint64_t value = UINT64_MAX; - /* Format peak memory for inclusion in the human language message string */ - t = strjoin(FORMAT_BYTES(memory_peak), " memory peak"); - if (!t) { - r = log_oom(); - goto finish; - } - message_parts[n_message_parts++] = t; - } + assert(memory_fields[metric].journal_field); + 
assert(memory_fields[metric].message_suffix); - (void) unit_get_memory_accounting(u, CGROUP_MEMORY_SWAP_PEAK, &memory_swap_peak); - if (memory_swap_peak != UINT64_MAX) { - /* Format peak swap memory for inclusion in the structured log message */ - if (asprintf(&t, "MEMORY_SWAP_PEAK=%" PRIu64, memory_swap_peak) < 0) { - r = log_oom(); - goto finish; - } - iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + (void) unit_get_memory_accounting(u, metric, &value); + if (value == UINT64_MAX) + continue; - /* Format peak swap memory for inclusion in the human language message string */ - t = strjoin(FORMAT_BYTES(memory_swap_peak), " memory swap peak"); - if (!t) { - r = log_oom(); - goto finish; - } - message_parts[n_message_parts++] = t; + if (asprintf(&t, "%s=%" PRIu64, memory_fields[metric].journal_field, value) < 0) + return log_oom(); + iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t)); + + /* If value is 0, we don't log it in the MESSAGE= field. */ + if (value == 0) + continue; + + if (strextendf_with_separator(&message, ", ", "%s %s", + FORMAT_BYTES(value), memory_fields[metric].message_suffix) < 0) + return log_oom(); + + log_level = raise_level(log_level, + value > MENTIONWORTHY_MEMORY_BYTES, + value > NOTICEWORTHY_MEMORY_BYTES); } for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) { uint64_t value = UINT64_MAX; - assert(io_fields[k]); + assert(io_fields[k].journal_field); (void) unit_get_io_accounting(u, k, k > 0, &value); if (value == UINT64_MAX) continue; - have_io_accounting = true; - if (value > 0) - any_io = true; - /* Format IO accounting data for inclusion in the structured log message */ - if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) { - r = log_oom(); - goto finish; - } - iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + if (asprintf(&t, "%s=%" PRIu64, io_fields[k].journal_field, value) < 0) + return log_oom(); + iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t)); + + /* If value is 0, we don't log it in the MESSAGE= field. 
*/ + if (value == 0) + continue; /* Format the IO accounting data for inclusion in the human language message string, but only * for the bytes counters (and not for the operations counters) */ - if (k == CGROUP_IO_READ_BYTES) { - assert(!rr); - rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk"); - if (!rr) { - r = log_oom(); - goto finish; - } - } else if (k == CGROUP_IO_WRITE_BYTES) { - assert(!wr); - wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk"); - if (!wr) { - r = log_oom(); - goto finish; - } - } + if (io_fields[k].message_suffix) { + if (strextendf_with_separator(&message, ", ", "%s %s", + FORMAT_BYTES(value), io_fields[k].message_suffix) < 0) + return log_oom(); - if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES)) log_level = raise_level(log_level, value > MENTIONWORTHY_IO_BYTES, value > NOTICEWORTHY_IO_BYTES); - } - - if (have_io_accounting) { - if (any_io) { - if (rr) - message_parts[n_message_parts++] = TAKE_PTR(rr); - if (wr) - message_parts[n_message_parts++] = TAKE_PTR(wr); - - } else { - char *k; - - k = strdup("no IO"); - if (!k) { - r = log_oom(); - goto finish; - } - - message_parts[n_message_parts++] = k; } } for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { uint64_t value = UINT64_MAX; - assert(ip_fields[m]); + assert(ip_fields[m].journal_field); (void) unit_get_ip_accounting(u, m, &value); if (value == UINT64_MAX) continue; - have_ip_accounting = true; - if (value > 0) - any_traffic = true; - /* Format IP accounting data for inclusion in the structured log message */ - if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) { - r = log_oom(); - goto finish; - } - iovec[n_iovec++] = IOVEC_MAKE_STRING(t); - - /* Format the IP accounting data for inclusion in the human language message string, but only for the - * bytes counters (and not for the packets counters) */ - if (m == CGROUP_IP_INGRESS_BYTES) { - assert(!igress); - igress = strjoin("received ", 
strna(FORMAT_BYTES(value)), " IP traffic"); - if (!igress) { - r = log_oom(); - goto finish; - } - } else if (m == CGROUP_IP_EGRESS_BYTES) { - assert(!egress); - egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic"); - if (!egress) { - r = log_oom(); - goto finish; - } - } + if (asprintf(&t, "%s=%" PRIu64, ip_fields[m].journal_field, value) < 0) + return log_oom(); + iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t)); + + /* If value is 0, we don't log it in the MESSAGE= field. */ + if (value == 0) + continue; + + /* Format the IP accounting data for inclusion in the human language message string, but only + * for the bytes counters (and not for the packets counters) */ + if (ip_fields[m].message_suffix) { + if (strextendf_with_separator(&message, ", ", "%s %s", + FORMAT_BYTES(value), ip_fields[m].message_suffix) < 0) + return log_oom(); - if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) log_level = raise_level(log_level, value > MENTIONWORTHY_IP_BYTES, value > NOTICEWORTHY_IP_BYTES); - } - - /* This check is here because it is the earliest point following all possible log_level assignments. If - * log_level is assigned anywhere after this point, move this check. */ - if (!unit_log_level_test(u, log_level)) { - r = 0; - goto finish; - } - - if (have_ip_accounting) { - if (any_traffic) { - if (igress) - message_parts[n_message_parts++] = TAKE_PTR(igress); - if (egress) - message_parts[n_message_parts++] = TAKE_PTR(egress); - - } else { - char *k; - - k = strdup("no IP traffic"); - if (!k) { - r = log_oom(); - goto finish; - } - - message_parts[n_message_parts++] = k; } } + /* This check is here because it is the earliest point following all possible log_level assignments. + * (If log_level is assigned anywhere after this point, move this check.) */ + if (!unit_log_level_test(u, log_level)) + return 0; + /* Is there any accounting data available at all? 
*/ if (n_iovec == 0) { - r = 0; - goto finish; - } - - if (n_message_parts == 0) - t = strjoina("MESSAGE=", u->id, ": Completed."); - else { - _cleanup_free_ char *joined = NULL; - - message_parts[n_message_parts] = NULL; - - joined = strv_join(message_parts, ", "); - if (!joined) { - r = log_oom(); - goto finish; - } - - joined[0] = ascii_toupper(joined[0]); - t = strjoina("MESSAGE=", u->id, ": ", joined, "."); + assert(!message); + return 0; } - /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them, - * and hence don't increase n_iovec for them */ - iovec[n_iovec] = IOVEC_MAKE_STRING(t); - iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR); - - t = strjoina(u->manager->unit_log_field, u->id); - iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t); - - t = strjoina(u->manager->invocation_log_field, u->invocation_id_string); - iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t); + t = strjoin("MESSAGE=", u->id, ": ", message ?: "Completed", "."); + if (!t) + return log_oom(); + iovec[n_iovec++] = IOVEC_MAKE_STRING(TAKE_PTR(t)); - log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4); - r = 0; + if (!set_iovec_string_field(iovec, &n_iovec, "MESSAGE_ID=", SD_MESSAGE_UNIT_RESOURCES_STR)) + return log_oom(); -finish: - free_many_charp(message_parts, n_message_parts); + if (!set_iovec_string_field(iovec, &n_iovec, u->manager->unit_log_field, u->id)) + return log_oom(); - for (size_t i = 0; i < n_iovec; i++) - free(iovec[i].iov_base); + if (!set_iovec_string_field(iovec, &n_iovec, u->manager->invocation_log_field, u->invocation_id_string)) + return log_oom(); - return r; + log_unit_struct_iovec(u, log_level, iovec, n_iovec); + return 0; } static void unit_update_on_console(Unit *u) { @@ -2796,12 +2686,14 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su unit_emit_audit_start(u); manager_send_unit_plymouth(m, u); + manager_send_unit_supervisor(m, u, /* active= */ true); 
} if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) { /* This unit just stopped/failed. */ unit_emit_audit_stop(u, ns); + manager_send_unit_supervisor(m, u, /* active= */ false); unit_log_resources(u); } @@ -2859,7 +2751,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su } } -int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive) { +int unit_watch_pidref(Unit *u, const PidRef *pid, bool exclusive) { _cleanup_(pidref_freep) PidRef *pid_dup = NULL; int r; @@ -2943,7 +2835,7 @@ int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) { return unit_watch_pidref(u, &pidref, exclusive); } -void unit_unwatch_pidref(Unit *u, PidRef *pid) { +void unit_unwatch_pidref(Unit *u, const PidRef *pid) { assert(u); assert(pidref_is_set(pid)); @@ -3005,6 +2897,16 @@ void unit_unwatch_all_pids(Unit *u) { u->pids = set_free(u->pids); } +void unit_unwatch_pidref_done(Unit *u, PidRef *pidref) { + assert(u); + + if (!pidref_is_set(pidref)) + return; + + unit_unwatch_pidref(u, pidref); + pidref_done(pidref); +} + static void unit_tidy_watch_pids(Unit *u) { PidRef *except1, *except2, *e; @@ -3030,7 +2932,7 @@ static int on_rewatch_pids_event(sd_event_source *s, void *userdata) { assert(s); unit_tidy_watch_pids(u); - unit_watch_all_pids(u); + (void) unit_watch_all_pids(u); /* If the PID set is empty now, then let's finish this off. 
*/ unit_synthesize_cgroup_empty_event(u); @@ -3043,7 +2945,8 @@ int unit_enqueue_rewatch_pids(Unit *u) { assert(u); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) return -ENOENT; r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); @@ -3063,7 +2966,7 @@ int unit_enqueue_rewatch_pids(Unit *u) { if (r < 0) return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m"); - r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + r = sd_event_source_set_priority(s, EVENT_PRIORITY_REWATCH_PIDS); if (r < 0) return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m"); @@ -3288,8 +3191,8 @@ int unit_add_dependency( if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES)) return 0; - /* Note that ordering a device unit after a unit is permitted since it allows to start its job - * running timeout at a specific time. */ + /* Note that ordering a device unit after a unit is permitted since it allows its job running + * timeout to be started at a specific time. 
*/ if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) { log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id); return 0; @@ -3529,8 +3432,11 @@ int unit_set_slice(Unit *u, Unit *slice) { return 0; /* Disallow slice changes if @u is already bound to cgroups */ - if (UNIT_GET_SLICE(u) && u->cgroup_realized) - return -EBUSY; + if (UNIT_GET_SLICE(u)) { + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_realized) + return -EBUSY; + } /* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */ if (UNIT_GET_SLICE(u)) @@ -4019,28 +3925,25 @@ void unit_notify_cgroup_oom(Unit *u, bool managed_oom) { UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom); } -static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) { - _cleanup_set_free_ Set *pid_set = NULL; +static int unit_pid_set(Unit *u, Set **pid_set) { int r; - pid_set = set_new(NULL); - if (!pid_set) - return NULL; + assert(u); + assert(pid_set); + + set_clear(*pid_set); /* This updates input. 
*/ /* Exclude the main/control pids from being killed via the cgroup */ - if (main_pid > 0) { - r = set_put(pid_set, PID_TO_PTR(main_pid)); - if (r < 0) - return NULL; - } - if (control_pid > 0) { - r = set_put(pid_set, PID_TO_PTR(control_pid)); - if (r < 0) - return NULL; - } + PidRef *pid; + FOREACH_ARGUMENT(pid, unit_main_pid(u), unit_control_pid(u)) + if (pidref_is_set(pid)) { + r = set_ensure_put(pid_set, NULL, PID_TO_PTR(pid->pid)); + if (r < 0) + return r; + } - return TAKE_PTR(pid_set); + return 0; } static int kill_common_log(const PidRef *pid, int signo, void *userdata) { @@ -4074,13 +3977,55 @@ static int kill_or_sigqueue(PidRef* pidref, int signo, int code, int value) { } } +static int unit_kill_one( + Unit *u, + PidRef *pidref, + const char *type, + int signo, + int code, + int value, + sd_bus_error *ret_error) { + + int r; + + assert(u); + assert(type); + + if (!pidref_is_set(pidref)) + return 0; + + _cleanup_free_ char *comm = NULL; + (void) pidref_get_comm(pidref, &comm); + + r = kill_or_sigqueue(pidref, signo, code, value); + if (r == -ESRCH) + return 0; + if (r < 0) { + /* Report this failure both to the logs and to the client */ + if (ret_error) + sd_bus_error_set_errnof( + ret_error, r, + "Failed to send signal SIG%s to %s process " PID_FMT " (%s): %m", + signal_to_string(signo), type, pidref->pid, strna(comm)); + + return log_unit_warning_errno( + u, r, + "Failed to send signal SIG%s to %s process " PID_FMT " (%s) on client request: %m", + signal_to_string(signo), type, pidref->pid, strna(comm)); + } + + log_unit_info(u, "Sent signal SIG%s to %s process " PID_FMT " (%s) on client request.", + signal_to_string(signo), type, pidref->pid, strna(comm)); + return 1; /* killed */ +} + int unit_kill( Unit *u, KillWho who, int signo, int code, int value, - sd_bus_error *error) { + sd_bus_error *ret_error) { PidRef *main_pid, *control_pid; bool killed = false; @@ -4100,110 +4045,71 @@ int unit_kill( control_pid = unit_control_pid(u); if 
(!UNIT_HAS_CGROUP_CONTEXT(u) && !main_pid && !control_pid) - return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing."); + return sd_bus_error_setf(ret_error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing."); if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) { if (!main_pid) - return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type)); + return sd_bus_error_setf(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type)); if (!pidref_is_set(main_pid)) - return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill"); + return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill"); } if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) { if (!control_pid) - return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type)); + return sd_bus_error_setf(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type)); if (!pidref_is_set(control_pid)) - return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill"); + return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill"); } - if (pidref_is_set(control_pid) && - IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) { - _cleanup_free_ char *comm = NULL; - (void) pidref_get_comm(control_pid, &comm); - - r = kill_or_sigqueue(control_pid, signo, code, value); - if (r < 0) { - ret = r; - - /* Report this failure both to the logs and to the client */ - sd_bus_error_set_errnof( - error, r, - "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m", - signal_to_string(signo), control_pid->pid, strna(comm)); - log_unit_warning_errno( - u, r, - "Failed to send signal SIG%s to control process " 
PID_FMT " (%s) on client request: %m", - signal_to_string(signo), control_pid->pid, strna(comm)); - } else { - log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.", - signal_to_string(signo), control_pid->pid, strna(comm)); - killed = true; - } + if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) { + r = unit_kill_one(u, control_pid, "control", signo, code, value, ret_error); + RET_GATHER(ret, r); + killed = killed || r > 0; } - if (pidref_is_set(main_pid) && - IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) { - _cleanup_free_ char *comm = NULL; - (void) pidref_get_comm(main_pid, &comm); - - r = kill_or_sigqueue(main_pid, signo, code, value); - if (r < 0) { - if (ret == 0) { - ret = r; - - sd_bus_error_set_errnof( - error, r, - "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m", - signal_to_string(signo), main_pid->pid, strna(comm)); - } - - log_unit_warning_errno( - u, r, - "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m", - signal_to_string(signo), main_pid->pid, strna(comm)); - - } else { - log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.", - signal_to_string(signo), main_pid->pid, strna(comm)); - killed = true; - } + if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) { + r = unit_kill_one(u, main_pid, "main", signo, code, value, ret >= 0 ? 
ret_error : NULL); + RET_GATHER(ret, r); + killed = killed || r > 0; } /* Note: if we shall enqueue rather than kill we won't do this via the cgroup mechanism, since it * doesn't really make much sense (and given that enqueued values are a relatively expensive * resource, and we shouldn't allow us to be subjects for such allocation sprees) */ - if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path && code == SI_USER) { - _cleanup_set_free_ Set *pid_set = NULL; + if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && code == SI_USER) { + CGroupRuntime *crt = unit_get_cgroup_runtime(u); - /* Exclude the main/control pids from being killed via the cgroup */ - pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0); - if (!pid_set) - return log_oom(); + if (crt && crt->cgroup_path) { + _cleanup_set_free_ Set *pid_set = NULL; - r = cg_kill_recursive(u->cgroup_path, signo, 0, pid_set, kill_common_log, u); - if (r < 0) { - if (!IN_SET(r, -ESRCH, -ENOENT)) { - if (ret == 0) { - ret = r; + /* Exclude the main/control pids from being killed via the cgroup */ + r = unit_pid_set(u, &pid_set); + if (r < 0) + return log_oom(); + r = cg_kill_recursive(crt->cgroup_path, signo, 0, pid_set, kill_common_log, u); + if (r < 0 && !IN_SET(r, -ESRCH, -ENOENT)) { + if (ret >= 0) sd_bus_error_set_errnof( - error, r, + ret_error, r, "Failed to send signal SIG%s to auxiliary processes: %m", signal_to_string(signo)); - } log_unit_warning_errno( u, r, "Failed to send signal SIG%s to auxiliary processes on client request: %m", signal_to_string(signo)); + + RET_GATHER(ret, r); } - } else - killed = true; + + killed = killed || r >= 0; + } } /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */ - if (ret == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL)) - return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill"); + if (ret >= 0 && !killed 
&& IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL)) + return sd_bus_error_set_const(ret_error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill"); return ret; } @@ -4316,6 +4222,21 @@ static int user_from_unit_name(Unit *u, char **ret) { return 0; } +static int unit_verify_contexts(const Unit *u, const ExecContext *ec) { + assert(u); + + if (!ec) + return 0; + + if (MANAGER_IS_USER(u->manager) && ec->dynamic_user) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "DynamicUser= enabled for user unit, which is not supported. Refusing."); + + if (ec->dynamic_user && ec->working_directory_home) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory=~ is not allowed under DynamicUser=yes. Refusing."); + + return 0; +} + int unit_patch_contexts(Unit *u) { CGroupContext *cc; ExecContext *ec; @@ -4337,16 +4258,14 @@ int unit_patch_contexts(Unit *u) { return -ENOMEM; } - if (MANAGER_IS_USER(u->manager) && - !ec->working_directory) { - + if (MANAGER_IS_USER(u->manager) && !ec->working_directory) { r = get_home_dir(&ec->working_directory); if (r < 0) return r; - /* Allow user services to run, even if the - * home directory is missing */ - ec->working_directory_missing_ok = true; + if (!ec->working_directory_home) + /* If home directory is implied by us, allow it to be missing. 
*/ + ec->working_directory_missing_ok = true; } if (ec->private_devices) @@ -4390,8 +4309,8 @@ int unit_patch_contexts(Unit *u) { ec->restrict_suid_sgid = true; } - for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) - exec_directory_sort(ec->directories + dt); + FOREACH_ARRAY(d, ec->directories, _EXEC_DIRECTORY_TYPE_MAX) + exec_directory_sort(d); } cc = unit_get_cgroup_context(u); @@ -4441,7 +4360,7 @@ int unit_patch_contexts(Unit *u) { } } - return 0; + return unit_verify_contexts(u, ec); } ExecContext *unit_get_exec_context(const Unit *u) { @@ -4458,7 +4377,7 @@ ExecContext *unit_get_exec_context(const Unit *u) { return (ExecContext*) ((uint8_t*) u + offset); } -KillContext *unit_get_kill_context(Unit *u) { +KillContext *unit_get_kill_context(const Unit *u) { size_t offset; assert(u); @@ -4472,7 +4391,7 @@ KillContext *unit_get_kill_context(Unit *u) { return (KillContext*) ((uint8_t*) u + offset); } -CGroupContext *unit_get_cgroup_context(Unit *u) { +CGroupContext *unit_get_cgroup_context(const Unit *u) { size_t offset; if (u->type < 0) @@ -4485,7 +4404,7 @@ CGroupContext *unit_get_cgroup_context(Unit *u) { return (CGroupContext*) ((uint8_t*) u + offset); } -ExecRuntime *unit_get_exec_runtime(Unit *u) { +ExecRuntime *unit_get_exec_runtime(const Unit *u) { size_t offset; if (u->type < 0) @@ -4498,6 +4417,19 @@ ExecRuntime *unit_get_exec_runtime(Unit *u) { return *(ExecRuntime**) ((uint8_t*) u + offset); } +CGroupRuntime *unit_get_cgroup_runtime(const Unit *u) { + size_t offset; + + if (u->type < 0) + return NULL; + + offset = UNIT_VTABLE(u)->cgroup_runtime_offset; + if (offset <= 0) + return NULL; + + return *(CGroupRuntime**) ((uint8_t*) u + offset); +} + static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) { assert(u); @@ -4820,26 +4752,57 @@ static int operation_to_signal( } } -int unit_kill_context( +static int unit_kill_context_one( Unit *u, - KillContext *c, - KillOperation k, - PidRef* main_pid, - PidRef* control_pid, - bool 
main_pid_alien) { + const PidRef *pidref, + const char *type, + bool is_alien, + int sig, + bool send_sighup, + cg_kill_log_func_t log_func) { + int r; + + assert(u); + assert(type); + + /* This returns > 0 if it makes sense to wait for SIGCHLD for the process, == 0 if not. */ + + if (!pidref_is_set(pidref)) + return 0; + + if (log_func) + log_func(pidref, sig, u); + + r = pidref_kill_and_sigcont(pidref, sig); + if (r == -ESRCH) + return !is_alien; + if (r < 0) { + _cleanup_free_ char *comm = NULL; + + (void) pidref_get_comm(pidref, &comm); + return log_unit_warning_errno(u, r, "Failed to kill %s process " PID_FMT " (%s), ignoring: %m", type, pidref->pid, strna(comm)); + } + + if (send_sighup) + (void) pidref_kill(pidref, SIGHUP); + + return !is_alien; +} + +int unit_kill_context(Unit *u, KillOperation k) { bool wait_for_exit = false, send_sighup; cg_kill_log_func_t log_func = NULL; int sig, r; assert(u); - assert(c); /* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0 * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common() * which is used for user-requested killing of unit processes. 
*/ - if (c->kill_mode == KILL_NONE) + KillContext *c = unit_get_kill_context(u); + if (!c || c->kill_mode == KILL_NONE) return 0; bool noteworthy; @@ -4852,61 +4815,33 @@ int unit_kill_context( IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) && sig != SIGHUP; - if (pidref_is_set(main_pid)) { - if (log_func) - log_func(main_pid, sig, u); - - r = pidref_kill_and_sigcont(main_pid, sig); - if (r < 0 && r != -ESRCH) { - _cleanup_free_ char *comm = NULL; - (void) pidref_get_comm(main_pid, &comm); + bool is_alien; + PidRef *main_pid = unit_main_pid_full(u, &is_alien); + r = unit_kill_context_one(u, main_pid, "main", is_alien, sig, send_sighup, log_func); + wait_for_exit = wait_for_exit || r > 0; - log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid->pid, strna(comm)); - } else { - if (!main_pid_alien) - wait_for_exit = true; + r = unit_kill_context_one(u, unit_control_pid(u), "control", /* is_alien = */ false, sig, send_sighup, log_func); + wait_for_exit = wait_for_exit || r > 0; - if (r != -ESRCH && send_sighup) - (void) pidref_kill(main_pid, SIGHUP); - } - } - - if (pidref_is_set(control_pid)) { - if (log_func) - log_func(control_pid, sig, u); - - r = pidref_kill_and_sigcont(control_pid, sig); - if (r < 0 && r != -ESRCH) { - _cleanup_free_ char *comm = NULL; - (void) pidref_get_comm(control_pid, &comm); - - log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid->pid, strna(comm)); - } else { - wait_for_exit = true; - - if (r != -ESRCH && send_sighup) - (void) pidref_kill(control_pid, SIGHUP); - } - } - - if (u->cgroup_path && + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && crt->cgroup_path && (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) { _cleanup_set_free_ Set *pid_set = NULL; /* Exclude the main/control pids from being killed via the cgroup */ - pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? 
control_pid->pid : 0); - if (!pid_set) - return -ENOMEM; + r = unit_pid_set(u, &pid_set); + if (r < 0) + return r; r = cg_kill_recursive( - u->cgroup_path, + crt->cgroup_path, sig, CGROUP_SIGCONT|CGROUP_IGNORE_SELF, pid_set, log_func, u); if (r < 0) { if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT)) - log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path)); + log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(crt->cgroup_path)); } else if (r > 0) { @@ -4922,14 +4857,12 @@ int unit_kill_context( wait_for_exit = true; if (send_sighup) { - set_free(pid_set); - - pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0); - if (!pid_set) - return -ENOMEM; + r = unit_pid_set(u, &pid_set); + if (r < 0) + return r; (void) cg_kill_recursive( - u->cgroup_path, + crt->cgroup_path, SIGHUP, CGROUP_IGNORE_SELF, pid_set, @@ -4942,11 +4875,16 @@ int unit_kill_context( return wait_for_exit; } -int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) { +int unit_add_mounts_for(Unit *u, const char *path, UnitDependencyMask mask, UnitMountDependencyType type) { + Hashmap **unit_map, **manager_map; int r; assert(u); assert(path); + assert(type >= 0 && type < _UNIT_MOUNT_DEPENDENCY_TYPE_MAX); + + unit_map = &u->mounts_for[type]; + manager_map = &u->manager->units_needing_mounts_for[type]; /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these * paths in the unit (from the path to the UnitDependencyInfo structure indicating how to the @@ -4956,7 +4894,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) if (!path_is_absolute(path)) return -EINVAL; - if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */ + if (hashmap_contains(*unit_map, path)) /* Exit quickly if the path is already covered. 
*/ return 0; /* Use the canonical form of the path as the stored key. We call path_is_normalized() @@ -4975,7 +4913,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) .origin_mask = mask }; - r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data); + r = hashmap_ensure_put(unit_map, &path_hash_ops, p, di.data); if (r < 0) return r; assert(r > 0); @@ -4985,11 +4923,11 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) PATH_FOREACH_PREFIX_MORE(prefix, path) { Set *x; - x = hashmap_get(u->manager->units_requiring_mounts_for, prefix); + x = hashmap_get(*manager_map, prefix); if (!x) { _cleanup_free_ char *q = NULL; - r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops); + r = hashmap_ensure_allocated(manager_map, &path_hash_ops); if (r < 0) return r; @@ -5001,7 +4939,7 @@ int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) if (!x) return -ENOMEM; - r = hashmap_put(u->manager->units_requiring_mounts_for, q, x); + r = hashmap_put(*manager_map, q, x); if (r < 0) { set_free(x); return r; @@ -5035,8 +4973,7 @@ int unit_setup_exec_runtime(Unit *u) { if (*rt) return 0; - ec = unit_get_exec_context(u); - assert(ec); + ec = ASSERT_PTR(unit_get_exec_context(u)); r = unit_get_transitive_dependency_set(u, UNIT_ATOM_JOINS_NAMESPACE_OF, &units); if (r < 0) @@ -5073,6 +5010,21 @@ int unit_setup_exec_runtime(Unit *u) { return r; } +CGroupRuntime *unit_setup_cgroup_runtime(Unit *u) { + size_t offset; + + assert(u); + + offset = UNIT_VTABLE(u)->cgroup_runtime_offset; + assert(offset > 0); + + CGroupRuntime **rt = (CGroupRuntime**) ((uint8_t*) u + offset); + if (*rt) + return *rt; + + return (*rt = cgroup_runtime_new()); +} + bool unit_type_supported(UnitType t) { static int8_t cache[_UNIT_TYPE_MAX] = {}; /* -1: disabled, 1: enabled: 0: don't know */ int r; @@ -5178,12 +5130,14 @@ PidRef* unit_control_pid(Unit *u) { return NULL; } -PidRef* 
unit_main_pid(Unit *u) { +PidRef* unit_main_pid_full(Unit *u, bool *ret_is_alien) { assert(u); if (UNIT_VTABLE(u)->main_pid) - return UNIT_VTABLE(u)->main_pid(u); + return UNIT_VTABLE(u)->main_pid(u, ret_is_alien); + if (ret_is_alien) + *ret_is_alien = false; return NULL; } @@ -5393,7 +5347,6 @@ int unit_acquire_invocation_id(Unit *u) { } int unit_set_exec_params(Unit *u, ExecParameters *p) { - const char *confirm_spawn; int r; assert(u); @@ -5406,19 +5359,17 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { p->runtime_scope = u->manager->runtime_scope; - confirm_spawn = manager_get_confirm_spawn(u->manager); - if (confirm_spawn) { - p->confirm_spawn = strdup(confirm_spawn); - if (!p->confirm_spawn) - return -ENOMEM; - } + r = strdup_to(&p->confirm_spawn, manager_get_confirm_spawn(u->manager)); + if (r < 0) + return r; p->cgroup_supported = u->manager->cgroup_supported; p->prefix = u->manager->prefix; SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager)); /* Copy parameters from unit */ - p->cgroup_path = u->cgroup_path; + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + p->cgroup_path = crt ? crt->cgroup_path : NULL; SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u)); p->received_credentials_directory = u->manager->received_credentials_directory; @@ -5428,17 +5379,18 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { p->fallback_smack_process_label = u->manager->defaults.smack_process_label; - if (u->manager->restrict_fs && p->bpf_outer_map_fd < 0) { - int fd = lsm_bpf_map_restrict_fs_fd(u); + if (u->manager->restrict_fs && p->bpf_restrict_fs_map_fd < 0) { + int fd = bpf_restrict_fs_map_fd(u); if (fd < 0) return fd; - p->bpf_outer_map_fd = fd; + p->bpf_restrict_fs_map_fd = fd; } p->user_lookup_fd = u->manager->user_lookup_fds[1]; + p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1]; - p->cgroup_id = u->cgroup_id; + p->cgroup_id = crt ? 
crt->cgroup_id : 0; p->invocation_id = u->invocation_id; sd_id128_to_string(p->invocation_id, p->invocation_id_string); p->unit_id = strdup(u->id); @@ -5460,6 +5412,10 @@ int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) { (void) unit_realize_cgroup(u); + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + return -ENOMEM; + r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid); if (r < 0) return r; @@ -5482,10 +5438,10 @@ int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) { (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE); (void) ignore_signals(SIGPIPE); - if (u->cgroup_path) { - r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL); + if (crt->cgroup_path) { + r = cg_attach_everywhere(u->manager->cgroup_supported, crt->cgroup_path, 0, NULL, NULL); if (r < 0) { - log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path)); + log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(crt->cgroup_path)); _exit(EXIT_CGROUP); } } @@ -5880,9 +5836,10 @@ int unit_prepare_exec(Unit *u) { (void) unit_realize_cgroup(u); - if (u->reset_accounting) { + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (crt && crt->reset_accounting) { (void) unit_reset_accounting(u); - u->reset_accounting = false; + crt->reset_accounting = false; } unit_export_state_files(u); @@ -5942,11 +5899,13 @@ int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) { (void) unit_pick_cgroup_path(u); - if (!u->cgroup_path) + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + + if (!crt || !crt->cgroup_path) return 0; return cg_kill_recursive( - u->cgroup_path, + crt->cgroup_path, /* sig= */ 0, /* flags= */ 0, /* set= */ NULL, @@ -5976,7 +5935,7 @@ bool unit_needs_console(Unit *u) { return exec_context_may_touch_console(ec); } -int unit_pid_attachable(Unit *u, PidRef *pid, sd_bus_error *error) { +int unit_pid_attachable(Unit *u, 
const PidRef *pid, sd_bus_error *error) { int r; assert(u); @@ -6213,19 +6172,98 @@ bool unit_can_isolate_refuse_manual(Unit *u) { return unit_can_isolate(u) && !u->refuse_manual_start; } +void unit_next_freezer_state(Unit *u, FreezerAction action, FreezerState *ret, FreezerState *ret_target) { + Unit *slice; + FreezerState curr, parent, next, tgt; + + assert(u); + assert(IN_SET(action, FREEZER_FREEZE, FREEZER_PARENT_FREEZE, + FREEZER_THAW, FREEZER_PARENT_THAW)); + assert(ret); + assert(ret_target); + + /* This function determines the correct freezer state transitions for a unit + * given the action being requested. It returns the next state, and also the "target", + * which is either FREEZER_FROZEN or FREEZER_RUNNING, depending on what actual state we + * ultimately want to achieve. */ + + curr = u->freezer_state; + slice = UNIT_GET_SLICE(u); + if (slice) + parent = slice->freezer_state; + else + parent = FREEZER_RUNNING; + + if (action == FREEZER_FREEZE) { + /* We always "promote" a freeze initiated by parent into a normal freeze */ + if (IN_SET(curr, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT)) + next = FREEZER_FROZEN; + else + next = FREEZER_FREEZING; + } else if (action == FREEZER_THAW) { + /* Thawing is the most complicated operation here, because we can't thaw a unit + * if its parent is frozen. 
So we instead "demote" a normal freeze into a freeze + * initiated by parent if the parent is frozen */ + if (IN_SET(curr, FREEZER_RUNNING, FREEZER_THAWING, FREEZER_FREEZING_BY_PARENT, FREEZER_FROZEN_BY_PARENT)) + next = curr; + else if (curr == FREEZER_FREEZING) { + if (IN_SET(parent, FREEZER_RUNNING, FREEZER_THAWING)) + next = FREEZER_THAWING; + else + next = FREEZER_FREEZING_BY_PARENT; + } else { + assert(curr == FREEZER_FROZEN); + if (IN_SET(parent, FREEZER_RUNNING, FREEZER_THAWING)) + next = FREEZER_THAWING; + else + next = FREEZER_FROZEN_BY_PARENT; + } + } else if (action == FREEZER_PARENT_FREEZE) { + /* We need to avoid accidentally demoting units frozen manually */ + if (IN_SET(curr, FREEZER_FREEZING, FREEZER_FROZEN, FREEZER_FROZEN_BY_PARENT)) + next = curr; + else + next = FREEZER_FREEZING_BY_PARENT; + } else { + assert(action == FREEZER_PARENT_THAW); + + /* We don't want to thaw units from a parent if they were frozen + * manually, so for such units this action is a no-op */ + if (IN_SET(curr, FREEZER_RUNNING, FREEZER_FREEZING, FREEZER_FROZEN)) + next = curr; + else + next = FREEZER_THAWING; + } + + tgt = freezer_state_finish(next); + if (tgt == FREEZER_FROZEN_BY_PARENT) + tgt = FREEZER_FROZEN; + assert(IN_SET(tgt, FREEZER_RUNNING, FREEZER_FROZEN)); + + *ret = next; + *ret_target = tgt; +} + bool unit_can_freeze(Unit *u) { assert(u); + if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE)) + return false; + if (UNIT_VTABLE(u)->can_freeze) return UNIT_VTABLE(u)->can_freeze(u); - return UNIT_VTABLE(u)->freeze; + return UNIT_VTABLE(u)->freezer_action; } void unit_frozen(Unit *u) { assert(u); - u->freezer_state = FREEZER_FROZEN; + u->freezer_state = u->freezer_state == FREEZER_FREEZING_BY_PARENT + ? 
FREEZER_FROZEN_BY_PARENT + : FREEZER_FROZEN; + + log_unit_debug(u, "Unit now %s.", freezer_state_to_string(u->freezer_state)); bus_unit_send_pending_freezer_message(u, false); } @@ -6235,19 +6273,19 @@ void unit_thawed(Unit *u) { u->freezer_state = FREEZER_RUNNING; + log_unit_debug(u, "Unit thawed."); + bus_unit_send_pending_freezer_message(u, false); } -static int unit_freezer_action(Unit *u, FreezerAction action) { +int unit_freezer_action(Unit *u, FreezerAction action) { UnitActiveState s; - int (*method)(Unit*); int r; assert(u); assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW)); - method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw; - if (!method || !cg_freezer_supported()) + if (!cg_freezer_supported() || !unit_can_freeze(u)) return -EOPNOTSUPP; if (u->job) @@ -6260,36 +6298,21 @@ static int unit_freezer_action(Unit *u, FreezerAction action) { if (s != UNIT_ACTIVE) return -EHOSTDOWN; - if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) || - (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW)) + if (action == FREEZER_FREEZE && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT)) return -EALREADY; + if (action == FREEZER_THAW && u->freezer_state == FREEZER_THAWING) + return -EALREADY; + if (action == FREEZER_THAW && IN_SET(u->freezer_state, FREEZER_FREEZING_BY_PARENT, FREEZER_FROZEN_BY_PARENT)) + return -ECHILD; - r = method(u); + r = UNIT_VTABLE(u)->freezer_action(u, action); if (r <= 0) return r; - assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)); - + assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_FREEZING_BY_PARENT, FREEZER_THAWING)); return 1; } -int unit_freeze(Unit *u) { - return unit_freezer_action(u, FREEZER_FREEZE); -} - -int unit_thaw(Unit *u) { - return unit_freezer_action(u, FREEZER_THAW); -} - -/* Wrappers around low-level cgroup freezer operations common for service and scope units */ -int 
unit_freeze_vtable_common(Unit *u) { - return unit_cgroup_freezer_action(u, FREEZER_FREEZE); -} - -int unit_thaw_vtable_common(Unit *u) { - return unit_cgroup_freezer_action(u, FREEZER_THAW); -} - Condition *unit_find_failed_condition(Unit *u) { Condition *failed_trigger = NULL; bool has_succeeded_trigger = false; @@ -6310,7 +6333,7 @@ Condition *unit_find_failed_condition(Unit *u) { } static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { - [COLLECT_INACTIVE] = "inactive", + [COLLECT_INACTIVE] = "inactive", [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed", }; @@ -6460,7 +6483,7 @@ int unit_compare_priority(Unit *a, Unit *b) { } const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = { - [UNIT_PATH] = &activation_details_path_vtable, + [UNIT_PATH] = &activation_details_path_vtable, [UNIT_TIMER] = &activation_details_timer_vtable, }; @@ -6596,11 +6619,7 @@ int activation_details_append_pair(ActivationDetails *details, char ***strv) { return 0; if (!isempty(details->trigger_unit_name)) { - r = strv_extend(strv, "trigger_unit"); - if (r < 0) - return r; - - r = strv_extend(strv, details->trigger_unit_name); + r = strv_extend_many(strv, "trigger_unit", details->trigger_unit_name); if (r < 0) return r; } @@ -6615,3 +6634,24 @@ int activation_details_append_pair(ActivationDetails *details, char ***strv) { } DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free); + +static const char* const unit_mount_dependency_type_table[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX] = { + [UNIT_MOUNT_WANTS] = "WantsMountsFor", + [UNIT_MOUNT_REQUIRES] = "RequiresMountsFor", +}; + +DEFINE_STRING_TABLE_LOOKUP(unit_mount_dependency_type, UnitMountDependencyType); + +UnitDependency unit_mount_dependency_type_to_dependency_type(UnitMountDependencyType t) { + switch (t) { + + case UNIT_MOUNT_WANTS: + return UNIT_WANTS; + + case UNIT_MOUNT_REQUIRES: + return UNIT_REQUIRES; + + default: + assert_not_reached(); + } +} diff --git 
a/src/core/unit.h b/src/core/unit.h index 60bc2e3..b135fec 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once +#include #include #include #include @@ -8,6 +9,14 @@ #include "sd-id128.h" +/* Circular dependency with manager.h, needs to be defined before local includes */ +typedef enum UnitMountDependencyType { + UNIT_MOUNT_WANTS, + UNIT_MOUNT_REQUIRES, + _UNIT_MOUNT_DEPENDENCY_TYPE_MAX, + _UNIT_MOUNT_DEPENDENCY_TYPE_INVALID = -EINVAL, +} UnitMountDependencyType; + #include "bpf-program.h" #include "cgroup.h" #include "condition.h" @@ -55,7 +64,11 @@ static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) { } static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) { - return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED; + return t >= 0 && t < _UNIT_LOAD_STATE_MAX && !IN_SET(t, UNIT_STUB, UNIT_MERGED); +} + +static inline bool UNIT_IS_LOAD_ERROR(UnitLoadState t) { + return IN_SET(t, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR); } /* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We @@ -199,6 +212,7 @@ struct UnitRef { LIST_FIELDS(UnitRef, refs_by_target); }; +/* The generic, dynamic definition of the unit */ typedef struct Unit { Manager *manager; @@ -216,9 +230,9 @@ typedef struct Unit { * Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */ Hashmap *dependencies; - /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the - * UnitDependencyInfo type */ - Hashmap *requires_mounts_for; + /* Similar, for RequiresMountsFor= and WantsMountsFor= path dependencies. 
The key is the path, the + * value the UnitDependencyInfo type */ + Hashmap *mounts_for[_UNIT_MOUNT_DEPENDENCY_TYPE_MAX]; char *description; char **documentation; @@ -361,74 +375,6 @@ typedef struct Unit { UnitFileState unit_file_state; PresetAction unit_file_preset; - /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ - nsec_t cpu_usage_base; - nsec_t cpu_usage_last; /* the most recently read value */ - - /* Most recently read value of memory accounting metrics */ - uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1]; - - /* The current counter of OOM kills initiated by systemd-oomd */ - uint64_t managed_oom_kill_last; - - /* The current counter of the oom_kill field in the memory.events cgroup attribute */ - uint64_t oom_kill_last; - - /* Where the io.stat data was at the time the unit was started */ - uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; - uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */ - - /* Counterparts in the cgroup filesystem */ - char *cgroup_path; - uint64_t cgroup_id; - CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */ - CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? 
(only relevant on cgroup v2) */ - CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */ - CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ - - /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ - int cgroup_control_inotify_wd; - int cgroup_memory_inotify_wd; - - /* Device Controller BPF program */ - BPFProgram *bpf_device_control_installed; - - /* IP BPF Firewalling/accounting */ - int ip_accounting_ingress_map_fd; - int ip_accounting_egress_map_fd; - uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; - - int ipv4_allow_map_fd; - int ipv6_allow_map_fd; - int ipv4_deny_map_fd; - int ipv6_deny_map_fd; - BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; - BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; - - Set *ip_bpf_custom_ingress; - Set *ip_bpf_custom_ingress_installed; - Set *ip_bpf_custom_egress; - Set *ip_bpf_custom_egress_installed; - - /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd, - * attached to unit cgroup by provided program fd and attach type. */ - Hashmap *bpf_foreign_by_key; - - FDSet *initial_socket_bind_link_fds; -#if BPF_FRAMEWORK - /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and - * responsible for allowing or denying a unit to bind(2) to a socket - * address. */ - struct bpf_link *ipv4_socket_bind_link; - struct bpf_link *ipv6_socket_bind_link; -#endif - - FDSet *initial_restric_ifaces_link_fds; -#if BPF_FRAMEWORK - struct bpf_link *restrict_ifaces_ingress_bpf_link; - struct bpf_link *restrict_ifaces_egress_bpf_link; -#endif - /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new * ones which might have appeared. 
*/ sd_event_source *rewatch_pids_event_source; @@ -499,12 +445,6 @@ typedef struct Unit { bool in_audit:1; bool on_console:1; - bool cgroup_realized:1; - bool cgroup_members_mask_valid:1; - - /* Reset cgroup accounting next time we fork something off */ - bool reset_accounting:1; - bool start_limit_hit:1; /* Did we already invoke unit_coldplug() for this unit? */ @@ -520,9 +460,6 @@ typedef struct Unit { bool exported_log_ratelimit_interval:1; bool exported_log_ratelimit_burst:1; - /* Whether we warned about clamping the CPU quota period */ - bool warned_clamping_cpu_quota_period:1; - /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */ signed int last_section_private:2; @@ -568,6 +505,7 @@ static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) { #include "kill.h" +/* The static const, immutable data about a specific unit type */ typedef struct UnitVTable { /* How much memory does an object of this unit type need */ size_t object_size; @@ -584,11 +522,14 @@ typedef struct UnitVTable { * KillContext is found, if the unit type has that */ size_t kill_context_offset; - /* If greater than 0, the offset into the object where the - * pointer to ExecSharedRuntime is found, if the unit type has - * that */ + /* If greater than 0, the offset into the object where the pointer to ExecRuntime is found, if + * the unit type has that */ size_t exec_runtime_offset; + /* If greater than 0, the offset into the object where the pointer to CGroupRuntime is found, if the + * unit type has that */ + size_t cgroup_runtime_offset; + /* The name of the configuration file section with the private settings of this unit */ const char *private_section; @@ -633,9 +574,9 @@ typedef struct UnitVTable { /* Clear out the various runtime/state/cache/logs/configuration data */ int (*clean)(Unit *u, ExecCleanMask m); - /* Freeze the unit */ - int 
(*freeze)(Unit *u); - int (*thaw)(Unit *u); + /* Freeze or thaw the unit. Returns > 0 to indicate that the request will be handled asynchronously; unit_frozen + * or unit_thawed should be called once the operation is done. Returns 0 if done successfully, or < 0 on error. */ + int (*freezer_action)(Unit *u, FreezerAction a); bool (*can_freeze)(Unit *u); /* Return which kind of data can be cleaned */ @@ -691,6 +632,9 @@ typedef struct UnitVTable { /* Called whenever a process of this unit sends us a message */ void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds); + /* Called whenever we learn a handoff timestamp */ + void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts); + /* Called whenever a name this Unit registered for comes or goes away. */ void (*bus_name_owner_change)(Unit *u, const char *new_owner); @@ -722,10 +666,10 @@ typedef struct UnitVTable { /* Returns the start timeout of a unit */ usec_t (*get_timeout_start_usec)(Unit *u); - /* Returns the main PID if there is any defined, or 0. */ - PidRef* (*main_pid)(Unit *u); + /* Returns the main PID if there is any defined, or NULL. */ + PidRef* (*main_pid)(Unit *u, bool *ret_is_alien); - /* Returns the control PID if there is any defined, or 0. */ + /* Returns the control PID if there is any defined, or NULL. 
*/ PidRef* (*control_pid)(Unit *u); /* Returns true if the unit currently needs access to the console */ @@ -794,6 +738,9 @@ typedef struct UnitVTable { /* If true, we'll notify plymouth about this unit */ bool notify_plymouth; + /* If true, we'll notify a surrounding VMM/container manager about this unit becoming available */ + bool notify_supervisor; + /* The audit events to generate on start + stop (or 0 if none shall be generated) */ int audit_start_message_type; int audit_stop_message_type; @@ -903,7 +850,6 @@ bool unit_has_name(const Unit *u, const char *name); UnitActiveState unit_active_state(Unit *u); FreezerState unit_freezer_state(Unit *u); -int unit_freezer_state_kernel(Unit *u, FreezerState *ret); const char* unit_sub_state_to_string(Unit *u); @@ -916,17 +862,18 @@ int unit_start(Unit *u, ActivationDetails *details); int unit_stop(Unit *u); int unit_reload(Unit *u); -int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *error); +int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *ret_error); void unit_notify_cgroup_oom(Unit *u, bool managed_oom); void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success); -int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive); +int unit_watch_pidref(Unit *u, const PidRef *pid, bool exclusive); int unit_watch_pid(Unit *u, pid_t pid, bool exclusive); -void unit_unwatch_pidref(Unit *u, PidRef *pid); +void unit_unwatch_pidref(Unit *u, const PidRef *pid); void unit_unwatch_pid(Unit *u, pid_t pid); void unit_unwatch_all_pids(Unit *u); +void unit_unwatch_pidref_done(Unit *u, PidRef *pidref); int unit_enqueue_rewatch_pids(Unit *u); void unit_dequeue_rewatch_pids(Unit *u); @@ -984,12 +931,14 @@ void unit_ref_unset(UnitRef *ref); int unit_patch_contexts(Unit *u); ExecContext *unit_get_exec_context(const Unit *u) _pure_; -KillContext *unit_get_kill_context(Unit *u) _pure_; -CGroupContext *unit_get_cgroup_context(Unit *u) _pure_; +KillContext 
*unit_get_kill_context(const Unit *u) _pure_; +CGroupContext *unit_get_cgroup_context(const Unit *u) _pure_; -ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_; +ExecRuntime *unit_get_exec_runtime(const Unit *u) _pure_; +CGroupRuntime *unit_get_cgroup_runtime(const Unit *u) _pure_; int unit_setup_exec_runtime(Unit *u); +CGroupRuntime *unit_setup_cgroup_runtime(Unit *u); const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf); char* unit_concat_strv(char **l, UnitWriteFlags flags); @@ -997,11 +946,11 @@ char* unit_concat_strv(char **l, UnitWriteFlags flags); int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data); int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5); -int unit_kill_context(Unit *u, KillContext *c, KillOperation k, PidRef *main_pid, PidRef *control_pid, bool main_pid_alien); +int unit_kill_context(Unit *u, KillOperation k); int unit_make_transient(Unit *u); -int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask); +int unit_add_mounts_for(Unit *u, const char *path, UnitDependencyMask mask, UnitMountDependencyType type); bool unit_type_supported(UnitType t); @@ -1012,7 +961,10 @@ bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit); bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit); PidRef* unit_control_pid(Unit *u); -PidRef* unit_main_pid(Unit *u); +PidRef* unit_main_pid_full(Unit *u, bool *ret_is_alien); +static inline PidRef* unit_main_pid(Unit *u) { + return unit_main_pid_full(u, NULL); +} void unit_warn_if_dir_nonempty(Unit *u, const char* where); int unit_fail_if_noncanonical(Unit *u, const char* where); @@ -1046,7 +998,7 @@ int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func); bool unit_needs_console(Unit *u); -int unit_pid_attachable(Unit *unit, PidRef *pid, sd_bus_error *error); +int unit_pid_attachable(Unit *unit, const PidRef *pid, sd_bus_error *error); static inline 
bool unit_has_job_type(Unit *u, JobType type) { return u && u->job && u->job->type == type; @@ -1086,21 +1038,21 @@ bool unit_can_stop_refuse_manual(Unit *u); bool unit_can_isolate_refuse_manual(Unit *u); bool unit_can_freeze(Unit *u); -int unit_freeze(Unit *u); +int unit_freezer_action(Unit *u, FreezerAction action); +void unit_next_freezer_state(Unit *u, FreezerAction a, FreezerState *ret, FreezerState *ret_tgt); void unit_frozen(Unit *u); - -int unit_thaw(Unit *u); void unit_thawed(Unit *u); -int unit_freeze_vtable_common(Unit *u); -int unit_thaw_vtable_common(Unit *u); - Condition *unit_find_failed_condition(Unit *u); int unit_arm_timer(Unit *u, sd_event_source **source, bool relative, usec_t usec, sd_event_time_handler_t handler); int unit_compare_priority(Unit *a, Unit *b); +UnitMountDependencyType unit_mount_dependency_type_from_string(const char *s) _const_; +const char* unit_mount_dependency_type_to_string(UnitMountDependencyType t) _const_; +UnitDependency unit_mount_dependency_type_to_dependency_type(UnitMountDependencyType t) _pure_; + /* Macros which append UNIT= or USER_UNIT= to the message */ #define log_unit_full_errno_zerook(unit, level, error, ...) \ -- cgit v1.2.3