Diffstat (limited to 'src/shared/cgroup-setup.c')
-rw-r--r-- | src/shared/cgroup-setup.c | 1008
1 file changed, 1008 insertions, 0 deletions
diff --git a/src/shared/cgroup-setup.c b/src/shared/cgroup-setup.c new file mode 100644 index 0000000..934a16e --- /dev/null +++ b/src/shared/cgroup-setup.c @@ -0,0 +1,1008 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "missing_threads.h" +#include "mkdir.h" +#include "parse-util.h" +#include "path-util.h" +#include "proc-cmdline.h" +#include "process-util.h" +#include "recurse-dir.h" +#include "stdio-util.h" +#include "string-util.h" +#include "user-util.h" +#include "virt.h" + +static int cg_any_controller_used_for_v1(void) { + _cleanup_free_ char *buf = NULL; + _cleanup_strv_free_ char **lines = NULL; + int r; + + r = read_full_virtual_file("/proc/cgroups", &buf, NULL); + if (r < 0) + return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m"); + + r = strv_split_newlines_full(&lines, buf, 0); + if (r < 0) + return r; + + /* The intention of this is to check if the fully unified cgroup tree setup is possible, meaning all + * enabled kernel cgroup controllers are currently not in use by cgroup1. For reference: + * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups- + * + * Note that this is typically only useful to check inside a container where we don't know what + * cgroup tree setup is in use by the host; if the host is using legacy or hybrid, we can't use + * unified since some or all controllers would be missing. This is not the best way to detect this, + * as whatever container manager created our container should have mounted /sys/fs/cgroup + * appropriately, but in case that wasn't done, we try to detect if it's possible for us to use + * unified cgroups. */ + STRV_FOREACH(line, lines) { + _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL; + + /* Skip header line */ + if (startswith(*line, "#")) + continue; + + const char *p = *line; + r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled, NULL); + if (r < 0) + return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m"); + else if (r < 4) { + log_debug("Invalid /proc/cgroups line, ignoring."); + continue; + } + + /* Ignore disabled controllers. */ + if (streq(enabled, "0")) + continue; + + /* Ignore controllers we don't care about. */ + if (cgroup_controller_from_string(name) < 0) + continue; + + /* Since the unified cgroup doesn't use multiple hierarchies, if any controller has a + * non-zero hierarchy_id that means it's in use already in a legacy (or hybrid) cgroup v1 + * hierarchy, and can't be used in a unified cgroup. */ + if (!streq(hierarchy_id, "0")) { + log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name); + return 1; + } + } + + return 0; +} + +bool cg_is_unified_wanted(void) { + static thread_local int wanted = -1; + bool b; + const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL; + _cleanup_free_ char *c = NULL; + int r; + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* If the hierarchy is already mounted, then follow whatever was chosen for it. */ + r = cg_unified_cached(true); + if (r >= 0) + return (wanted = r >= CGROUP_UNIFIED_ALL); + + /* If we were explicitly passed systemd.unified_cgroup_hierarchy, respect that. 
*/ + r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &b); + if (r > 0) + return (wanted = b); + + /* If we passed cgroup_no_v1=all with no other instructions, it seems highly unlikely that we want to + * use hybrid or legacy hierarchy. */ + r = proc_cmdline_get_key("cgroup_no_v1", 0, &c); + if (r > 0 && streq_ptr(c, "all")) + return (wanted = true); + + /* If any controller is in use as v1, don't use unified. */ + if (cg_any_controller_used_for_v1() > 0) + return (wanted = false); + + return (wanted = is_default); +} + +bool cg_is_legacy_wanted(void) { + static thread_local int wanted = -1; + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* Check if we have cgroup v2 already mounted. */ + if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) + return (wanted = false); + + /* Otherwise, assume that at least partial legacy is wanted, + * since cgroup v2 should already be mounted at this point. */ + return (wanted = true); +} + +bool cg_is_hybrid_wanted(void) { + static thread_local int wanted = -1; + int r; + bool b; + const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD; + /* We default to true if the default is "hybrid", obviously, but also when the default is "unified", + * because if we get called, it means that unified hierarchy was not mounted. */ + + /* If we have a cached value, return that. */ + if (wanted >= 0) + return wanted; + + /* If the hierarchy is already mounted, then follow whatever was chosen for it. */ + if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) + return (wanted = false); + + /* Otherwise, let's see what the kernel command line has to say. Since checking is expensive, cache + * a non-error result. */ + r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &b); + + /* The meaning of the kernel option is reversed wrt. the return value of this function, hence the + * negation. */ + return (wanted = r > 0 ?
!b : is_default); +} + +int cg_weight_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_WEIGHT_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +int cg_cpu_weight_parse(const char *s, uint64_t *ret) { + if (streq_ptr(s, "idle")) + return *ret = CGROUP_WEIGHT_IDLE; + return cg_weight_parse(s, ret); +} + +int cg_cpu_shares_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_CPU_SHARES_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +int cg_blkio_weight_parse(const char *s, uint64_t *ret) { + uint64_t u; + int r; + + if (isempty(s)) { + *ret = CGROUP_BLKIO_WEIGHT_INVALID; + return 0; + } + + r = safe_atou64(s, &u); + if (r < 0) + return r; + + if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX) + return -ERANGE; + + *ret = u; + return 0; +} + +static int trim_cb( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + /* Failures to delete inner cgroup we ignore (but debug log in case error code is unexpected) */ + if (event == RECURSE_DIR_LEAVE && + de->d_type == DT_DIR && + unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 && + !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY)) + log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path); + + return RECURSE_DIR_CONTINUE; +} + +int cg_trim(const char *controller, const char *path, bool delete_root) { + _cleanup_free_ char *fs = NULL; + int r, q; + + assert(path); + assert(controller); + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = recurse_dir_at( + AT_FDCWD, + fs, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_ENSURE_TYPE, + trim_cb, + NULL); + if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */ + r = 0; + else if (r < 0) + log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path); + + /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is + * already gone anyway). Also, let's debug log about this failure, except if the error code is an + * expected one. */ + if (delete_root && !empty_or_root(path) && + rmdir(fs) < 0 && errno != ENOENT) { + if (!IN_SET(errno, ENOTEMPTY, EBUSY)) + log_debug_errno(errno, "Failed to trim cgroup %s: %m", path); + if (r >= 0) + r = -errno; + } + + q = cg_hybrid_unified(); + if (q < 0) + return q; + if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) + (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root); + + return r; +} + +/* Create a cgroup in the hierarchy of controller. + * Returns 0 if the group already existed, 1 on success, negative otherwise. 
+ */ +int cg_create(const char *controller, const char *path) { + _cleanup_free_ char *fs = NULL; + int r; + + r = cg_get_path_and_check(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = mkdir_parents(fs, 0755); + if (r < 0) + return r; + + r = RET_NERRNO(mkdir(fs, 0755)); + if (r == -EEXIST) + return 0; + if (r < 0) + return r; + + r = cg_hybrid_unified(); + if (r < 0) + return r; + + if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path); + if (r < 0) + log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path); + } + + return 1; +} + +int cg_create_and_attach(const char *controller, const char *path, pid_t pid) { + int r, q; + + assert(pid >= 0); + + r = cg_create(controller, path); + if (r < 0) + return r; + + q = cg_attach(controller, path, pid); + if (q < 0) + return q; + + /* This does not remove the cgroup on failure */ + return r; +} + +int cg_attach(const char *controller, const char *path, pid_t pid) { + _cleanup_free_ char *fs = NULL; + char c[DECIMAL_STR_MAX(pid_t) + 2]; + int r; + + assert(path); + assert(pid >= 0); + + r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs); + if (r < 0) + return r; + + if (pid == 0) + pid = getpid_cached(); + + xsprintf(c, PID_FMT "\n", pid); + + r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0) + /* When the threaded mode is used, we cannot read/write the file. Let's return recognizable error. */ + return -EUCLEAN; + if (r < 0) + return r; + + r = cg_hybrid_unified(); + if (r < 0) + return r; + + if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid); + if (r < 0) + log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path); + } + + return 0; +} + +int cg_attach_fallback(const char *controller, const char *path, pid_t pid) { + int r; + + assert(controller); + assert(path); + assert(pid >= 0); + + r = cg_attach(controller, path, pid); + if (r < 0) { + char prefix[strlen(path) + 1]; + + /* This didn't work? 
Then let's try all prefixes of + * the destination */ + + PATH_FOREACH_PREFIX(prefix, path) { + int q; + + q = cg_attach(controller, prefix, pid); + if (q >= 0) + return q; + } + } + + return r; +} + +int cg_set_access( + const char *controller, + const char *path, + uid_t uid, + gid_t gid) { + + struct Attribute { + const char *name; + bool fatal; + }; + + /* cgroup v1, aka legacy/non-unified */ + static const struct Attribute legacy_attributes[] = { + { "cgroup.procs", true }, + { "tasks", false }, + { "cgroup.clone_children", false }, + {}, + }; + + /* cgroup v2, aka unified */ + static const struct Attribute unified_attributes[] = { + { "cgroup.procs", true }, + { "cgroup.subtree_control", true }, + { "cgroup.threads", false }, + { "memory.oom.group", false }, + { "memory.reclaim", false }, + {}, + }; + + static const struct Attribute* const attributes[] = { + [false] = legacy_attributes, + [true] = unified_attributes, + }; + + _cleanup_free_ char *fs = NULL; + const struct Attribute *i; + int r, unified; + + assert(path); + + if (uid == UID_INVALID && gid == GID_INVALID) + return 0; + + unified = cg_unified_controller(controller); + if (unified < 0) + return unified; + + /* Configure access to the cgroup itself */ + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + r = chmod_and_chown(fs, 0755, uid, gid); + if (r < 0) + return r; + + /* Configure access to the cgroup's attributes */ + for (i = attributes[unified]; i->name; i++) { + fs = mfree(fs); + + r = cg_get_path(controller, path, i->name, &fs); + if (r < 0) + return r; + + r = chmod_and_chown(fs, 0644, uid, gid); + if (r < 0) { + if (i->fatal) + return r; + + log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs); + } + } + + if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { + r = cg_hybrid_unified(); + if (r < 0) + return r; + if (r > 0) { + /* Always propagate access mode from unified to legacy controller */ + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid); + if (r < 0) + log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path); + } + } + + return 0; +} + +struct access_callback_data { + uid_t uid; + gid_t gid; + int error; +}; + +static int access_callback( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + struct access_callback_data *d = ASSERT_PTR(userdata); + + if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY)) + return RECURSE_DIR_CONTINUE; + + assert(inode_fd >= 0); + + /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */ + if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) { + log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path)); + + if (d->error == 0) /* Return first error to caller */ + d->error = errno; + } + + return RECURSE_DIR_CONTINUE; +} + +int cg_set_access_recursive( + const char *controller, + const char *path, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *fs = NULL; + int r; + + /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files, + * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to + * delegate, and cg_set_access_recursive() for any subcgroups you might want to create below it.
*/ + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY); + if (fd < 0) + return -errno; + + struct access_callback_data d = { + .uid = uid, + .gid = gid, + }; + + r = recurse_dir(fd, + fs, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL, + access_callback, + &d); + if (r < 0) + return r; + + return -d.error; +} + +int cg_migrate( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + bool done = false; + _cleanup_set_free_ Set *s = NULL; + int r, ret = 0; + pid_t my_pid; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + s = set_new(NULL); + if (!s) + return -ENOMEM; + + my_pid = getpid_cached(); + + do { + _cleanup_fclose_ FILE *f = NULL; + pid_t pid = 0; + done = true; + + r = cg_enumerate_processes(cfrom, pfrom, &f); + if (r < 0) { + if (ret >= 0 && r != -ENOENT) + return r; + + return ret; + } + + while ((r = cg_read_pid(f, &pid)) > 0) { + + /* This might do weird stuff if we aren't a + * single-threaded program. However, we + * luckily know we are not */ + if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid) + continue; + + if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid)) + continue; + + /* Ignore kernel threads. Since they can only + * exist in the root cgroup, we only check for + * them there. */ + if (cfrom && + empty_or_root(pfrom) && + pid_is_kernel_thread(pid) > 0) + continue; + + r = cg_attach(cto, pto, pid); + if (r < 0) { + if (ret >= 0 && r != -ESRCH) + ret = r; + } else if (ret == 0) + ret = 1; + + done = false; + + r = set_put(s, PID_TO_PTR(pid)); + if (r < 0) { + if (ret >= 0) + return r; + + return ret; + } + } + + if (r < 0) { + if (ret >= 0) + return r; + + return ret; + } + } while (!done); + + return ret; +} + +int cg_migrate_recursive( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + _cleanup_closedir_ DIR *d = NULL; + int r, ret = 0; + char *fn; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + ret = cg_migrate(cfrom, pfrom, cto, pto, flags); + + r = cg_enumerate_subgroups(cfrom, pfrom, &d); + if (r < 0) { + if (ret >= 0 && r != -ENOENT) + return r; + + return ret; + } + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = path_join(empty_to_root(pfrom), fn); + free(fn); + if (!p) + return -ENOMEM; + + r = cg_migrate_recursive(cfrom, p, cto, pto, flags); + if (r != 0 && ret >= 0) + ret = r; + } + + if (r < 0 && ret >= 0) + ret = r; + + if (flags & CGROUP_REMOVE) { + r = cg_rmdir(cfrom, pfrom); + if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY)) + return r; + } + + return ret; +} + +int cg_migrate_recursive_fallback( + const char *cfrom, + const char *pfrom, + const char *cto, + const char *pto, + CGroupFlags flags) { + + int r; + + assert(cfrom); + assert(pfrom); + assert(cto); + assert(pto); + + r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags); + if (r < 0) { + char prefix[strlen(pto) + 1]; + + /* This didn't work? 
Then let's try all prefixes of the destination */ + + PATH_FOREACH_PREFIX(prefix, pto) { + int q; + + q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags); + if (q >= 0) + return q; + } + } + + return r; +} + +int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) { + CGroupController c; + CGroupMask done; + bool created; + int r; + + /* This one will create a cgroup in our private tree, but also + * duplicate it in the trees specified in mask, and remove it + * in all others. + * + * Returns 0 if the group already existed in the systemd hierarchy, + * 1 on success, negative otherwise. + */ + + /* First create the cgroup in our own hierarchy. */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path); + if (r < 0) + return r; + created = r; + + /* If we are in the unified hierarchy, we are done now */ + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + return created; + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + /* Otherwise, do the same in the other hierarchies */ + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *n; + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + n = cgroup_controller_to_string(c); + if (FLAGS_SET(mask, bit)) + (void) cg_create(n, path); + + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return created; +} + +int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) { + int r; + + r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid); + if (r < 0) + return r; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r > 0) + return 0; + + supported &= CGROUP_MASK_V1; + CGroupMask done = 0; + + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *p = NULL; + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + if (path_callback) + p = path_callback(bit, userdata); + if (!p) + p = path; + + (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid); + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return 0; +} + +int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) { + CGroupController c; + CGroupMask done; + int r = 0, q; + + assert(to_callback); + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *to = NULL; + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + if (!FLAGS_SET(mask, bit)) + continue; + + to = to_callback(bit, userdata); + + /* Remember first error and try continuing */ + q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0); + r = (r < 0) ? 
r : q; + + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return r; +} + +int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) { + int r, q; + + r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root); + if (r < 0) + return r; + + q = cg_all_unified(); + if (q < 0) + return q; + if (q > 0) + return r; + + return cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root); +} + +int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) { + CGroupController c; + CGroupMask done; + int r = 0, q; + + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; + + if (FLAGS_SET(mask, bit)) { + /* Remember first error and try continuing */ + q = cg_trim(cgroup_controller_to_string(c), path, delete_root); + r = (r < 0) ? r : q; + } + done |= CGROUP_MASK_EXTEND_JOINED(bit); + } + + return r; +} + +int cg_enable_everywhere( + CGroupMask supported, + CGroupMask mask, + const char *p, + CGroupMask *ret_result_mask) { + + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *fs = NULL; + CGroupController c; + CGroupMask ret = 0; + int r; + + assert(p); + + if (supported == 0) { + if (ret_result_mask) + *ret_result_mask = 0; + return 0; + } + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) { + /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim + * complete success right away. (If you wonder why we return the full mask here, rather than zero: the + * caller tends to use the returned mask later on to compare if all controllers were properly joined, + * and if not requeues realization. This use is the primary purpose of the return value, hence let's + * minimize surprises here and reduce triggers for re-realization by always saying we fully + * succeeded.) */ + if (ret_result_mask) + *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with + * CGROUP_MASK_V2: The 'supported' mask + * might contain pure-V1 or BPF + * controllers, and we never want to + * claim that we could enable those with + * cgroup.subtree_control */ + return 0; + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs); + if (r < 0) + return r; + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *n; + + if (!FLAGS_SET(CGROUP_MASK_V2, bit)) + continue; + + if (!FLAGS_SET(supported, bit)) + continue; + + n = cgroup_controller_to_string(c); + { + char s[1 + strlen(n) + 1]; + + s[0] = FLAGS_SET(mask, bit) ? '+' : '-'; + strcpy(s + 1, n); + + if (!f) { + f = fopen(fs, "we"); + if (!f) + return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); + } + + r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) { + log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", + FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); + clearerr(f); + + /* If we can't turn off a controller, leave it on in the reported resulting mask. This + * happens for example when we attempt to turn off a controller up in the tree that is + * used down in the tree.
*/ + if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY + * only here, and not follow the same logic + * for other errors such as EINVAL or + * EOPNOTSUPP or anything else. That's + * because EBUSY indicates that the + * controller is currently enabled and + * cannot be disabled because something down + * the hierarchy is still using it. Any other + * error most likely means something like "I + * never heard of this controller" or + * similar. In the former case it's hence + * safe to assume the controller is still on + * after the failed operation, while in the + * latter case it's safer to assume the + * controller is unknown and hence certainly + * not enabled. */ + ret |= bit; + } else { + /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */ + if (FLAGS_SET(mask, bit)) + ret |= bit; + } + } + + /* Let's return the precise set of controllers now enabled for the cgroup. */ + if (ret_result_mask) + *ret_result_mask = ret; + + return 0; +}
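Several of the helpers in this file come down to small, stable kernel interfaces that can be exercised outside of systemd. The /proc/cgroups check in cg_any_controller_used_for_v1(), for instance, rests on that file's fixed row format (#subsys_name, hierarchy, num_cgroups, enabled): a non-zero hierarchy ID on an enabled controller means cgroup v1 has claimed it. A minimal standalone sketch of the same test, using plain stdio instead of read_full_virtual_file() and the strv helpers, and skipping the cgroup_controller_from_string() filter (so, unlike the code above, it considers every controller the kernel lists):

#include <errno.h>
#include <stdio.h>

/* Returns 1 if any enabled controller is bound to a cgroup v1 hierarchy,
 * 0 if none is, negative errno if /proc/cgroups cannot be opened. */
static int any_controller_used_for_v1(void) {
        FILE *f = fopen("/proc/cgroups", "re");
        if (!f)
                return -errno;

        char line[256];
        int used = 0;

        while (fgets(line, sizeof(line), f)) {
                char name[64];
                unsigned hierarchy_id, num_cgroups, enabled;

                if (line[0] == '#') /* header line */
                        continue;

                if (sscanf(line, "%63s %u %u %u", name, &hierarchy_id, &num_cgroups, &enabled) != 4)
                        continue; /* malformed line, skipped as above */

                if (enabled != 0 && hierarchy_id != 0) {
                        used = 1; /* claimed by a legacy hierarchy */
                        break;
                }
        }

        fclose(f);
        return used;
}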
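cg_trim() has to delete the tree bottom-up: a cgroup directory cannot be rmdir()ed while it still has child cgroups, which is why trim_cb() acts on RECURSE_DIR_LEAVE, i.e. only once the walk has finished with a directory's contents. Outside of systemd's recurse_dir() API, the same post-order visit falls out of nftw() with FTW_DEPTH. A sketch (the helper name, the descriptor budget of 64 and the flag choices are all just picks for the example):

#define _XOPEN_SOURCE 700 /* for nftw() */
#include <ftw.h>
#include <sys/stat.h>
#include <unistd.h>

static int rm_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
        /* FTW_DP: a directory whose children have all been visited already. */
        if (typeflag == FTW_DP && ftwbuf->level > 0) /* keep the top-level group, like delete_root=false */
                (void) rmdir(path); /* ENOTEMPTY/EBUSY ignored, as in trim_cb() */

        return 0; /* continue the walk */
}

static void trim_cgroup_tree(const char *fs_path) {
        /* fs_path is a filesystem path as cg_get_path() would return it,
         * e.g. /sys/fs/cgroup/foo.slice */
        (void) nftw(fs_path, rm_cb, 64, FTW_DEPTH|FTW_PHYS|FTW_MOUNT);
}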
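At the kernel interface, the attach operation in cg_attach() is just the PID plus a newline written into the target group's cgroup.procs file; everything else in the function is path resolution, the PID-0-means-self convenience and the hybrid-hierarchy compat step. A standalone sketch of the core write (the helper name and the assumption that cgroup2 is mounted on /sys/fs/cgroup are the example's own):

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

/* Move pid into the cgroup at relpath, relative to the cgroup2 mount.
 * The kernel treats a written "0" as "the calling process". */
static int attach_pid(const char *relpath, pid_t pid) {
        char fn[512];

        snprintf(fn, sizeof(fn), "/sys/fs/cgroup/%s/cgroup.procs", relpath);

        FILE *f = fopen(fn, "we");
        if (!f)
                return -errno;

        fprintf(f, "%ld\n", (long) pid);

        /* The kernel rejects the move on the write itself (ESRCH, EOPNOTSUPP, ...),
         * which surfaces at the latest when the stream is flushed on close. */
        return fclose(f) == EOF ? -errno : 0;
}

A sketch like this simply propagates the raw error; cg_attach() above additionally translates -EOPNOTSUPP into -EUCLEAN when the group is in threaded mode, where the kernel refuses process writes to cgroup.procs.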
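Controller delegation in cg_enable_everywhere() happens through cgroup.subtree_control of the parent: "+name" enables, "-name" disables a controller for the children. The kernel documents that a combined token list either fully succeeds or fully fails, so writing one controller per write(), as the loop above does, lets a failure be pinned to an individual controller. A single toggle in isolation (helper name and mount point are assumptions of the sketch):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Enable (enable != 0) or disable one controller for the children of the
 * cgroup at relpath, e.g. set_controller("foo.slice", "memory", 1). */
static int set_controller(const char *relpath, const char *name, int enable) {
        char fn[512], tok[64];

        snprintf(fn, sizeof(fn), "/sys/fs/cgroup/%s/cgroup.subtree_control", relpath);
        snprintf(tok, sizeof(tok), "%c%s", enable ? '+' : '-', name);

        int fd = open(fn, O_WRONLY|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        int r = write(fd, tok, strlen(tok)) < 0 ? -errno : 0;
        close(fd);

        /* As in cg_enable_everywhere(): -EBUSY on a disable means some
         * descendant still has the controller enabled. */
        return r;
}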
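Finally, the /proc/self/fd/ remark in access_callback() deserves a note: recurse_dir() hands the callback O_PATH file descriptors, and fchown() fails with EBADF on those, but chown() on the magic symlink /proc/self/fd/<n> follows it to the underlying inode and works. FORMAT_PROC_FD_PATH() merely formats that path. The trick in isolation (the helper name is the example's):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* chown an inode for which only an O_PATH fd is held. */
static int chown_opath(int opath_fd, uid_t uid, gid_t gid) {
        char p[64];

        snprintf(p, sizeof(p), "/proc/self/fd/%d", opath_fd);
        return chown(p, uid, gid) < 0 ? -errno : 0;
}

/* Usage sketch:
 *
 *     int fd = open("/sys/fs/cgroup/foo.slice", O_PATH|O_CLOEXEC|O_DIRECTORY);
 *     if (fd >= 0)
 *             (void) chown_opath(fd, 1000, 1000);
 */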