diff options
Diffstat (limited to '')
-rw-r--r-- | src/oom/meson.build | 34 | ||||
-rw-r--r-- | src/oom/oomctl.c | 139 | ||||
-rw-r--r-- | src/oom/oomd-manager-bus.c | 52 | ||||
-rw-r--r-- | src/oom/oomd-manager-bus.h | 8 | ||||
-rw-r--r-- | src/oom/oomd-manager.c | 855 | ||||
-rw-r--r-- | src/oom/oomd-manager.h | 73 | ||||
-rw-r--r-- | src/oom/oomd-util.c | 646 | ||||
-rw-r--r-- | src/oom/oomd-util.h | 145 | ||||
-rw-r--r-- | src/oom/oomd.c | 201 | ||||
-rw-r--r-- | src/oom/oomd.conf | 20 | ||||
-rw-r--r-- | src/oom/org.freedesktop.oom1.conf | 47 | ||||
-rw-r--r-- | src/oom/org.freedesktop.oom1.service | 14 | ||||
-rw-r--r-- | src/oom/test-oomd-util.c | 513 |
13 files changed, 2747 insertions, 0 deletions
diff --git a/src/oom/meson.build b/src/oom/meson.build new file mode 100644 index 0000000..1203518 --- /dev/null +++ b/src/oom/meson.build @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +systemd_oomd_sources = files( + 'oomd-manager-bus.c', + 'oomd-manager-bus.h', + 'oomd-manager.c', + 'oomd-manager.h', + 'oomd-util.c', + 'oomd-util.h', + 'oomd.c', +) + +oomctl_sources = files('oomctl.c') + +if conf.get('ENABLE_OOMD') == 1 + install_data('org.freedesktop.oom1.conf', + install_dir : dbuspolicydir) + + install_data('org.freedesktop.oom1.service', + install_dir : dbussystemservicedir) + + if install_sysconfdir_samples + install_data('oomd.conf', + install_dir : pkgsysconfdir) + endif +endif + +tests += [ + [files('test-oomd-util.c', + 'oomd-util.c', + 'oomd-util.h'), + [], + [libatomic]] +] diff --git a/src/oom/oomctl.c b/src/oom/oomctl.c new file mode 100644 index 0000000..2ffb9d4 --- /dev/null +++ b/src/oom/oomctl.c @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> +#include <unistd.h> + +#include "bus-error.h" +#include "copy.h" +#include "main-func.h" +#include "pretty-print.h" +#include "terminal-util.h" +#include "verbs.h" + +static PagerFlags arg_pager_flags = 0; + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + pager_open(arg_pager_flags); + + r = terminal_urlify_man("oomctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] COMMAND ...\n\n" + "%2$sManage or inspect the userspace OOM killer.%3$s\n" + "\n%4$sCommands:%5$s\n" + " dump Output the current state of systemd-oomd\n" + "\n%4$sOptions:%5$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %6$s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + ansi_underline(), + ansi_normal(), + link); + + return 0; +} + +static int dump_state(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int fd = -1; + int r; + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect system bus: %m"); + + pager_open(arg_pager_flags); + + r = sd_bus_call_method( + bus, + "org.freedesktop.oom1", + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "DumpByFileDescriptor", + &error, + &reply, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to dump context: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "h", &fd); + if (r < 0) + return bus_log_parse_error(r); + + fflush(stdout); + return copy_bytes(fd, STDOUT_FILENO, UINT64_MAX, 0); +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + return 1; +} + +static int run(int argc, char* argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "dump", VERB_ANY, 1, VERB_DEFAULT, dump_state }, + {} + }; + + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c new file mode 100644 index 0000000..3a3308f --- /dev/null +++ b/src/oom/oomd-manager-bus.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <linux/capability.h> + +#include "bus-common-errors.h" +#include "bus-polkit.h" +#include "data-fd-util.h" +#include "fd-util.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "user-util.h" + +static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *dump = NULL; + _cleanup_close_ int fd = -1; + Manager *m = ASSERT_PTR(userdata); + int r; + + assert(message); + + r = manager_get_dump_string(m, &dump); + if (r < 0) + return r; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +static const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD_WITH_NAMES("DumpByFileDescriptor", + NULL,, + "h", + SD_BUS_PARAM(fd), + bus_method_dump_by_fd, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_SIGNAL_WITH_NAMES("Killed", + "ss", + SD_BUS_PARAM(cgroup) + SD_BUS_PARAM(reason), + 0), + SD_BUS_VTABLE_END +}; + +const BusObjectImplementation manager_object = { + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + .vtables = BUS_VTABLES(manager_vtable), +}; diff --git a/src/oom/oomd-manager-bus.h b/src/oom/oomd-manager-bus.h new file mode 100644 index 0000000..7935b35 --- /dev/null +++ b/src/oom/oomd-manager-bus.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "bus-object.h" + +typedef struct Manager Manager; + +extern const BusObjectImplementation manager_object; diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c new file mode 100644 index 0000000..836eeb4 --- /dev/null +++ b/src/oom/oomd-manager.c @@ -0,0 +1,855 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-daemon.h" + +#include "bus-log-control-api.h" +#include "bus-util.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "memory-util.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "path-util.h" +#include "percent-util.h" + +typedef struct ManagedOOMMessage { + ManagedOOMMode mode; + char *path; + char *property; + uint32_t limit; +} ManagedOOMMessage; + +static void managed_oom_message_destroy(ManagedOOMMessage *message) { + assert(message); + free(message->path); + free(message->property); +} + +static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + ManagedOOMMode *mode = userdata, m; + const char *s; + + assert(mode); + assert_se(s = json_variant_string(v)); + + m = managed_oom_mode_from_string(s); + if (m < 0) + return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s); + + *mode = m; + return 0; +} + +static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) { + JsonVariant *c, *cgroups; + int r; + + static const JsonDispatch dispatch_table[] = { + { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY }, + { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY }, + { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 }, + {}, + }; + + assert(m); + assert(parameters); + + cgroups = json_variant_by_key(parameters, "cgroups"); + if (!cgroups) + return -EINVAL; + + /* Skip malformed elements and keep processing in case the others are good */ + JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { + _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {}; + OomdCGroupContext *ctx; + Hashmap *monitor_hm; + loadavg_t limit; + + if (!json_variant_is_object(c)) + continue; + + r = json_dispatch(c, dispatch_table, NULL, 0, &message); + if (r == -ENOMEM) + return r; + if (r < 0) + continue; + + if (uid != 0) { + uid_t cg_uid; + + r = cg_path_get_owner_uid(message.path, &cg_uid); + if (r < 0) { + log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path); + continue; + } + + /* Let's not be lenient for permission errors and skip processing if we receive an + * update for a cgroup that doesn't belong to the user. */ + if (uid != cg_uid) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), + "cgroup path owner UID does not match sender uid " + "(" UID_FMT " != " UID_FMT ")", uid, cg_uid); + } + + monitor_hm = streq(message.property, "ManagedOOMSwap") ? + m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + + if (message.mode == MANAGED_OOM_AUTO) { + (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path))); + continue; + } + + limit = m->default_mem_pressure_limit; + + if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) { + int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit); + + r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit); + if (r < 0) + continue; + } + + r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path); + if (r == -ENOMEM) + return r; + if (r < 0 && r != -EEXIST) + log_debug_errno(r, "Failed to insert message, ignoring: %m"); + + /* Always update the limit in case it was changed. For non-memory pressure detection the value is + * ignored so always updating it here is not a problem. */ + ctx = hashmap_get(monitor_hm, empty_to_root(message.path)); + if (ctx) + ctx->mem_pressure_limit = limit; + } + + /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */ + r = sd_event_source_set_enabled(m->swap_context_event_source, + hashmap_isempty(m->monitored_swap_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON); + if (r < 0) + return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m"); + + return 0; +} + +static int process_managed_oom_request( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + r = varlink_get_peer_uid(link, &uid); + if (r < 0) + return log_error_errno(r, "Failed to get varlink peer uid: %m"); + + return process_managed_oom_message(m, uid, parameters); +} + +static int process_managed_oom_reply( + Varlink *link, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + Manager *m = ASSERT_PTR(userdata); + uid_t uid; + int r; + + if (error_id) { + r = -EIO; + log_debug("Error getting ManagedOOM cgroups: %s", error_id); + goto finish; + } + + r = varlink_get_peer_uid(link, &uid); + if (r < 0) { + log_error_errno(r, "Failed to get varlink peer uid: %m"); + goto finish; + } + + r = process_managed_oom_message(m, uid, parameters); + +finish: + if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + m->varlink_client = varlink_close_unref(link); + + return r; +} + +/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible + * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". + * + * This function ignores most errors in order to handle cgroups that may have been cleaned up while + * populating the hashmap. + * + * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */ +static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { + _cleanup_free_ char *subpath = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(new_h); + assert(path); + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) + return r; + + r = cg_read_subgroup(d, &subpath); + if (r < 0) + return r; + else if (r == 0) { /* No subgroups? We're a leaf node */ + r = oomd_insert_cgroup_context(NULL, new_h, path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path); + return 0; + } + + do { + _cleanup_free_ char *cg_path = NULL; + bool oom_group; + + cg_path = path_join(empty_to_root(path), subpath); + if (!cg_path) + return -ENOMEM; + + subpath = mfree(subpath); + + r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); + /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ + if (r == -ENOMEM) + return r; + if (r < 0) { + log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path); + return 0; + } + + if (oom_group) + r = oomd_insert_cgroup_context(NULL, new_h, cg_path); + else + r = recursively_get_cgroup_context(new_h, cg_path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path); + } while ((r = cg_read_subgroup(d, &subpath)) > 0); + + return 0; +} + +static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { + _cleanup_hashmap_free_ Hashmap *new_base = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + + new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!new_base) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, *monitored_cgroups) { + /* Skip most errors since the cgroup we're trying to update might not exist anymore. */ + r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); + if (r == -ENOMEM) + return r; + if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT)) + log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path); + } + + hashmap_free(*monitored_cgroups); + *monitored_cgroups = TAKE_PTR(new_base); + + return 0; +} + +static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + assert(ret_candidates); + + candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!candidates) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, monitored_cgroups) { + r = recursively_get_cgroup_context(candidates, ctx->path); + if (r == -ENOMEM) + return r; + if (r < 0) + log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path); + } + + *ret_candidates = TAKE_PTR(candidates); + + return 0; +} + +static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) { + _cleanup_hashmap_free_ Hashmap *new_candidates = NULL; + int r; + + assert(monitored_cgroups); + assert(candidates); + assert(*candidates); + + r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates); + if (r < 0) + return log_debug_errno(r, "Failed to get candidate contexts: %m"); + + oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates); + + hashmap_free(*candidates); + *candidates = TAKE_PTR(new_candidates); + + return 0; +} + +static int acquire_managed_oom_connect(Manager *m) { + _cleanup_(varlink_close_unrefp) Varlink *link = NULL; + int r; + + assert(m); + assert(m->event); + + r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM); + if (r < 0) + return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m"); + + (void) varlink_set_userdata(link, m); + (void) varlink_set_description(link, "oomd"); + (void) varlink_set_relative_timeout(link, USEC_INFINITY); + + r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + r = varlink_bind_reply(link, process_managed_oom_reply); + if (r < 0) + return log_error_errno(r, "Failed to bind reply callback: %m"); + + r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); + if (r < 0) + return log_error_errno(r, "Failed to observe varlink call: %m"); + + m->varlink_client = TAKE_PTR(link); + return 0; +} + +static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + usec_t usec_now; + int r; + + assert(s); + assert(!hashmap_isempty(m->monitored_swap_cgroup_contexts)); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink_client) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* We still try to acquire system information for oomctl even if no units want swap monitoring */ + r = oomd_system_context_acquire("/proc/meminfo", &m->system_context); + /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */ + if (r < 0) + return log_error_errno(r, "Failed to acquire system context: %m"); + + /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the + * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts + * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent + * nodes are the ones that matter). */ + + /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */ + if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) && + oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + _cleanup_free_ char *selected = NULL; + uint64_t threshold; + + log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and " + "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, + m->system_context.mem_used, m->system_context.mem_total, + m->system_context.swap_used, m->system_context.swap_total, + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); + + r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); + + threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; + r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_notice_errno(r, "Failed to kill any cgroups based on swap: %m"); + else { + if (selected && r > 0) { + log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and " + "swap used (%"PRIu64") / total (%"PRIu64") being more than " + PERMYRIAD_AS_PERCENT_FORMAT_STR, + selected, + m->system_context.mem_used, m->system_context.mem_total, + m->system_context.swap_used, m->system_context.swap_total, + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad)); + + /* send dbus signal */ + (void) sd_bus_emit_signal(m->bus, + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "Killed", + "ss", + selected, + "memory-used"); + } + return 0; + } + } + + return 0; +} + +static void clear_candidate_hashmapp(Manager **m) { + if (*m) + hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); +} + +static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we + * update the candidate data (in which case clear_candidates will be NULL). */ + _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; + _cleanup_set_free_ Set *targets = NULL; + bool in_post_action_delay = false; + Manager *m = ASSERT_PTR(userdata); + usec_t usec_now; + int r; + + assert(s); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink_client) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* Return early if nothing is requesting memory pressure monitoring */ + if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) + return 0; + + /* Update the cgroups used for detection/action */ + r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); + + /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale + * values and go on a kill storm. */ + if (m->mem_pressure_post_action_delay_start > 0) { + if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) + in_post_action_delay = true; + else + m->mem_pressure_post_action_delay_start = 0; + } + + r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); + else if (r == 1 && !in_post_action_delay) { + OomdCGroupContext *t; + SET_FOREACH(t, targets) { + _cleanup_free_ char *selected = NULL; + + /* Check if there was reclaim activity in the given interval. The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). */ + if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) + continue; + + log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", + t->path, + LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), + LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else + clear_candidates = NULL; + + r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, + /* prefix= */ t->path, + /* dry_run= */ m->dry_run, + &selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path); + else { + /* Don't act on all the high pressure cgroups at once; return as soon as we kill one. + * If r == 0 then it means there were not eligible candidates, the candidate cgroup + * disappeared, or the candidate cgroup has no processes by the time we tried to kill + * it. In either case, go through the event loop again and select a new candidate if + * pressure is still high. */ + m->mem_pressure_post_action_delay_start = usec_now; + if (selected && r > 0) { + log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" + " for > %s with reclaim activity", + selected, t->path, + LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10), + LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + + /* send dbus signal */ + (void) sd_bus_emit_signal(m->bus, + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "Killed", + "ss", + selected, + "memory-pressure"); + } + return 0; + } + } + } else { + /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every + * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill + * might happen. + * Candidate cgroup data will continue to get updated during the post-action delay period in case + * pressure continues to be high after a kill. */ + OomdCGroupContext *c; + HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { + if (c->mem_pressure_limit_hit_start == 0) + continue; + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else { + clear_candidates = NULL; + break; + } + } + } + + return 0; +} + +static int monitor_swap_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_OFF); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-swap-timer"); + + m->swap_context_event_source = TAKE_PTR(s); + return 0; +} + +static int monitor_memory_pressure_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); + + m->mem_pressure_context_event_source = TAKE_PTR(s); + return 0; +} + +Manager* manager_free(Manager *m) { + assert(m); + + varlink_server_unref(m->varlink_server); + varlink_close_unref(m->varlink_client); + sd_event_source_unref(m->swap_context_event_source); + sd_event_source_unref(m->mem_pressure_context_event_source); + sd_event_unref(m->event); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + sd_bus_flush_close_unref(m->bus); + + hashmap_free(m->monitored_swap_cgroup_contexts); + hashmap_free(m->monitored_mem_pressure_cgroup_contexts); + hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates); + + return mfree(m); +} + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new0(Manager, 1); + if (!m) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_set_watchdog(m->event, true); + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return r; + + m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_swap_cgroup_contexts) + return -ENOMEM; + + m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_mem_pressure_cgroup_contexts) + return -ENOMEM; + + m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_mem_pressure_cgroup_contexts_candidates) + return -ENOMEM; + + *ret = TAKE_PTR(m); + return 0; +} + +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); + if (r < 0) + return log_error_errno(r, "Failed to connect to bus: %m"); + + r = bus_add_implementation(m->bus, &manager_object, m); + if (r < 0) + return r; + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} + +static int manager_varlink_init(Manager *m, int fd) { + _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; + int r; + + assert(m); + assert(!m->varlink_server); + + r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA); + if (r < 0) + return log_error_errno(r, "Failed to allocate varlink server object: %m"); + + varlink_server_set_userdata(s, m); + + r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request); + if (r < 0) + return log_error_errno(r, "Failed to register varlink method: %m"); + + if (fd < 0) + r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666); + else + r = varlink_server_listen_fd(s, fd); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + log_debug("Initialized systemd-oomd varlink server"); + + m->varlink_server = TAKE_PTR(s); + return 0; +} + +int manager_start( + Manager *m, + bool dry_run, + int swap_used_limit_permyriad, + int mem_pressure_limit_permyriad, + usec_t mem_pressure_usec, + int fd) { + + unsigned long l, f; + int r; + + assert(m); + + m->dry_run = dry_run; + + m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100; + assert(m->swap_used_limit_permyriad <= 10000); + + if (mem_pressure_limit_permyriad >= 0) { + assert(mem_pressure_limit_permyriad <= 10000); + + l = mem_pressure_limit_permyriad / 100; + f = mem_pressure_limit_permyriad % 100; + } else { + l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; + f = 0; + } + r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); + if (r < 0) + return r; + + m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; + + r = manager_connect_bus(m); + if (r < 0) + return r; + + r = acquire_managed_oom_connect(m); + if (r < 0) + return r; + + r = manager_varlink_init(m, fd); + if (r < 0) + return r; + + r = monitor_memory_pressure_contexts(m); + if (r < 0) + return r; + + r = monitor_swap_contexts(m); + if (r < 0) + return r; + + return 0; +} + +int manager_get_dump_string(Manager *m, char **ret) { + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; + OomdCGroupContext *c; + size_t size; + char *key; + int r; + + assert(m); + assert(ret); + + f = open_memstream_unlocked(&dump, &size); + if (!f) + return -errno; + + fprintf(f, + "Dry Run: %s\n" + "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" + "Default Memory Pressure Limit: %lu.%02lu%%\n" + "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad), + LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit), + FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + + fprintf(f, "Swap Monitored CGroups:\n"); + HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts) + oomd_dump_swap_cgroup_context(c, f, "\t"); + + fprintf(f, "Memory Pressure Monitored CGroups:\n"); + HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts) + oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); + + r = fflush_and_check(f); + if (r < 0) + return r; + + f = safe_fclose(f); + + *ret = TAKE_PTR(dump); + return 0; +} diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h new file mode 100644 index 0000000..8f0dd41 --- /dev/null +++ b/src/oom/oomd-manager.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "sd-bus.h" +#include "sd-event.h" + +#include "conf-parser.h" +#include "oomd-util.h" +#include "varlink.h" + +/* Polling interval for monitoring stats */ +#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */ +/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */ +#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC) + +/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the + * percentage of time all tasks were delayed (i.e. unproductive). + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. */ +#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) +#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60 +#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90 + +/* Only tackle candidates with large swap usage. */ +#define THRESHOLD_SWAP_USED_PERCENT 5 + +#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) +#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) + +typedef struct Manager Manager; + +struct Manager { + sd_bus *bus; + sd_event *event; + + Hashmap *polkit_registry; + + bool dry_run; + int swap_used_limit_permyriad; + loadavg_t default_mem_pressure_limit; + usec_t default_mem_pressure_duration_usec; + + /* k: cgroup paths -> v: OomdCGroupContext + * Used to detect when to take action. */ + Hashmap *monitored_swap_cgroup_contexts; + Hashmap *monitored_mem_pressure_cgroup_contexts; + Hashmap *monitored_mem_pressure_cgroup_contexts_candidates; + + OomdSystemContext system_context; + + usec_t mem_pressure_post_action_delay_start; + + sd_event_source *swap_context_event_source; + sd_event_source *mem_pressure_context_event_source; + + /* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to + * listen for changes in ManagedOOM settings (oomd client - systemd server). */ + Varlink *varlink_client; + /* This varlink server object is used to manage systemd-oomd's varlink server which is used by user + * managers to report changes in ManagedOOM settings (oomd server - systemd client). */ + VarlinkServer *varlink_server; +}; + +Manager* manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_new(Manager **ret); + +int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec, int fd); + +int manager_get_dump_string(Manager *m, char **ret); + +CONFIG_PARSER_PROTOTYPE(config_parse_oomd_default); diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c new file mode 100644 index 0000000..391d846 --- /dev/null +++ b/src/oom/oomd-util.c @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <sys/xattr.h> +#include <unistd.h> + +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "procfs-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "user-util.h" + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + oomd_cgroup_ctx_hash_ops, + char, + string_hash_func, + string_compare_func, + OomdCGroupContext, + oomd_cgroup_context_free); + +static int log_kill(pid_t pid, int sig, void *userdata) { + log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig)); + return 0; +} + +static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { + _cleanup_free_ char *value = NULL; + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t curr_count = 0; + int r; + + assert(path); + assert(xattr); + + r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &curr_count); + if (r < 0) + return r; + } + + if (curr_count > UINT64_MAX - num_procs_killed) + return -EOVERFLOW; + + xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed); + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0); + if (r < 0) + return r; + + return 0; +} + +OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { + if (!ctx) + return NULL; + + free(ctx->path); + return mfree(ctx); +} + +int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { + _cleanup_set_free_ Set *targets = NULL; + OomdCGroupContext *ctx; + char *key; + int r; + + assert(h); + assert(ret); + + targets = set_new(NULL); + if (!targets) + return -ENOMEM; + + HASHMAP_FOREACH_KEY(ctx, key, h) { + if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { + usec_t diff; + + if (ctx->mem_pressure_limit_hit_start == 0) + ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC); + + diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start; + if (diff >= duration) { + r = set_put(targets, ctx); + if (r < 0) + return -ENOMEM; + } + } else + ctx->mem_pressure_limit_hit_start = 0; + } + + if (!set_isempty(targets)) { + *ret = TAKE_PTR(targets); + return 1; + } + + *ret = NULL; + return 0; +} + +uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) { + uint64_t last_pgscan; + + assert(c); + + /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. + * pgscan is monotonic and in practice should not decrease (except in the recreation case). */ + last_pgscan = c->last_pgscan; + if (c->last_pgscan > c->pgscan) { + log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.", + c->last_pgscan, c->pgscan, c->path); + last_pgscan = 0; + } + + return c->pgscan - last_pgscan; +} + +bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad) { + uint64_t mem_threshold; + + assert(ctx); + assert(threshold_permyriad <= 10000); + + mem_threshold = ctx->mem_total * threshold_permyriad / (uint64_t) 10000; + return LESS_BY(ctx->mem_total, ctx->mem_used) < mem_threshold; +} + +bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) { + uint64_t swap_threshold; + + assert(ctx); + assert(threshold_permyriad <= 10000); + + swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000; + return (ctx->swap_total - ctx->swap_used) < swap_threshold; +} + +int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix) { + uid_t uid, prefix_uid; + int r; + + assert(ctx); + + prefix = empty_to_root(prefix); + + if (!path_startswith(ctx->path, prefix)) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "%s is not a descendant of %s", ctx->path, prefix); + + r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, ctx->path, &uid); + if (r < 0) + return log_debug_errno(r, "Failed to get owner/group from %s: %m", ctx->path); + + r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, prefix, &prefix_uid); + if (r < 0) + return log_debug_errno(r, "Failed to get owner/group from %s: %m", ctx->path); + + if (uid == prefix_uid || uid == 0) { + /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used + * as an optional feature of systemd-oomd (and the system might not even support them). */ + r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, ctx->path, "user.oomd_avoid"); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + log_debug_errno(r, "Failed to get xattr user.oomd_avoid, ignoring: %m"); + ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference; + + r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, ctx->path, "user.oomd_omit"); + if (r == -ENOMEM) + return log_oom_debug(); + if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r)) + log_debug_errno(r, "Failed to get xattr user.oomd_omit, ignoring: %m"); + ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference; + } else + ctx->preference = MANAGED_OOM_PREFERENCE_NONE; + + return 0; +} + +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + OomdCGroupContext *item; + size_t k = 0; + int r; + + assert(h); + assert(compare_func); + assert(ret); + + sorted = new0(OomdCGroupContext*, hashmap_size(h)); + if (!sorted) + return -ENOMEM; + + HASHMAP_FOREACH(item, h) { + /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */ + if (item->path && prefix && !path_startswith(item->path, prefix)) + continue; + + r = oomd_fetch_cgroup_oom_preference(item, prefix); + if (r == -ENOMEM) + return r; + + if (item->preference == MANAGED_OOM_PREFERENCE_OMIT) + continue; + + sorted[k++] = item; + } + + typesafe_qsort(sorted, k, compare_func); + + *ret = TAKE_PTR(sorted); + + assert(k <= INT_MAX); + return (int) k; +} + +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { + _cleanup_set_free_ Set *pids_killed = NULL; + int r; + + assert(path); + + if (dry_run) { + _cleanup_free_ char *cg_path = NULL; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path); + if (r < 0) + return r; + + log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); + return 0; + } + + pids_killed = set_new(NULL); + if (!pids_killed) + return -ENOMEM; + + r = increment_oomd_xattr(path, "user.oomd_ooms", 1); + if (r < 0) + log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m"); + + if (recurse) + r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + else + r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); + + /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before + * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and + * was removed between picking candidates and coming into this function. In either case, let's log + * about it let the caller decide what to do once they know how many PIDs were killed. */ + if (IN_SET(r, -ENOENT, -ENODEV)) + log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path); + else if (r < 0) + return r; + + if (set_isempty(pids_killed)) + log_debug("Nothing killed when attempting to kill %s", path); + + r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); + if (r < 0) + log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); + + return set_size(pids_killed) != 0; +} + +typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix); + +static int dump_kill_candidates(OomdCGroupContext **sorted, int n, int dump_until, dump_candidate_func dump_func) { + /* Try dumping top offendors, ignoring any errors that might happen. */ + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; + int r; + size_t size; + + f = open_memstream_unlocked(&dump, &size); + if (!f) + return -errno; + + fprintf(f, "Considered %d cgroups for killing, top candidates were:\n", n); + for (int i = 0; i < dump_until; i++) + dump_func(sorted[i], f, "\t"); + + r = fflush_and_check(f); + if (r < 0) + return r; + + return log_dump(LOG_INFO, dump); +} + +int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + int n, r, ret = 0; + int dump_until; + + assert(h); + assert(ret_selected); + + n = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted); + if (n < 0) + return n; + + dump_until = MIN(n, DUMP_ON_KILL_COUNT); + for (int i = 0; i < n; i++) { + /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. + * Continue since there might be "avoid" cgroups at the end. */ + if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) + continue; + + r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run); + if (r == -ENOMEM) + return r; /* Treat oom as a hard error */ + if (r < 0) { + if (ret == 0) + ret = r; + continue; /* Try to find something else to kill */ + } + + dump_until = MAX(dump_until, i + 1); + char *selected = strdup(sorted[i]->path); + if (!selected) + return -ENOMEM; + *ret_selected = selected; + ret = r; + break; + } + + dump_kill_candidates(sorted, n, dump_until, oomd_dump_memory_pressure_cgroup_context); + + return ret; +} + +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + int n, r, ret = 0; + int dump_until; + + assert(h); + assert(ret_selected); + + n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted); + if (n < 0) + return n; + + dump_until = MIN(n, DUMP_ON_KILL_COUNT); + /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with + * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */ + for (int i = 0; i < n; i++) { + /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid" + * cgroups at the end. */ + if (sorted[i]->swap_usage <= threshold_usage) + continue; + + r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run); + if (r == -ENOMEM) + return r; /* Treat oom as a hard error */ + if (r < 0) { + if (ret == 0) + ret = r; + continue; /* Try to find something else to kill */ + } + + dump_until = MAX(dump_until, i + 1); + char *selected = strdup(sorted[i]->path); + if (!selected) + return -ENOMEM; + *ret_selected = selected; + ret = r; + break; + } + + dump_kill_candidates(sorted, n, dump_until, oomd_dump_swap_cgroup_context); + + return ret; +} + +int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *p = NULL, *val = NULL; + bool is_root; + int r; + + assert(path); + assert(ret); + + ctx = new0(OomdCGroupContext, 1); + if (!ctx) + return -ENOMEM; + + is_root = empty_or_root(path); + ctx->preference = MANAGED_OOM_PREFERENCE_NONE; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p); + if (r < 0) + return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path); + + r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure); + if (r < 0) + return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); + + if (is_root) { + r = procfs_memory_get_used(&ctx->current_memory_usage); + if (r < 0) + return log_debug_errno(r, "Error getting memory used from procfs: %m"); + } else { + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage); + if (r < 0) + return log_debug_errno(r, "Error getting memory.current from %s: %m", path); + + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min); + if (r < 0) + return log_debug_errno(r, "Error getting memory.min from %s: %m", path); + + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low); + if (r < 0) + return log_debug_errno(r, "Error getting memory.low from %s: %m", path); + + r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage); + if (r == -ENODATA) + /* The kernel can be compiled without support for memory.swap.* files, + * or it can be disabled with boot param 'swapaccount=0' */ + log_once(LOG_WARNING, "No kernel support for memory.swap.current from %s (try boot param swapaccount=1), ignoring.", path); + else if (r < 0) + return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path); + + r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val); + if (r < 0) + return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path); + + r = safe_atou64(val, &ctx->pgscan); + if (r < 0) + return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m"); + } + + ctx->path = strdup(empty_to_root(path)); + if (!ctx->path) + return -ENOMEM; + + *ret = TAKE_PTR(ctx); + return 0; +} + +int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret) { + _cleanup_fclose_ FILE *f = NULL; + unsigned field_filled = 0; + OomdSystemContext ctx = {}; + uint64_t mem_available, swap_free; + int r; + + enum { + MEM_TOTAL = 1U << 0, + MEM_AVAILABLE = 1U << 1, + SWAP_TOTAL = 1U << 2, + SWAP_FREE = 1U << 3, + ALL = MEM_TOTAL|MEM_AVAILABLE|SWAP_TOTAL|SWAP_FREE, + }; + + assert(proc_meminfo_path); + assert(ret); + + f = fopen(proc_meminfo_path, "re"); + if (!f) + return -errno; + + for (;;) { + _cleanup_free_ char *line = NULL; + char *word; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if ((word = startswith(line, "MemTotal:"))) { + field_filled |= MEM_TOTAL; + r = convert_meminfo_value_to_uint64_bytes(word, &ctx.mem_total); + } else if ((word = startswith(line, "MemAvailable:"))) { + field_filled |= MEM_AVAILABLE; + r = convert_meminfo_value_to_uint64_bytes(word, &mem_available); + } else if ((word = startswith(line, "SwapTotal:"))) { + field_filled |= SWAP_TOTAL; + r = convert_meminfo_value_to_uint64_bytes(word, &ctx.swap_total); + } else if ((word = startswith(line, "SwapFree:"))) { + field_filled |= SWAP_FREE; + r = convert_meminfo_value_to_uint64_bytes(word, &swap_free); + } else + continue; + + if (r < 0) + return log_debug_errno(r, "Error converting '%s' from %s to uint64_t: %m", line, proc_meminfo_path); + + if (field_filled == ALL) + break; + } + + if (field_filled != ALL) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "%s is missing expected fields", proc_meminfo_path); + + if (mem_available > ctx.mem_total) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "MemAvailable (%" PRIu64 ") cannot be greater than MemTotal (%" PRIu64 ") %m", + mem_available, + ctx.mem_total); + + if (swap_free > ctx.swap_total) + return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), + "SwapFree (%" PRIu64 ") cannot be greater than SwapTotal (%" PRIu64 ") %m", + swap_free, + ctx.swap_total); + + ctx.mem_used = ctx.mem_total - mem_available; + ctx.swap_used = ctx.swap_total - swap_free; + + *ret = ctx; + return 0; +} + +int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL; + OomdCGroupContext *old_ctx; + int r; + + assert(new_h); + assert(path); + + path = empty_to_root(path); + + r = oomd_cgroup_context_acquire(path, &curr_ctx); + if (r < 0) + return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path); + + assert_se(streq(path, curr_ctx->path)); + + old_ctx = hashmap_get(old_h, path); + if (old_ctx) { + curr_ctx->last_pgscan = old_ctx->pgscan; + curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; + curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + } + + if (oomd_pgscan_rate(curr_ctx) > 0) + curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); + + r = hashmap_put(new_h, curr_ctx->path, curr_ctx); + if (r < 0) + return r; + + TAKE_PTR(curr_ctx); + return 0; +} + +void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) { + OomdCGroupContext *ctx; + + assert(old_h); + assert(curr_h); + + HASHMAP_FOREACH(ctx, curr_h) { + OomdCGroupContext *old_ctx; + + old_ctx = hashmap_get(old_h, ctx->path); + if (!old_ctx) + continue; + + ctx->last_pgscan = old_ctx->pgscan; + ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; + ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + + if (oomd_pgscan_rate(ctx) > 0) + ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); + } +} + +void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), FORMAT_BYTES(ctx->swap_usage)); + else + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: (see System Context)\n", + strempty(prefix), ctx->path, + strempty(prefix)); +} + +void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + fprintf(f, + "%sPath: %s\n" + "%s\tMemory Pressure Limit: %lu.%02lu%%\n" + "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" + "%s\tCurrent Memory Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), LOADAVG_INT_SIDE(ctx->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(ctx->mem_pressure_limit), + strempty(prefix), + LOADAVG_INT_SIDE(ctx->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg10), + LOADAVG_INT_SIDE(ctx->memory_pressure.avg60), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg60), + LOADAVG_INT_SIDE(ctx->memory_pressure.avg300), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg300), + FORMAT_TIMESPAN(ctx->memory_pressure.total, USEC_PER_SEC), + strempty(prefix), FORMAT_BYTES(ctx->current_memory_usage)); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%s\tMemory Min: %s\n" + "%s\tMemory Low: %s\n" + "%s\tPgscan: %" PRIu64 "\n" + "%s\tLast Pgscan: %" PRIu64 "\n", + strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_min), + strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_low), + strempty(prefix), ctx->pgscan, + strempty(prefix), ctx->last_pgscan); +} + +void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) { + assert(ctx); + assert(f); + + fprintf(f, + "%sMemory: Used: %s Total: %s\n" + "%sSwap: Used: %s Total: %s\n", + strempty(prefix), + FORMAT_BYTES(ctx->mem_used), + FORMAT_BYTES(ctx->mem_total), + strempty(prefix), + FORMAT_BYTES(ctx->swap_used), + FORMAT_BYTES(ctx->swap_total)); +} diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h new file mode 100644 index 0000000..7fd9e92 --- /dev/null +++ b/src/oom/oomd-util.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "cgroup-util.h" +#include "hashmap.h" +#include "psi-util.h" + +#define DUMP_ON_KILL_COUNT 10 +#define GROWING_SIZE_PERCENTILE 80 + +extern const struct hash_ops oomd_cgroup_ctx_hash_ops; + +typedef struct OomdCGroupContext OomdCGroupContext; +typedef struct OomdSystemContext OomdSystemContext; + +typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *); + +struct OomdCGroupContext { + char *path; + + ResourcePressure memory_pressure; + + uint64_t current_memory_usage; + + uint64_t memory_min; + uint64_t memory_low; + uint64_t swap_usage; + + uint64_t last_pgscan; + uint64_t pgscan; + + ManagedOOMPreference preference; + + /* These are only used for acting on high memory pressure. */ + loadavg_t mem_pressure_limit; + usec_t mem_pressure_limit_hit_start; + usec_t last_had_mem_reclaim; +}; + +struct OomdSystemContext { + uint64_t mem_total; + uint64_t mem_used; + uint64_t swap_total; + uint64_t swap_used; +}; + +OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx); +DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free); + +/* All hashmaps used with these functions are expected to be of the form + * key: cgroup paths -> value: OomdCGroupContext. */ + +/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret` + * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time. + * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns + * below the limit. + * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`. + * Returns -ENOMEM for allocation errors. */ +int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret); + +/* Returns true if the amount of memory available (see proc(5)) is below the permyriad of memory specified by `threshold_permyriad`. */ +bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad); + +/* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */ +bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad); + +/* Returns pgscan - last_pgscan, accounting for corner cases. */ +uint64_t oomd_pgscan_rate(const OomdCGroupContext *c); + +/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end + * (after the smallest values). */ +static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + uint64_t diff1, diff2; + int r; + + assert(c1); + assert(c2); + + r = CMP((*c1)->preference, (*c2)->preference); + if (r != 0) + return r; + + diff1 = oomd_pgscan_rate(*c1); + diff2 = oomd_pgscan_rate(*c2); + r = CMP(diff2, diff1); + if (r != 0) + return r; + + return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); +} + +static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + int r; + + assert(c1); + assert(c2); + + r = CMP((*c1)->preference, (*c2)->preference); + if (r != 0) + return r; + + return CMP((*c2)->swap_usage, (*c1)->swap_usage); +} + +/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`. + * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted. + * Returns the number of sorted items; negative on error. */ +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret); + +/* If the cgroups represented by `ctx` and `prefix` are owned by the same user, + * then set `ctx->preference` using the `user.oomd_avoid` and `user.oomd_omit` + * xattrs. Otherwise, set `ctx->preference` to MANAGED_OOM_PREFERENCE_NONE. + * + * If `prefix` is NULL or the empty string, it is treated as root. If `prefix` + * does not specify an ancestor cgroup of `ctx`, -EINVAL is returned. Returns + * negative on all other errors. */ +int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix); + +/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */ +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run); + +/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */ +/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise, + * everything in `h` is a candidate. + * Returns the killed cgroup in ret_selected. */ +int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected); +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected); + +int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); +int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret); + +/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`. + * + * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there + * was no prior data to reference. */ +int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path); + +/* Update each OomdCGroupContext in `curr_h` with prior interval information from `old_h`. */ +void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h); + +void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix); +void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix); +void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix); diff --git a/src/oom/oomd.c b/src/oom/oomd.c new file mode 100644 index 0000000..1ccbed1 --- /dev/null +++ b/src/oom/oomd.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> + +#include "bus-log-control-api.h" +#include "bus-object.h" +#include "cgroup-util.h" +#include "conf-parser.h" +#include "daemon-util.h" +#include "fileio.h" +#include "log.h" +#include "main-func.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "parse-util.h" +#include "pretty-print.h" +#include "psi-util.h" +#include "signal-util.h" + +static bool arg_dry_run = false; +static int arg_swap_used_limit_permyriad = -1; +static int arg_mem_pressure_limit_permyriad = -1; +static usec_t arg_mem_pressure_usec = 0; + +static int parse_config(void) { + static const ConfigTableItem items[] = { + { "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &arg_swap_used_limit_permyriad }, + { "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad }, + { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + + return config_parse_many_nulstr(PKGSYSCONFDIR "/oomd.conf", + CONF_PATHS_NULSTR("systemd/oomd.conf.d"), + "OOM\0", + config_item_table_lookup, + items, + CONFIG_PARSE_WARN, + NULL, + NULL); +} + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-oomd", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...]\n\n" + "Run the userspace out-of-memory (OOM) killer.\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --dry-run Only print destructive actions instead of doing them\n" + " --bus-introspect=PATH Write D-Bus XML introspection data\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_DRY_RUN, + ARG_BUS_INTROSPECT, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "dry-run", no_argument, NULL, ARG_DRY_RUN }, + { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_DRY_RUN: + arg_dry_run = true; + break; + + case ARG_BUS_INTROSPECT: + return bus_introspect_implementations( + stdout, + optarg, + BUS_IMPLEMENTATIONS(&manager_object, + &log_control_object)); + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (optind < argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "This program takes no arguments."); + + return 1; +} + +static int run(int argc, char *argv[]) { + _unused_ _cleanup_(notify_on_cleanup) const char *notify_msg = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_free_ char *swap = NULL; + unsigned long long s = 0; + CGroupMask mask; + int r; + + log_setup(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = parse_config(); + if (r < 0) + return r; + + /* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other + * requirements do not have a reliable means to check for in code. */ + + int n = sd_listen_fds(0); + if (n > 1) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Received too many file descriptors"); + + int fd = n == 1 ? SD_LISTEN_FDS_START : -1; + + /* SwapTotal is always available in /proc/meminfo and defaults to 0, even on swap-disabled kernels. */ + r = get_proc_field("/proc/meminfo", "SwapTotal", WHITESPACE, &swap); + if (r < 0) + return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m"); + + r = safe_atollu(swap, &s); + if (r < 0 || s == 0) + log_warning("No swap; memory pressure usage will be degraded"); + + if (!is_pressure_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported"); + + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the unified cgroups hierarchy"); + + r = cg_mask_supported(&mask); + if (r < 0) + return log_error_errno(r, "Failed to get supported cgroup controllers: %m"); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the cgroup memory controller."); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s"); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + + r = manager_start( + m, + arg_dry_run, + arg_swap_used_limit_permyriad, + arg_mem_pressure_limit_permyriad, + arg_mem_pressure_usec, + fd); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + notify_msg = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + log_debug("systemd-oomd started%s.", arg_dry_run ? " in dry run mode" : ""); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf new file mode 100644 index 0000000..b3a457f --- /dev/null +++ b/src/oom/oomd.conf @@ -0,0 +1,20 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# Entries in this file show the compile time defaults. Local configuration +# should be created by either modifying this file, or by creating "drop-ins" in +# the oomd.conf.d/ subdirectory. The latter is generally recommended. +# Defaults can be restored by simply deleting this file and all drop-ins. +# +# Use 'systemd-analyze cat-config systemd/oomd.conf' to display the full config. +# +# See oomd.conf(5) for details + +[OOM] +#SwapUsedLimit=90% +#DefaultMemoryPressureLimit=60% +#DefaultMemoryPressureDurationSec=30s diff --git a/src/oom/org.freedesktop.oom1.conf b/src/oom/org.freedesktop.oom1.conf new file mode 100644 index 0000000..d00bdcd --- /dev/null +++ b/src/oom/org.freedesktop.oom1.conf @@ -0,0 +1,47 @@ +<?xml version="1.0"?> <!--*-nxml-*--> +<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN" + "https://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd"> + +<!-- SPDX-License-Identifier: LGPL-2.1-or-later --> + +<busconfig> + + <policy user="systemd-oom"> + <allow own="org.freedesktop.oom1"/> + <allow send_destination="org.freedesktop.oom1"/> + <allow receive_sender="org.freedesktop.oom1"/> + </policy> + + <policy user="root"> + <allow send_destination="org.freedesktop.oom1"/> + </policy> + + <policy context="default"> + <deny send_destination="org.freedesktop.oom1"/> + + <!-- Generic interfaces --> + + <allow send_destination="org.freedesktop.oom1" + send_interface="org.freedesktop.DBus.Introspectable"/> + + <allow send_destination="org.freedesktop.oom1" + send_interface="org.freedesktop.DBus.Peer"/> + + <allow send_destination="org.freedesktop.oom1" + send_interface="org.freedesktop.DBus.Properties" + send_member="Get"/> + + <allow send_destination="org.freedesktop.oom1" + send_interface="org.freedesktop.DBus.Properties" + send_member="GetAll"/> + + <!-- Manager interface --> + + <allow send_destination="org.freedesktop.oom1" + send_interface="org.freedesktop.oom1.Manager" + send_member="DumpByFileDescriptor"/> + + <allow receive_sender="org.freedesktop.oom1"/> + </policy> + +</busconfig> diff --git a/src/oom/org.freedesktop.oom1.service b/src/oom/org.freedesktop.oom1.service new file mode 100644 index 0000000..4fd5138 --- /dev/null +++ b/src/oom/org.freedesktop.oom1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.oom1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.oom1.service diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c new file mode 100644 index 0000000..1d12045 --- /dev/null +++ b/src/oom/test-oomd-util.c @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" + +static int fork_and_sleep(unsigned sleep_min) { + usec_t n, timeout, ts; + + pid_t pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + timeout = sleep_min * USEC_PER_MINUTE; + ts = now(CLOCK_MONOTONIC); + for (;;) { + n = now(CLOCK_MONOTONIC); + if (ts + timeout < n) { + log_error("Child timed out waiting to be killed"); + abort(); + } + sleep(1); + } + } + + return pid; +} + +static void test_oomd_cgroup_kill(void) { + _cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL; + int pid[2]; + int r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0); + + /* Create another cgroup below this one for the pids we forked off. We need this to be managed + * by the test so that pid1 doesn't delete it before we can read the xattrs. */ + cgroup = path_join(cgroup_root, "oomdkilltest"); + assert_se(cgroup); + assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, cgroup) >= 0); + + /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities */ + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "test", 4, 0); + if (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r)) + return (void) log_tests_skipped("Cannot set user xattrs"); + + /* Do this twice to also check the increment behavior on the xattrs */ + for (int i = 0; i < 2; i++) { + _cleanup_free_ char *v = NULL; + + for (int j = 0; j < 2; j++) { + pid[j] = fork_and_sleep(5); + assert_se(cg_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, pid[j]) >= 0); + } + + r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */); + if (r <= 0) { + log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup); + abort(); + } + + assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_ooms", &v) >= 0); + assert_se(streq(v, i == 0 ? "1" : "2")); + v = mfree(v); + + /* Wait a bit since processes may take some time to be cleaned up. */ + sleep(2); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); + + assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0); + assert_se(streq(v, i == 0 ? "2" : "4")); + } +} + +static void test_oomd_cgroup_context_acquire_and_insert(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + OomdCGroupContext *c1, *c2; + CGroupMask mask; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (!is_pressure_supported()) + return (void) log_tests_skipped("system does not support pressure"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_mask_supported(&mask) >= 0); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return (void) log_tests_skipped("cgroup memory controller is not available"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(streq(ctx->path, cgroup)); + assert_se(ctx->current_memory_usage > 0); + assert_se(ctx->memory_min == 0); + assert_se(ctx->memory_low == 0); + assert_se(ctx->swap_usage == 0); + assert_se(ctx->last_pgscan == 0); + assert_se(ctx->pgscan == 0); + ctx = oomd_cgroup_context_free(ctx); + + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + assert_se(streq(ctx->path, "/")); + assert_se(ctx->current_memory_usage > 0); + + /* Test hashmap inserts */ + assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + assert_se(c1); + assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST); + + /* make sure certain values from h1 get updated in h2 */ + c1->pgscan = UINT64_MAX; + c1->mem_pressure_limit = 6789; + c1->mem_pressure_limit_hit_start = 42; + c1->last_had_mem_reclaim = 888; + assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + c2 = hashmap_get(h2, cgroup); + assert_se(c1); + assert_se(c2); + assert_se(c1 != c2); + assert_se(c2->last_pgscan == UINT64_MAX); + assert_se(c2->mem_pressure_limit == 6789); + assert_se(c2->mem_pressure_limit_hit_start == 42); + assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */ +} + +static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { + _cleanup_hashmap_free_ Hashmap *h_old = NULL, *h_new = NULL; + OomdCGroupContext *c_old, *c_new; + char **paths = STRV_MAKE("/0.slice", + "/1.slice"); + + OomdCGroupContext ctx_old[2] = { + { .path = paths[0], + .mem_pressure_limit = 5, + .mem_pressure_limit_hit_start = 777, + .last_had_mem_reclaim = 888, + .pgscan = 57 }, + { .path = paths[1], + .mem_pressure_limit = 6, + .mem_pressure_limit_hit_start = 888, + .last_had_mem_reclaim = 888, + .pgscan = 42 }, + }; + + OomdCGroupContext ctx_new[2] = { + { .path = paths[0], + .pgscan = 57 }, + { .path = paths[1], + .pgscan = 101 }, + }; + + assert_se(h_old = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h_old, paths[0], &ctx_old[0]) >= 0); + assert_se(hashmap_put(h_old, paths[1], &ctx_old[1]) >= 0); + + assert_se(h_new = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h_new, paths[0], &ctx_new[0]) >= 0); + assert_se(hashmap_put(h_new, paths[1], &ctx_new[1]) >= 0); + + oomd_update_cgroup_contexts_between_hashmaps(h_old, h_new); + + assert_se(c_old = hashmap_get(h_old, "/0.slice")); + assert_se(c_new = hashmap_get(h_new, "/0.slice")); + assert_se(c_old->pgscan == c_new->last_pgscan); + assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); + assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim); + + assert_se(c_old = hashmap_get(h_old, "/1.slice")); + assert_se(c_new = hashmap_get(h_new, "/1.slice")); + assert_se(c_old->pgscan == c_new->last_pgscan); + assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); + assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim); +} + +static void test_oomd_system_context_acquire(void) { + _cleanup_(unlink_tempfilep) char path[] = "/tmp/oomdgetsysctxtestXXXXXX"; + _cleanup_close_ int fd = -1; + OomdSystemContext ctx; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se((fd = mkostemp_safe(path)) >= 0); + + assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &ctx) == -ENOENT); + + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "some\nwords\nacross\nmultiple\nlines", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "MemTotal: 32495256 kB trailing\n" + "MemFree: 9880512 kB data\n" + "SwapTotal: 8388604 kB is\n" + "SwapFree: 7604 kB bad\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == -EINVAL); + + assert_se(write_string_file(path, "MemTotal: 32495256 kB\n" + "MemFree: 9880512 kB\n" + "MemAvailable: 21777088 kB\n" + "Buffers: 5968 kB\n" + "Cached: 14344796 kB\n" + "Unevictable: 740004 kB\n" + "Mlocked: 4484 kB\n" + "SwapTotal: 8388604 kB\n" + "SwapFree: 7604 kB\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == 0); + assert_se(ctx.mem_total == 33275142144); + assert_se(ctx.mem_used == 10975404032); + assert_se(ctx.swap_total == 8589930496); + assert_se(ctx.swap_used == 8582144000); +} + +static void test_oomd_pressure_above(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_set_free_ Set *t1 = NULL, *t2 = NULL, *t3 = NULL; + OomdCGroupContext ctx[2] = {}, *c; + loadavg_t threshold; + + assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0); + + /* /herp.slice */ + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0); + ctx[0].mem_pressure_limit = threshold; + + /* /derp.slice */ + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0); + ctx[1].mem_pressure_limit = threshold; + + /* High memory pressure */ + assert_se(h1 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1); + assert_se(set_contains(t1, &ctx[0])); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->mem_pressure_limit_hit_start > 0); + + /* Low memory pressure */ + assert_se(h2 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0); + assert_se(!t2); + assert_se(c = hashmap_get(h2, "/derp.slice")); + assert_se(c->mem_pressure_limit_hit_start == 0); + + /* High memory pressure w/ multiple cgroups */ + assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1); + assert_se(set_contains(t3, &ctx[0])); + assert_se(set_size(t3) == 1); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->mem_pressure_limit_hit_start > 0); + assert_se(c = hashmap_get(h1, "/derp.slice")); + assert_se(c->mem_pressure_limit_hit_start == 0); +} + +static void test_oomd_mem_and_swap_free_below(void) { + OomdSystemContext ctx = (OomdSystemContext) { + .mem_total = UINT64_C(20971512) * 1024U, + .mem_used = UINT64_C(3310136) * 1024U, + .swap_total = UINT64_C(20971512) * 1024U, + .swap_used = UINT64_C(20971440) * 1024U, + }; + assert_se(oomd_mem_available_below(&ctx, 2000) == false); + assert_se(oomd_swap_free_below(&ctx, 2000) == true); + + ctx = (OomdSystemContext) { + .mem_total = UINT64_C(20971512) * 1024U, + .mem_used = UINT64_C(20971440) * 1024U, + .swap_total = UINT64_C(20971512) * 1024U, + .swap_used = UINT64_C(3310136) * 1024U, + }; + assert_se(oomd_mem_available_below(&ctx, 2000) == true); + assert_se(oomd_swap_free_below(&ctx, 2000) == false); + + ctx = (OomdSystemContext) { + .mem_total = 0, + .mem_used = 0, + .swap_total = 0, + .swap_used = 0, + }; + assert_se(oomd_mem_available_below(&ctx, 2000) == false); + assert_se(oomd_swap_free_below(&ctx, 2000) == false); +} + +static void test_oomd_sort_cgroups(void) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + _cleanup_free_ OomdCGroupContext **sorted_cgroups; + char **paths = STRV_MAKE("/herp.slice", + "/herp.slice/derp.scope", + "/herp.slice/derp.scope/sheep.service", + "/zupa.slice", + "/boop.slice", + "/omitted.slice", + "/avoid.slice"); + + OomdCGroupContext ctx[7] = { + { .path = paths[0], + .swap_usage = 20, + .last_pgscan = 0, + .pgscan = 33, + .current_memory_usage = 10 }, + { .path = paths[1], + .swap_usage = 60, + .last_pgscan = 33, + .pgscan = 1, + .current_memory_usage = 20 }, + { .path = paths[2], + .swap_usage = 40, + .last_pgscan = 1, + .pgscan = 33, + .current_memory_usage = 40 }, + { .path = paths[3], + .swap_usage = 10, + .last_pgscan = 33, + .pgscan = 2, + .current_memory_usage = 10 }, + { .path = paths[4], + .swap_usage = 11, + .last_pgscan = 33, + .pgscan = 33, + .current_memory_usage = 10 }, + { .path = paths[5], + .swap_usage = 90, + .last_pgscan = 0, + .pgscan = UINT64_MAX, + .preference = MANAGED_OOM_PREFERENCE_OMIT }, + { .path = paths[6], + .swap_usage = 99, + .last_pgscan = 0, + .pgscan = UINT64_MAX, + .preference = MANAGED_OOM_PREFERENCE_AVOID }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); + + assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0); + assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0); + assert_se(hashmap_put(h, "/boop.slice", &ctx[4]) >= 0); + assert_se(hashmap_put(h, "/omitted.slice", &ctx[5]) >= 0); + assert_se(hashmap_put(h, "/avoid.slice", &ctx[6]) >= 0); + + assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 6); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[0]); + assert_se(sorted_cgroups[3] == &ctx[4]); + assert_se(sorted_cgroups[4] == &ctx[3]); + assert_se(sorted_cgroups[5] == &ctx[6]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, NULL, &sorted_cgroups) == 6); + assert_se(sorted_cgroups[0] == &ctx[0]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[3]); + assert_se(sorted_cgroups[3] == &ctx[1]); + assert_se(sorted_cgroups[4] == &ctx[4]); + assert_se(sorted_cgroups[5] == &ctx[6]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2); + assert_se(sorted_cgroups[0] == &ctx[2]); + assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); + assert_se(sorted_cgroups[4] == 0); + assert_se(sorted_cgroups[5] == 0); + assert_se(sorted_cgroups[6] == 0); + sorted_cgroups = mfree(sorted_cgroups); +} + +static void test_oomd_fetch_cgroup_oom_preference(void) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + ManagedOOMPreference root_pref; + CGroupMask mask; + bool test_xattrs; + int root_xattrs, r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (!is_pressure_supported()) + return (void) log_tests_skipped("system does not support pressure"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_mask_supported(&mask) >= 0); + + if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY)) + return (void) log_tests_skipped("cgroup memory controller is not available"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities + * so skip the xattr portions of the test. */ + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0); + test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r); + + if (test_xattrs) { + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0); + assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); + + /* omit takes precedence over avoid when both are set to true */ + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT); + } else { + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) < 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE); + } + ctx = oomd_cgroup_context_free(ctx); + + /* also check when only avoid is set to true */ + if (test_xattrs) { + assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0); + assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID); + ctx = oomd_cgroup_context_free(ctx); + } + + /* Test the root cgroup */ + /* Root cgroup is live and not made on demand like the cgroup the test runs in. It can have varying + * xattrs set already so let's read in the booleans first to get the final preference value. */ + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + root_xattrs = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, "", "user.oomd_omit"); + root_pref = root_xattrs > 0 ? MANAGED_OOM_PREFERENCE_OMIT : MANAGED_OOM_PREFERENCE_NONE; + root_xattrs = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, "", "user.oomd_avoid"); + root_pref = root_xattrs > 0 ? MANAGED_OOM_PREFERENCE_AVOID : MANAGED_OOM_PREFERENCE_NONE; + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == root_pref); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, "/herp.slice/derp.scope") == -EINVAL); + + /* Assert that avoid/omit are not set if the cgroup and prefix are not + * owned by the same user.*/ + if (test_xattrs && !empty_or_root(cgroup)) { + ctx = oomd_cgroup_context_free(ctx); + assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 61183, 0) >= 0); + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE); + + assert_se(oomd_fetch_cgroup_oom_preference(ctx, ctx->path) == 0); + assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID); + } +} + +int main(void) { + int r; + + test_setup_logging(LOG_DEBUG); + + test_oomd_update_cgroup_contexts_between_hashmaps(); + test_oomd_system_context_acquire(); + test_oomd_pressure_above(); + test_oomd_mem_and_swap_free_below(); + test_oomd_sort_cgroups(); + + /* The following tests operate on live cgroups */ + + r = enter_cgroup_root(NULL); + if (r < 0) + return log_tests_skipped_errno(r, "failed to enter a test cgroup scope"); + + test_oomd_cgroup_kill(); + test_oomd_cgroup_context_acquire_and_insert(); + test_oomd_fetch_cgroup_oom_preference(); + + return 0; +} |