summary refs log tree commit diff stats
path: root/src/oom
diff options
context:
space:
mode:
Diffstat (limited to 'src/oom')
-rw-r--r-- src/oom/meson.build 34
-rw-r--r-- src/oom/oomctl.c 139
-rw-r--r-- src/oom/oomd-manager-bus.c 52
-rw-r--r-- src/oom/oomd-manager-bus.h 8
-rw-r--r-- src/oom/oomd-manager.c 855
-rw-r--r-- src/oom/oomd-manager.h 73
-rw-r--r-- src/oom/oomd-util.c 646
-rw-r--r-- src/oom/oomd-util.h 145
-rw-r--r-- src/oom/oomd.c 201
-rw-r--r-- src/oom/oomd.conf 20
-rw-r--r-- src/oom/org.freedesktop.oom1.conf 47
-rw-r--r-- src/oom/org.freedesktop.oom1.service 14
-rw-r--r-- src/oom/test-oomd-util.c 513
13 files changed, 2747 insertions, 0 deletions
diff --git a/src/oom/meson.build b/src/oom/meson.build
new file mode 100644
index 0000000..1203518
--- /dev/null
+++ b/src/oom/meson.build
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Source files for systemd-oomd; presumably consumed by an executable target declared outside this file.
+systemd_oomd_sources = files(
+ 'oomd-manager-bus.c',
+ 'oomd-manager-bus.h',
+ 'oomd-manager.c',
+ 'oomd-manager.h',
+ 'oomd-util.c',
+ 'oomd-util.h',
+ 'oomd.c',
+)
+
+# Single source file for the oomctl command line tool.
+oomctl_sources = files('oomctl.c')
+
+# The D-Bus policy and service files are installed only when ENABLE_OOMD is 1; the sample
+# oomd.conf additionally requires install_sysconfdir_samples.
+if conf.get('ENABLE_OOMD') == 1
+ install_data('org.freedesktop.oom1.conf',
+ install_dir : dbuspolicydir)
+
+ install_data('org.freedesktop.oom1.service',
+ install_dir : dbussystemservicedir)
+
+ if install_sysconfdir_samples
+ install_data('oomd.conf',
+ install_dir : pkgsysconfdir)
+ endif
+endif
+
+# Test entry for the utility code: the test source is listed together with oomd-util.c/.h.
+# This entry is appended regardless of ENABLE_OOMD.
+tests += [
+ [files('test-oomd-util.c',
+ 'oomd-util.c',
+ 'oomd-util.h'),
+ [],
+ [libatomic]]
+]
diff --git a/src/oom/oomctl.c b/src/oom/oomctl.c
new file mode 100644
index 0000000..2ffb9d4
--- /dev/null
+++ b/src/oom/oomctl.c
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "bus-error.h"
+#include "copy.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+/* Flags handed to pager_open() by the verbs below; parse_argv() ORs in PAGER_DISABLE for --no-pager. */
+static PagerFlags arg_pager_flags = 0;
+
+/* Prints the oomctl usage text to stdout. It is registered as the "help" verb and is also called
+ * directly for -h/--help; none of the three parameters are read.
+ *
+ * Returns 0 on success, or the result of log_oom() when the man page link cannot be built. */
+static int help(int argc, char *argv[], void *userdata) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ pager_open(arg_pager_flags);
+
+ r = terminal_urlify_man("oomctl", "1", &link);
+ /* Any failure here is reported as out-of-memory, whatever the actual error code was. */
+ if (r < 0)
+ return log_oom();
+
+ /* Positional (%N$s) conversions allow the underline/normal sequences (%4$s/%5$s) to be used twice. */
+ printf("%1$s [OPTIONS...] COMMAND ...\n\n"
+ "%2$sManage or inspect the userspace OOM killer.%3$s\n"
+ "\n%4$sCommands:%5$s\n"
+ " dump Output the current state of systemd-oomd\n"
+ "\n%4$sOptions:%5$s\n"
+ " -h --help Show this help\n"
+ " --version Show package version\n"
+ " --no-pager Do not pipe output into a pager\n"
+ "\nSee the %6$s for details.\n",
+ program_invocation_short_name,
+ ansi_highlight(),
+ ansi_normal(),
+ ansi_underline(),
+ ansi_normal(),
+ link);
+
+ return 0;
+}
+
+/* "dump" verb: asks the org.freedesktop.oom1 service for its state via the DumpByFileDescriptor
+ * method and copies everything readable from the returned file descriptor to stdout.
+ *
+ * The three parameters are not read. Returns a negative errno-style value (after logging) if the
+ * bus connection, the method call or the reply parsing fails; otherwise returns whatever
+ * copy_bytes() returns. */
+static int dump_state(int argc, char *argv[], void *userdata) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ /* NOTE(review): fd is never closed in this function; presumably it remains owned by 'reply' and
+ * is released together with it — confirm against the sd_bus_message_read() documentation. */
+ int fd = -1;
+ int r;
+
+ r = sd_bus_open_system(&bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect system bus: %m");
+
+ pager_open(arg_pager_flags);
+
+ /* The method takes no input arguments, hence the NULL type string. */
+ r = sd_bus_call_method(
+ bus,
+ "org.freedesktop.oom1",
+ "/org/freedesktop/oom1",
+ "org.freedesktop.oom1.Manager",
+ "DumpByFileDescriptor",
+ &error,
+ &reply,
+ NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to dump context: %s", bus_error_message(&error, r));
+
+ r = sd_bus_message_read(reply, "h", &fd);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ /* Flush buffered stdio output before writing to STDOUT_FILENO directly. */
+ fflush(stdout);
+ return copy_bytes(fd, STDOUT_FILENO, UINT64_MAX, 0);
+}
+
+/* Parses the command line options of oomctl.
+ *
+ * Returns 1 when option parsing finished and verb dispatching should continue, -EINVAL for an
+ * option getopt_long() rejected, and otherwise the return value of help() or version(). The
+ * caller (run()) exits with any value <= 0. */
+static int parse_argv(int argc, char *argv[]) {
+ /* Identifiers for long-only options; they start at 0x100, above the single-character range. */
+ enum {
+ ARG_VERSION = 0x100,
+ ARG_NO_PAGER,
+ };
+
+ static const struct option options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "no-pager", no_argument, NULL, ARG_NO_PAGER },
+ {}
+ };
+
+ int c;
+
+ assert(argc >= 0);
+ assert(argv);
+
+ while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+ switch (c) {
+
+ case 'h':
+ return help(0, NULL, NULL);
+
+ case ARG_VERSION:
+ return version();
+
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
+ /* getopt_long() returns '?' for an option it does not accept. */
+ case '?':
+ return -EINVAL;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 1;
+}
+
+/* Program body of oomctl: sets up logging, parses options and dispatches the selected verb.
+ *
+ * Returns the value of parse_argv() when that is <= 0, otherwise the result of dispatch_verb(). */
+static int run(int argc, char* argv[]) {
+ /* "dump" carries VERB_DEFAULT; "help" has no flags. */
+ static const Verb verbs[] = {
+ { "help", VERB_ANY, VERB_ANY, 0, help },
+ { "dump", VERB_ANY, 1, VERB_DEFAULT, dump_state },
+ {}
+ };
+
+ int r;
+
+ log_show_color(true);
+ log_parse_environment();
+ log_open();
+
+ r = parse_argv(argc, argv);
+ if (r <= 0)
+ return r;
+
+ return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+/* Macro from main-func.h; presumably expands to a main() that invokes run() — confirm in that header. */
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c
new file mode 100644
index 0000000..3a3308f
--- /dev/null
+++ b/src/oom/oomd-manager-bus.c
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/capability.h>
+
+#include "bus-common-errors.h"
+#include "bus-polkit.h"
+#include "data-fd-util.h"
+#include "fd-util.h"
+#include "oomd-manager-bus.h"
+#include "oomd-manager.h"
+#include "user-util.h"
+
+/* D-Bus handler for the DumpByFileDescriptor method: renders the manager state to a string,
+ * obtains a file descriptor for that data via acquire_data_fd() and sends it back as the single
+ * "h" return value.
+ *
+ * 'userdata' must be the Manager (asserted non-NULL); 'error' is not used. Returns a negative
+ * value if building the dump or acquiring the descriptor fails, otherwise the result of
+ * sd_bus_reply_method_return(). */
+static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *dump = NULL;
+ /* The local descriptor is closed automatically when the function returns. */
+ _cleanup_close_ int fd = -1;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = manager_get_dump_string(m, &dump);
+ if (r < 0)
+ return r;
+
+ fd = acquire_data_fd(dump, strlen(dump), 0);
+ if (fd < 0)
+ return fd;
+
+ return sd_bus_reply_method_return(message, "h", fd);
+}
+
+/* D-Bus vtable of the Manager interface: one unprivileged method, DumpByFileDescriptor (no input,
+ * one "h" output named "fd"), and one signal, Killed, with two string arguments named "cgroup"
+ * and "reason". */
+static const sd_bus_vtable manager_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ /* The empty macro argument after NULL is the (absent) list of input parameter names. */
+ SD_BUS_METHOD_WITH_NAMES("DumpByFileDescriptor",
+ NULL,,
+ "h",
+ SD_BUS_PARAM(fd),
+ bus_method_dump_by_fd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_SIGNAL_WITH_NAMES("Killed",
+ "ss",
+ SD_BUS_PARAM(cgroup)
+ SD_BUS_PARAM(reason),
+ 0),
+ SD_BUS_VTABLE_END
+};
+
+/* Bus object description binding manager_vtable to the object path and interface name below;
+ * handed to bus_add_implementation() in oomd-manager.c. */
+const BusObjectImplementation manager_object = {
+ "/org/freedesktop/oom1",
+ "org.freedesktop.oom1.Manager",
+ .vtables = BUS_VTABLES(manager_vtable),
+};
diff --git a/src/oom/oomd-manager-bus.h b/src/oom/oomd-manager-bus.h
new file mode 100644
index 0000000..7935b35
--- /dev/null
+++ b/src/oom/oomd-manager-bus.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "bus-object.h"
+
+/* Forward declaration only; the full definition is in oomd-manager.h. */
+typedef struct Manager Manager;
+
+/* Defined in oomd-manager-bus.c. */
+extern const BusObjectImplementation manager_object;
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
new file mode 100644
index 0000000..836eeb4
--- /dev/null
+++ b/src/oom/oomd-manager.c
@@ -0,0 +1,855 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-daemon.h"
+
+#include "bus-log-control-api.h"
+#include "bus-util.h"
+#include "bus-polkit.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "memory-util.h"
+#include "oomd-manager-bus.h"
+#include "oomd-manager.h"
+#include "path-util.h"
+#include "percent-util.h"
+
+/* One element of the "cgroups" array of a ManagedOOM message, as filled in by json_dispatch()
+ * in process_managed_oom_message(). */
+typedef struct ManagedOOMMessage {
+ ManagedOOMMode mode; /* parsed from the "mode" string by managed_oom_mode() */
+ char *path; /* "path" field; freed by managed_oom_message_destroy() */
+ char *property; /* "property" field; freed by managed_oom_message_destroy() */
+ uint32_t limit; /* optional "limit" field; only used when > 0 for "ManagedOOMMemoryPressure" */
+} ManagedOOMMessage;
+
+/* Releases the strings owned by 'message'. The struct itself is not freed (it is used as a
+ * stack object with _cleanup_), and the pointers are not reset. */
+static void managed_oom_message_destroy(ManagedOOMMessage *message) {
+ assert(message);
+ free(message->path);
+ free(message->property);
+}
+
+/* JSON dispatch callback for the "mode" field: converts the string in 'v' to a ManagedOOMMode
+ * and stores it in *userdata.
+ *
+ * Returns 0 on success; for an unrecognized string returns the result of json_log() with the
+ * negative value from managed_oom_mode_from_string(). 'name' is not used. */
+static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+ ManagedOOMMode *mode = userdata, m;
+ const char *s;
+
+ assert(mode);
+ /* The dispatch table declares this field as JSON_VARIANT_STRING, so a string is expected here. */
+ assert_se(s = json_variant_string(v));
+
+ m = managed_oom_mode_from_string(s);
+ if (m < 0)
+ return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);
+
+ *mode = m;
+ return 0;
+}
+
+/* Applies a ManagedOOM update to the manager's monitored-cgroup hashmaps.
+ *
+ * 'parameters' must contain a "cgroups" array; each object element carries "mode", "path",
+ * "property" and optionally "limit". 'uid' is the peer that sent the message: for a non-zero
+ * uid every cgroup path must be owned by that uid.
+ *
+ * Returns 0 on success, -EINVAL if "cgroups" is missing, -ENOMEM on allocation failure, -EPERM
+ * (after logging) on an ownership mismatch, or the error from toggling the swap timer source.
+ * Note that the early error returns skip the swap timer toggle at the end. */
+static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
+ JsonVariant *c, *cgroups;
+ int r;
+
+ static const JsonDispatch dispatch_table[] = {
+ { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMMessage, mode), JSON_MANDATORY },
+ { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, path), JSON_MANDATORY },
+ { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY },
+ { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 },
+ {},
+ };
+
+ assert(m);
+ assert(parameters);
+
+ cgroups = json_variant_by_key(parameters, "cgroups");
+ if (!cgroups)
+ return -EINVAL;
+
+ /* Skip malformed elements and keep processing in case the others are good */
+ JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
+ _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
+ OomdCGroupContext *ctx;
+ Hashmap *monitor_hm;
+ loadavg_t limit;
+
+ if (!json_variant_is_object(c))
+ continue;
+
+ r = json_dispatch(c, dispatch_table, NULL, 0, &message);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0)
+ continue;
+
+ if (uid != 0) {
+ uid_t cg_uid;
+
+ r = cg_path_get_owner_uid(message.path, &cg_uid);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path);
+ continue;
+ }
+
+ /* Let's not be lenient for permission errors and skip processing if we receive an
+ * update for a cgroup that doesn't belong to the user. */
+ if (uid != cg_uid)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "cgroup path owner UID does not match sender uid "
+ "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
+ }
+
+ /* Any property other than "ManagedOOMSwap" is routed to the memory pressure map. */
+ monitor_hm = streq(message.property, "ManagedOOMSwap") ?
+ m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
+
+ /* Mode "auto" removes the cgroup from the selected map. */
+ if (message.mode == MANAGED_OOM_AUTO) {
+ (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
+ continue;
+ }
+
+ limit = m->default_mem_pressure_limit;
+
+ /* A non-zero per-cgroup limit overrides the default; it is converted to permyriad and then
+ * split into integer and two-digit fractional percent. */
+ if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) {
+ int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit);
+
+ r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit);
+ if (r < 0)
+ continue;
+ }
+
+ r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0 && r != -EEXIST)
+ log_debug_errno(r, "Failed to insert message, ignoring: %m");
+
+ /* Always update the limit in case it was changed. For non-memory pressure detection the value is
+ * ignored so always updating it here is not a problem. */
+ ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
+ if (ctx)
+ ctx->mem_pressure_limit = limit;
+ }
+
+ /* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */
+ r = sd_event_source_set_enabled(m->swap_context_event_source,
+ hashmap_isempty(m->monitored_swap_cgroup_contexts) ? SD_EVENT_OFF : SD_EVENT_ON);
+ if (r < 0)
+ return log_error_errno(r, "Failed to toggle enabled state of swap context source: %m");
+
+ return 0;
+}
+
+/* Varlink method callback (bound to "io.systemd.oom.ReportManagedOOMCGroups" in
+ * manager_varlink_init()): looks up the peer's uid and forwards the parameters to
+ * process_managed_oom_message(). 'flags' is not used; 'userdata' must be the Manager. */
+static int process_managed_oom_request(
+ Varlink *link,
+ JsonVariant *parameters,
+ VarlinkMethodFlags flags,
+ void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ uid_t uid;
+ int r;
+
+ r = varlink_get_peer_uid(link, &uid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get varlink peer uid: %m");
+
+ return process_managed_oom_message(m, uid, parameters);
+}
+
+/* Varlink reply callback for the subscription set up in acquire_managed_oom_connect().
+ *
+ * On an error reply (-EIO) or a failed peer uid lookup the message is not processed; otherwise
+ * it is passed to process_managed_oom_message(). In every case, if the reply does not carry
+ * VARLINK_REPLY_CONTINUES the connection is closed and m->varlink_client is reset, so that the
+ * timer handlers reconnect on their next run. Returns the error or processing result. */
+static int process_managed_oom_reply(
+ Varlink *link,
+ JsonVariant *parameters,
+ const char *error_id,
+ VarlinkReplyFlags flags,
+ void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ uid_t uid;
+ int r;
+
+ if (error_id) {
+ r = -EIO;
+ log_debug("Error getting ManagedOOM cgroups: %s", error_id);
+ goto finish;
+ }
+
+ r = varlink_get_peer_uid(link, &uid);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get varlink peer uid: %m");
+ goto finish;
+ }
+
+ r = process_managed_oom_message(m, uid, parameters);
+
+finish:
+ if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ m->varlink_client = varlink_close_unref(link);
+
+ return r;
+}
+
+/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible
+ * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
+ *
+ * This function ignores most errors in order to handle cgroups that may have been cleaned up while
+ * populating the hashmap.
+ *
+ * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext }
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from enumerating/reading the
+ * first subgroup of 'path'. */
+static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
+ _cleanup_free_ char *subpath = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+ int r;
+
+ assert(new_h);
+ assert(path);
+
+ r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+ if (r < 0)
+ return r;
+
+ r = cg_read_subgroup(d, &subpath);
+ if (r < 0)
+ return r;
+ else if (r == 0) { /* No subgroups? We're a leaf node */
+ r = oomd_insert_cgroup_context(NULL, new_h, path);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
+ return 0;
+ }
+
+ /* At least one subgroup exists; 'subpath' holds the first one on entry to the loop. */
+ do {
+ _cleanup_free_ char *cg_path = NULL;
+ bool oom_group;
+
+ cg_path = path_join(empty_to_root(path), subpath);
+ if (!cg_path)
+ return -ENOMEM;
+
+ /* Release the name now so that the next cg_read_subgroup() call starts from NULL. */
+ subpath = mfree(subpath);
+
+ r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
+ /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0) {
+ log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
+ /* NOTE(review): this returns from the function, so the remaining siblings of this
+ * subgroup are not visited either — confirm that this is intended rather than
+ * continuing with the next subgroup. */
+ return 0;
+ }
+
+ if (oom_group)
+ r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
+ else
+ r = recursively_get_cgroup_context(new_h, cg_path);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
+ } while ((r = cg_read_subgroup(d, &subpath)) > 0);
+
+ /* A negative result from the final cg_read_subgroup() call is not propagated. */
+ return 0;
+}
+
+/* Rebuilds *monitored_cgroups: a new hashmap is populated by re-inserting the path of every
+ * existing context (the old map is passed to oomd_insert_cgroup_context() as well), and then
+ * replaces the old map, which is freed.
+ *
+ * Returns 0 on success or -ENOMEM; on -ENOMEM the original map is left in place. Other insert
+ * errors are ignored, so the corresponding entry is simply absent from the new map. */
+static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
+ _cleanup_hashmap_free_ Hashmap *new_base = NULL;
+ OomdCGroupContext *ctx;
+ int r;
+
+ assert(monitored_cgroups);
+
+ new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!new_base)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(ctx, *monitored_cgroups) {
+ /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
+ r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
+ if (r == -ENOMEM)
+ return r;
+ /* -EEXIST and -ENOENT are ignored silently; other errors are logged at debug level. */
+ if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
+ log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
+ }
+
+ hashmap_free(*monitored_cgroups);
+ *monitored_cgroups = TAKE_PTR(new_base);
+
+ return 0;
+}
+
+/* Builds a new hashmap of kill candidates by running recursively_get_cgroup_context() on the
+ * path of every context in 'monitored_cgroups'.
+ *
+ * On success returns 0 and stores the new map in *ret_candidates (ownership passes to the
+ * caller). Returns -ENOMEM on allocation failure, in which case *ret_candidates is untouched.
+ * Other per-cgroup errors are logged at debug level and ignored. */
+static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
+ _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+ OomdCGroupContext *ctx;
+ int r;
+
+ assert(monitored_cgroups);
+ assert(ret_candidates);
+
+ candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!candidates)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(ctx, monitored_cgroups) {
+ r = recursively_get_cgroup_context(candidates, ctx->path);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
+ }
+
+ *ret_candidates = TAKE_PTR(candidates);
+
+ return 0;
+}
+
+/* Replaces *candidates with a freshly collected candidate map for 'monitored_cgroups'. Before
+ * the old map is freed, oomd_update_cgroup_contexts_between_hashmaps() is called with the old
+ * and the new map (presumably to carry per-cgroup state over — see oomd-util.c).
+ *
+ * Returns 0 on success; on failure to collect candidates the error is logged at debug level
+ * and returned, and *candidates is left unchanged. */
+static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
+ _cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
+ int r;
+
+ assert(monitored_cgroups);
+ assert(candidates);
+ assert(*candidates);
+
+ r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get candidate contexts: %m");
+
+ oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);
+
+ hashmap_free(*candidates);
+ *candidates = TAKE_PTR(new_candidates);
+
+ return 0;
+}
+
+/* Opens the varlink client connection to VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM, attaches it to
+ * the manager's event loop, binds process_managed_oom_reply() as reply callback and issues the
+ * "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups" observe call.
+ *
+ * On success returns 0 and stores the connection in m->varlink_client. On any failure the error
+ * is logged and returned, and the partially set up connection is closed by the cleanup handler. */
+static int acquire_managed_oom_connect(Manager *m) {
+ _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
+ int r;
+
+ assert(m);
+ assert(m->event);
+
+ r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");
+
+ /* Return values of these three setters are deliberately ignored. */
+ (void) varlink_set_userdata(link, m);
+ (void) varlink_set_description(link, "oomd");
+ (void) varlink_set_relative_timeout(link, USEC_INFINITY);
+
+ r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ r = varlink_bind_reply(link, process_managed_oom_reply);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind reply callback: %m");
+
+ r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to observe varlink call: %m");
+
+ m->varlink_client = TAKE_PTR(link);
+ return 0;
+}
+
+/* Timer callback for swap monitoring. Re-arms itself SWAP_INTERVAL_USEC into the future,
+ * reconnects the varlink client if needed, refreshes the system context from /proc/meminfo and,
+ * when both available memory and free swap are below the configured threshold, tries to kill one
+ * candidate cgroup by swap usage and emits the "Killed" D-Bus signal for it.
+ *
+ * 'userdata' must be the Manager; the 'usec' argument is not used. Returns 0 normally and a
+ * negative value on errors that are not ignored (the source is set to exit-on-failure in
+ * monitor_swap_contexts()). */
+static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ usec_t usec_now;
+ int r;
+
+ assert(s);
+ /* The source is only enabled while swap cgroups are monitored (see process_managed_oom_message()). */
+ assert(!hashmap_isempty(m->monitored_swap_cgroup_contexts));
+
+ /* Reset timer */
+ /* Note: usec_now is fetched but not used further in this handler. */
+ r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
+ if (r < 0)
+ return log_error_errno(r, "Failed to reset event timer: %m");
+
+ r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+ /* Reconnect if our connection dropped */
+ if (!m->varlink_client) {
+ r = acquire_managed_oom_connect(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire varlink connection: %m");
+ }
+
+ /* Refresh the system-wide memory/swap figures; they are also what the dump output shows.
+ * NOTE(review): an earlier comment here said this also runs when no units want swap monitoring,
+ * but the assert above requires a non-empty map. */
+ r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
+ /* NOTE(review): every error is fatal here, although an earlier comment claimed that only ENOMEM
+ * is — confirm which is intended. */
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire system context: %m");
+
+ /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
+ * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
+ * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
+ * nodes are the ones that matter). */
+
+ /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */
+ if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
+ oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
+ _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+ _cleanup_free_ char *selected = NULL;
+ uint64_t threshold;
+
+ log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
+ "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
+ m->system_context.mem_used, m->system_context.mem_total,
+ m->system_context.swap_used, m->system_context.swap_total,
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+
+ /* On a non-ENOMEM failure 'candidates' stays NULL and is passed on as such. */
+ r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
+
+ /* Threshold passed to the kill helper: THRESHOLD_SWAP_USED_PERCENT of total swap. */
+ threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
+ r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_notice_errno(r, "Failed to kill any cgroups based on swap: %m");
+ else {
+ if (selected && r > 0) {
+ log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
+ "swap used (%"PRIu64") / total (%"PRIu64") being more than "
+ PERMYRIAD_AS_PERCENT_FORMAT_STR,
+ selected,
+ m->system_context.mem_used, m->system_context.mem_total,
+ m->system_context.swap_used, m->system_context.swap_total,
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+
+ /* send dbus signal */
+ (void) sd_bus_emit_signal(m->bus,
+ "/org/freedesktop/oom1",
+ "org.freedesktop.oom1.Manager",
+ "Killed",
+ "ss",
+ selected,
+ "memory-used");
+ }
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/* Cleanup helper for use with _cleanup_(): if *m is non-NULL, empties (but does not free) the
+ * manager's memory pressure candidate hashmap. Setting the guarded pointer to NULL disarms it. */
+static void clear_candidate_hashmapp(Manager **m) {
+ if (*m)
+ hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
+}
+
+/* Timer callback for memory pressure monitoring. Re-arms itself MEM_PRESSURE_INTERVAL_USEC into
+ * the future, reconnects the varlink client if needed, refreshes the monitored cgroup contexts
+ * and checks them against their pressure limits. If a limit was exceeded and no post-action
+ * delay is running, it attempts one kill by pgscan rate under the first target that had recent
+ * reclaim activity and emits the "Killed" D-Bus signal; otherwise it only refreshes the
+ * candidate data when some cgroup is currently over its limit.
+ *
+ * 'userdata' must be the Manager; the 'usec' argument is not used. Returns 0 normally and a
+ * negative value on errors that are not ignored. */
+static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+ /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
+ * update the candidate data (in which case clear_candidates will be NULL). */
+ _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
+ _cleanup_set_free_ Set *targets = NULL;
+ bool in_post_action_delay = false;
+ Manager *m = ASSERT_PTR(userdata);
+ usec_t usec_now;
+ int r;
+
+ assert(s);
+
+ /* Reset timer */
+ r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
+ if (r < 0)
+ return log_error_errno(r, "Failed to reset event timer: %m");
+
+ r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+ /* Reconnect if our connection dropped */
+ if (!m->varlink_client) {
+ r = acquire_managed_oom_connect(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire varlink connection: %m");
+ }
+
+ /* Return early if nothing is requesting memory pressure monitoring */
+ if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
+ return 0;
+
+ /* Update the cgroups used for detection/action */
+ r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
+
+ /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
+ * values and go on a kill storm. */
+ if (m->mem_pressure_post_action_delay_start > 0) {
+ if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
+ in_post_action_delay = true;
+ else
+ m->mem_pressure_post_action_delay_start = 0;
+ }
+
+ r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
+ else if (r == 1 && !in_post_action_delay) {
+ OomdCGroupContext *t;
+ SET_FOREACH(t, targets) {
+ _cleanup_free_ char *selected = NULL;
+
+ /* Check if there was reclaim activity in the given interval. The concern is the following case:
+ * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+ * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+ * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+ * to kill something (it won't help anyways). */
+ if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
+ continue;
+
+ log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
+ t->path,
+ LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
+ LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
+ FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
+
+ /* Refresh the candidates right before acting; on success disarm the clear-on-return guard. */
+ r = update_monitored_cgroup_contexts_candidates(
+ m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+ else
+ clear_candidates = NULL;
+
+ r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
+ /* prefix= */ t->path,
+ /* dry_run= */ m->dry_run,
+ &selected);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path);
+ else {
+ /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
+ * If r == 0 then it means there were no eligible candidates, the candidate cgroup
+ * disappeared, or the candidate cgroup has no processes by the time we tried to kill
+ * it. In any of these cases, go through the event loop again and select a new candidate if
+ * pressure is still high. */
+ /* The post-action delay is started even when r == 0, i.e. when nothing was killed. */
+ m->mem_pressure_post_action_delay_start = usec_now;
+ if (selected && r > 0) {
+ log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+ " for > %s with reclaim activity",
+ selected, t->path,
+ LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
+ LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
+ FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
+
+ /* send dbus signal */
+ (void) sd_bus_emit_signal(m->bus,
+ "/org/freedesktop/oom1",
+ "org.freedesktop.oom1.Manager",
+ "Killed",
+ "ss",
+ selected,
+ "memory-pressure");
+ }
+ return 0;
+ }
+ }
+ } else {
+ /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
+ * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
+ * might happen.
+ * Candidate cgroup data will continue to get updated during the post-action delay period in case
+ * pressure continues to be high after a kill. */
+ OomdCGroupContext *c;
+ HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
+ if (c->mem_pressure_limit_hit_start == 0)
+ continue;
+
+ r = update_monitored_cgroup_contexts_candidates(
+ m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+ else {
+ /* One successful refresh covers all monitored cgroups, so stop iterating. */
+ clear_candidates = NULL;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* Creates the CLOCK_MONOTONIC timer source that drives monitor_swap_contexts_handler() and
+ * stores it in m->swap_context_event_source. The source is marked exit-on-failure and starts out
+ * disabled; process_managed_oom_message() enables it once swap cgroups are monitored.
+ *
+ * Returns 0 on success or the negative error of the failing sd-event call. */
+static int monitor_swap_contexts(Manager *m) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ int r;
+
+ assert(m);
+ assert(m->event);
+
+ r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_exit_on_failure(s, true);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s, "oomd-swap-timer");
+
+ m->swap_context_event_source = TAKE_PTR(s);
+ return 0;
+}
+
+/* Creates the CLOCK_MONOTONIC timer source that drives
+ * monitor_memory_pressure_contexts_handler() and stores it in
+ * m->mem_pressure_context_event_source. Unlike the swap timer, this source is enabled right away.
+ *
+ * Returns 0 on success or the negative error of the failing sd-event call. */
+static int monitor_memory_pressure_contexts(Manager *m) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ int r;
+
+ assert(m);
+ assert(m->event);
+
+ r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_exit_on_failure(s, true);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
+
+ m->mem_pressure_context_event_source = TAKE_PTR(s);
+ return 0;
+}
+
+/* Releases everything the Manager references (varlink server and client, event sources, event
+ * loop, polkit registry, bus connection, the three cgroup context hashmaps) and then the Manager
+ * itself. 'm' must not be NULL. Returns the result of mfree(m) so callers can write
+ * "m = manager_free(m)". */
+Manager* manager_free(Manager *m) {
+ assert(m);
+
+ varlink_server_unref(m->varlink_server);
+ varlink_close_unref(m->varlink_client);
+ sd_event_source_unref(m->swap_context_event_source);
+ sd_event_source_unref(m->mem_pressure_context_event_source);
+ sd_event_unref(m->event);
+
+ bus_verify_polkit_async_registry_free(m->polkit_registry);
+ sd_bus_flush_close_unref(m->bus);
+
+ hashmap_free(m->monitored_swap_cgroup_contexts);
+ hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
+ hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
+
+ return mfree(m);
+}
+
+/* Allocates a zero-initialized Manager, acquires the default event loop, enables its watchdog
+ * support, registers SIGINT/SIGTERM with the loop (NULL handlers) and creates the three cgroup
+ * context hashmaps.
+ *
+ * On success returns 0 and stores the new object in *ret (the caller owns it). On failure
+ * returns -ENOMEM or the negative sd-event error; the partially built Manager is freed by the
+ * cleanup handler and *ret is untouched. */
+int manager_new(Manager **ret) {
+ _cleanup_(manager_freep) Manager *m = NULL;
+ int r;
+
+ assert(ret);
+
+ m = new0(Manager, 1);
+ if (!m)
+ return -ENOMEM;
+
+ r = sd_event_default(&m->event);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_set_watchdog(m->event, true);
+
+ r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!m->monitored_swap_cgroup_contexts)
+ return -ENOMEM;
+
+ m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!m->monitored_mem_pressure_cgroup_contexts)
+ return -ENOMEM;
+
+ m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(m);
+ return 0;
+}
+
+/* Connects the manager to the system bus (m->bus must still be unset), registers the
+ * manager_object implementation and the log control API, asynchronously requests the
+ * "org.freedesktop.oom1" name and attaches the connection to the event loop.
+ *
+ * Returns 0 on success or a negative error. Note that failures of bus_add_implementation() and
+ * bus_log_control_api_register() are returned without a log message from this function. */
+static int manager_connect_bus(Manager *m) {
+ int r;
+
+ assert(m);
+ assert(!m->bus);
+
+ r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to bus: %m");
+
+ r = bus_add_implementation(m->bus, &manager_object, m);
+ if (r < 0)
+ return r;
+
+ r = bus_log_control_api_register(m->bus);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request name: %m");
+
+ r = sd_bus_attach_event(m->bus, m->event, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach bus to event loop: %m");
+
+ return 0;
+}
+
+/* Sets up the varlink server on which "io.systemd.oom.ReportManagedOOMCGroups" calls are
+ * received (handled by process_managed_oom_request()).
+ *
+ * If 'fd' is negative the server listens on VARLINK_ADDR_PATH_MANAGED_OOM_USER with mode 0666,
+ * otherwise on the passed descriptor. On success returns 0 and stores the server in
+ * m->varlink_server (which must be unset on entry); on failure logs and returns the error. */
+static int manager_varlink_init(Manager *m, int fd) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+ assert(!m->varlink_server);
+
+ r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate varlink server object: %m");
+
+ /* The Manager is what the method callback receives as userdata. */
+ varlink_server_set_userdata(s, m);
+
+ r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
+ if (r < 0)
+ return log_error_errno(r, "Failed to register varlink method: %m");
+
+ if (fd < 0)
+ r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666);
+ else
+ r = varlink_server_listen_fd(s, fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind to varlink socket: %m");
+
+ r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ log_debug("Initialized systemd-oomd varlink server");
+
+ m->varlink_server = TAKE_PTR(s);
+ return 0;
+}
+
+/* Stores the runtime configuration in the Manager and brings up all of its components: bus
+ * connection, varlink client subscription, varlink server and the two monitoring timers.
+ *
+ * Parameters:
+ * dry_run - stored as-is and later passed to the kill helpers.
+ * swap_used_limit_permyriad - swap limit in 1/10000; a negative value selects
+ * DEFAULT_SWAP_USED_LIMIT_PERCENT. Must not exceed 10000.
+ * mem_pressure_limit_permyriad - default pressure limit in 1/10000; a negative value selects
+ * DEFAULT_MEM_PRESSURE_LIMIT_PERCENT. Must not exceed 10000.
+ * mem_pressure_usec - default pressure duration; 0 selects
+ * DEFAULT_MEM_PRESSURE_DURATION_USEC.
+ * fd - passed to manager_varlink_init(); negative means "listen on the
+ * socket path".
+ *
+ * Returns 0 on success or the first negative error encountered. */
+int manager_start(
+ Manager *m,
+ bool dry_run,
+ int swap_used_limit_permyriad,
+ int mem_pressure_limit_permyriad,
+ usec_t mem_pressure_usec,
+ int fd) {
+
+ unsigned long l, f;
+ int r;
+
+ assert(m);
+
+ m->dry_run = dry_run;
+
+ m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
+ assert(m->swap_used_limit_permyriad <= 10000);
+
+ /* Split the limit into integer percent (l) and two-digit fraction (f) for the fixed-point helper. */
+ if (mem_pressure_limit_permyriad >= 0) {
+ assert(mem_pressure_limit_permyriad <= 10000);
+
+ l = mem_pressure_limit_permyriad / 100;
+ f = mem_pressure_limit_permyriad % 100;
+ } else {
+ l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
+ f = 0;
+ }
+ r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
+ if (r < 0)
+ return r;
+
+ m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
+
+ r = manager_connect_bus(m);
+ if (r < 0)
+ return r;
+
+ r = acquire_managed_oom_connect(m);
+ if (r < 0)
+ return r;
+
+ r = manager_varlink_init(m, fd);
+ if (r < 0)
+ return r;
+
+ r = monitor_memory_pressure_contexts(m);
+ if (r < 0)
+ return r;
+
+ r = monitor_swap_contexts(m);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Renders a human-readable description of the manager state (settings, system context, swap
+ * monitored cgroups, memory pressure monitored cgroups) into a newly allocated string.
+ *
+ * On success returns 0 and stores the string in *ret; the caller must free it. Returns -errno
+ * if the memory stream cannot be opened, or the error from fflush_and_check(). */
+int manager_get_dump_string(Manager *m, char **ret) {
+ _cleanup_free_ char *dump = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ OomdCGroupContext *c;
+ size_t size;
+ /* Only serves as the key iteration variable of HASHMAP_FOREACH_KEY; its value is not read. */
+ char *key;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ f = open_memstream_unlocked(&dump, &size);
+ if (!f)
+ return -errno;
+
+ fprintf(f,
+ "Dry Run: %s\n"
+ "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
+ "Default Memory Pressure Limit: %lu.%02lu%%\n"
+ "Default Memory Pressure Duration: %s\n"
+ "System Context:\n",
+ yes_no(m->dry_run),
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
+ LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit),
+ FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
+ oomd_dump_system_context(&m->system_context, f, "\t");
+
+ fprintf(f, "Swap Monitored CGroups:\n");
+ HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
+ oomd_dump_swap_cgroup_context(c, f, "\t");
+
+ fprintf(f, "Memory Pressure Monitored CGroups:\n");
+ HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
+ oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ /* Close the stream before handing out the buffer it wrote into. */
+ f = safe_fclose(f);
+
+ *ret = TAKE_PTR(dump);
+ return 0;
+}
diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h
new file mode 100644
index 0000000..8f0dd41
--- /dev/null
+++ b/src/oom/oomd-manager.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-event.h"
+
+#include "conf-parser.h"
+#include "oomd-util.h"
+#include "varlink.h"
+
+/* Polling interval for monitoring stats */
+#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
+/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
+#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
+
+/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
+ * percentage of time all tasks were delayed (i.e. unproductive).
+ * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in
+ * system.slice are assumed to be less latency sensitive. */
+#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
+#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
+#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
+
+/* Only tackle candidates with large swap usage. */
+#define THRESHOLD_SWAP_USED_PERCENT 5
+
+#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
+#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
+
+typedef struct Manager Manager;
+
+/* State of one systemd-oomd daemon instance: bus/event handles, the effective limits and the sets of
+ * cgroup contexts being monitored. */
+struct Manager {
+ sd_bus *bus;
+ sd_event *event;
+
+ Hashmap *polkit_registry;
+
+ /* Effective settings; these are what manager_get_dump_string() reports. */
+ bool dry_run;
+ int swap_used_limit_permyriad;
+ loadavg_t default_mem_pressure_limit;
+ usec_t default_mem_pressure_duration_usec;
+
+ /* k: cgroup paths -> v: OomdCGroupContext
+ * Used to detect when to take action. */
+ Hashmap *monitored_swap_cgroup_contexts;
+ Hashmap *monitored_mem_pressure_cgroup_contexts;
+ Hashmap *monitored_mem_pressure_cgroup_contexts_candidates;
+
+ OomdSystemContext system_context;
+
+ usec_t mem_pressure_post_action_delay_start;
+
+ sd_event_source *swap_context_event_source;
+ sd_event_source *mem_pressure_context_event_source;
+
+ /* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to
+ * listen for changes in ManagedOOM settings (oomd client - systemd server). */
+ Varlink *varlink_client;
+ /* This varlink server object is used to manage systemd-oomd's varlink server which is used by user
+ * managers to report changes in ManagedOOM settings (oomd server - systemd client). */
+ VarlinkServer *varlink_server;
+};
+
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_new(Manager **ret);
+
+int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec, int fd);
+
+int manager_get_dump_string(Manager *m, char **ret);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_oomd_default);
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
new file mode 100644
index 0000000..391d846
--- /dev/null
+++ b/src/oom/oomd-util.c
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "procfs-util.h"
+#include "signal-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "user-util.h"
+
+/* Hash ops for string-keyed hashmaps whose values are OomdCGroupContext objects; oomd_cgroup_context_free()
+ * is registered as the value destructor. */
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+ oomd_cgroup_ctx_hash_ops,
+ char,
+ string_hash_func,
+ string_compare_func,
+ OomdCGroupContext,
+ oomd_cgroup_context_free);
+
+/* Logging callback handed to cg_kill()/cg_kill_recursive() by oomd_cgroup_kill(): emits a debug message
+ * naming the PID and the signal. `userdata` is unused. Always returns 0. */
+static int log_kill(pid_t pid, int sig, void *userdata) {
+ log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig));
+ return 0;
+}
+
+/* Adds `num_procs_killed` to the unsigned decimal counter kept in xattr `xattr` of cgroup `path`.
+ * An absent or empty xattr is treated as a counter of zero.
+ *
+ * Returns 0 on success, -EOVERFLOW if the sum does not fit into uint64_t, or the negative error from
+ * reading, parsing or writing the xattr. */
+static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) {
+        char formatted[DECIMAL_STR_MAX(uint64_t) + 1];
+        _cleanup_free_ char *stored = NULL;
+        uint64_t count = 0;
+        int r;
+
+        assert(path);
+        assert(xattr);
+
+        r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &stored);
+        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                return r;
+
+        if (!isempty(stored)) {
+                r = safe_atou64(stored, &count);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Refuse to wrap around instead of silently restarting the counter. */
+        if (num_procs_killed > UINT64_MAX - count)
+                return -EOVERFLOW;
+
+        count += num_procs_killed;
+        xsprintf(formatted, "%"PRIu64, count);
+
+        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, formatted, strlen(formatted), 0);
+        return r < 0 ? r : 0;
+}
+
+/* Releases `ctx` together with the path string it owns. Accepts NULL. Always returns NULL so that callers
+ * can reset their pointer in the same statement. */
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
+        if (ctx) {
+                free(ctx->path);
+                free(ctx);
+        }
+
+        return NULL;
+}
+
+/* Collects every context in `h` whose 10s memory pressure average has been above its own limit for at
+ * least `duration`.
+ *
+ * Side effect: `mem_pressure_limit_hit_start` of each context is set to the current monotonic time when
+ * the limit is first found exceeded, and reset to 0 as soon as pressure is no longer above the limit.
+ *
+ * Returns 1 and stores a newly allocated set in *ret (the entries are borrowed pointers into `h`) if at
+ * least one context qualifies, 0 with *ret set to NULL if none does, and -ENOMEM on allocation failure. */
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
+        _cleanup_set_free_ Set *targets = NULL;
+        OomdCGroupContext *ctx;
+        int r;
+
+        assert(h);
+        assert(ret);
+
+        targets = set_new(NULL);
+        if (!targets)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH(ctx, h) {
+                usec_t ts;
+
+                if (ctx->memory_pressure.avg10 <= ctx->mem_pressure_limit) {
+                        ctx->mem_pressure_limit_hit_start = 0;
+                        continue;
+                }
+
+                /* Read the clock once so that the start stamp and the elapsed time use the same instant. */
+                ts = now(CLOCK_MONOTONIC);
+                if (ctx->mem_pressure_limit_hit_start == 0)
+                        ctx->mem_pressure_limit_hit_start = ts;
+
+                if (ts - ctx->mem_pressure_limit_hit_start >= duration) {
+                        r = set_put(targets, ctx);
+                        if (r < 0)
+                                return -ENOMEM;
+                }
+        }
+
+        if (!set_isempty(targets)) {
+                *ret = TAKE_PTR(targets);
+                return 1;
+        }
+
+        *ret = NULL;
+        return 0;
+}
+
+/* Returns how much pgscan grew since the previous sample (pgscan - last_pgscan).
+ *
+ * pgscan is monotonic and in practice should not decrease. If the previous sample is nevertheless larger
+ * than the current one, the cgroup is assumed to have been recreated and the previous sample is taken to
+ * be zero. */
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
+        assert(c);
+
+        if (c->last_pgscan <= c->pgscan)
+                return c->pgscan - c->last_pgscan;
+
+        log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
+                  c->last_pgscan, c->pgscan, c->path);
+
+        return c->pgscan;
+}
+
+/* Returns true if available memory (mem_total - mem_used, clamped at zero) is below `threshold_permyriad`
+ * of total memory. `threshold_permyriad` is expressed in 1/10000ths and must not exceed 10000. */
+bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad) {
+        uint64_t available, limit;
+
+        assert(ctx);
+        assert(threshold_permyriad <= 10000);
+
+        available = LESS_BY(ctx->mem_total, ctx->mem_used);
+        limit = ctx->mem_total * threshold_permyriad / (uint64_t) 10000;
+
+        return available < limit;
+}
+
+/* Returns true if free swap (swap_total - swap_used) is below `threshold_permyriad` of total swap.
+ * `threshold_permyriad` is expressed in 1/10000ths and must not exceed 10000.
+ *
+ * The subtraction is clamped at zero, like in oomd_mem_available_below(), so that a context in which
+ * swap_used exceeds swap_total is treated as having no free swap rather than wrapping around to a huge
+ * value. */
+bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
+        uint64_t swap_threshold;
+
+        assert(ctx);
+        assert(threshold_permyriad <= 10000);
+
+        swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000;
+        return LESS_BY(ctx->swap_total, ctx->swap_used) < swap_threshold;
+}
+
+/* Updates `ctx->preference` from the `user.oomd_avoid` and `user.oomd_omit` xattrs of the cgroup.
+ *
+ * The xattrs are only honoured if the cgroup is owned by the same user as `prefix` or by root; otherwise
+ * the preference is reset to MANAGED_OOM_PREFERENCE_NONE. When they are honoured, a set xattr raises the
+ * preference (omit is applied last and so wins over avoid), while an unset or unreadable xattr leaves the
+ * current preference untouched.
+ *
+ * `prefix` may be NULL or empty, in which case it is treated as the root cgroup.
+ *
+ * Returns 0 on success, -EINVAL if `ctx->path` is not below `prefix`, -ENOMEM on allocation failure, or
+ * the negative error from looking up either cgroup's owner. */
+int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix) {
+        uid_t uid, prefix_uid;
+        int r;
+
+        assert(ctx);
+
+        prefix = empty_to_root(prefix);
+
+        if (!path_startswith(ctx->path, prefix))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "%s is not a descendant of %s", ctx->path, prefix);
+
+        r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, ctx->path, &uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get owner/group from %s: %m", ctx->path);
+
+        r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, prefix, &prefix_uid);
+        if (r < 0)
+                /* Name the cgroup that was actually queried, i.e. the prefix. */
+                return log_debug_errno(r, "Failed to get owner/group from %s: %m", prefix);
+
+        if (uid == prefix_uid || uid == 0) {
+                /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
+                 * as an optional feature of systemd-oomd (and the system might not even support them). */
+                r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, ctx->path, "user.oomd_avoid");
+                if (r == -ENOMEM)
+                        return log_oom_debug();
+                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                        log_debug_errno(r, "Failed to get xattr user.oomd_avoid, ignoring: %m");
+                ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
+
+                r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, ctx->path, "user.oomd_omit");
+                if (r == -ENOMEM)
+                        return log_oom_debug();
+                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                        log_debug_errno(r, "Failed to get xattr user.oomd_omit, ignoring: %m");
+                ctx->preference = r > 0 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
+        } else
+                ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
+
+        return 0;
+}
+
+/* Builds an array of the contexts in `h` that are kill candidates and sorts it with `compare_func`.
+ *
+ * Contexts whose path does not start with `prefix` (when one is given) and contexts whose preference is
+ * MANAGED_OOM_PREFERENCE_OMIT are left out. The preference of each remaining context is refreshed via
+ * oomd_fetch_cgroup_oom_preference() first.
+ *
+ * On success stores the newly allocated array in *ret and returns the number of entries in it. The caller
+ * frees the array; the entries are borrowed pointers into `h`. Returns -ENOMEM on allocation failure. */
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ OomdCGroupContext *item;
+ size_t k = 0;
+ int r;
+
+ assert(h);
+ assert(compare_func);
+ assert(ret);
+
+ /* Sized for the worst case in which no context is filtered out. */
+ sorted = new0(OomdCGroupContext*, hashmap_size(h));
+ if (!sorted)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(item, h) {
+ /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
+ if (item->path && prefix && !path_startswith(item->path, prefix))
+ continue;
+
+ /* Only an allocation failure is fatal here; any other error is deliberately ignored. */
+ r = oomd_fetch_cgroup_oom_preference(item, prefix);
+ if (r == -ENOMEM)
+ return r;
+
+ if (item->preference == MANAGED_OOM_PREFERENCE_OMIT)
+ continue;
+
+ sorted[k++] = item;
+ }
+
+ typesafe_qsort(sorted, k, compare_func);
+
+ *ret = TAKE_PTR(sorted);
+
+ assert(k <= INT_MAX);
+ return (int) k;
+}
+
+/* Sends SIGKILL to the processes in cgroup `path` (and in its descendants if `recurse` is set) and keeps
+ * the `user.oomd_ooms` and `user.oomd_kill` xattr counters of the cgroup up to date.
+ *
+ * With `dry_run` set nothing is killed or written; the would-be action is only logged and 0 is returned.
+ *
+ * Returns 1 if at least one process was killed, 0 if none was, and a negative error otherwise. */
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
+ _cleanup_set_free_ Set *pids_killed = NULL;
+ int r;
+
+ assert(path);
+
+ if (dry_run) {
+ _cleanup_free_ char *cg_path = NULL;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path);
+ if (r < 0)
+ return r;
+
+ log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
+ return 0;
+ }
+
+ pids_killed = set_new(NULL);
+ if (!pids_killed)
+ return -ENOMEM;
+
+ /* Updating the counters is best effort: a failure is logged but does not prevent the kill. */
+ r = increment_oomd_xattr(path, "user.oomd_ooms", 1);
+ if (r < 0)
+ log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m");
+
+ if (recurse)
+ r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+ else
+ r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+
+ /* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before
+ * we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and
+ * was removed between picking candidates and coming into this function. In either case, let's log
+ * about it and let the caller decide what to do once they know how many PIDs were killed. */
+ if (IN_SET(r, -ENOENT, -ENODEV))
+ log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path);
+ else if (r < 0)
+ return r;
+
+ if (set_isempty(pids_killed))
+ log_debug("Nothing killed when attempting to kill %s", path);
+
+ r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
+ if (r < 0)
+ log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
+
+ return set_size(pids_killed) != 0;
+}
+
+typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+
+/* Logs at LOG_INFO the number `n` of cgroups that were considered, followed by a dump of the first
+ * `dump_until` entries of `sorted`, each formatted by `dump_func`.
+ *
+ * Returns the result of log_dump(), or a negative error if the text could not be assembled. */
+static int dump_kill_candidates(OomdCGroupContext **sorted, int n, int dump_until, dump_candidate_func dump_func) {
+ /* Try dumping top offenders, ignoring any errors that might happen. */
+ _cleanup_free_ char *dump = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+ size_t size;
+
+ f = open_memstream_unlocked(&dump, &size);
+ if (!f)
+ return -errno;
+
+ fprintf(f, "Considered %d cgroups for killing, top candidates were:\n", n);
+ for (int i = 0; i < dump_until; i++)
+ dump_func(sorted[i], f, "\t");
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ return log_dump(LOG_INFO, dump);
+}
+
+/* Walks the candidates from `h` (restricted to `prefix` if given) in the order established by
+ * compare_pgscan_rate_and_memory_usage() and kills the first one for which oomd_cgroup_kill() does not
+ * fail.
+ *
+ * On a successful attempt *ret_selected receives a newly allocated copy of the chosen cgroup path and
+ * the result of oomd_cgroup_kill() is returned. If no attempt succeeded, *ret_selected is left untouched
+ * and the first error encountered is returned, or 0 if there was nothing to try. -ENOMEM is returned
+ * immediately. */
+int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ int n, r, ret = 0;
+ int dump_until;
+
+ assert(h);
+ assert(ret_selected);
+
+ n = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted);
+ if (n < 0)
+ return n;
+
+ dump_until = MIN(n, DUMP_ON_KILL_COUNT);
+ for (int i = 0; i < n; i++) {
+ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
+ * Continue since there might be "avoid" cgroups at the end. */
+ if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
+ continue;
+
+ r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run);
+ if (r == -ENOMEM)
+ return r; /* Treat oom as a hard error */
+ if (r < 0) {
+ if (ret == 0)
+ ret = r;
+ continue; /* Try to find something else to kill */
+ }
+
+ /* Make sure the dump reaches at least as far as the cgroup that was selected. */
+ dump_until = MAX(dump_until, i + 1);
+ char *selected = strdup(sorted[i]->path);
+ if (!selected)
+ return -ENOMEM;
+ *ret_selected = selected;
+ ret = r;
+ break;
+ }
+
+ dump_kill_candidates(sorted, n, dump_until, oomd_dump_memory_pressure_cgroup_context);
+
+ return ret;
+}
+
+/* Walks all candidates from `h` in the order established by compare_swap_usage() and kills the first one
+ * that uses more than `threshold_usage` of swap and for which oomd_cgroup_kill() does not fail.
+ *
+ * On a successful attempt *ret_selected receives a newly allocated copy of the chosen cgroup path and
+ * the result of oomd_cgroup_kill() is returned. If no attempt succeeded, *ret_selected is left untouched
+ * and the first error encountered is returned, or 0 if there was nothing to try. -ENOMEM is returned
+ * immediately. */
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ int n, r, ret = 0;
+ int dump_until;
+
+ assert(h);
+ assert(ret_selected);
+
+ n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
+ if (n < 0)
+ return n;
+
+ dump_until = MIN(n, DUMP_ON_KILL_COUNT);
+ /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
+ * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
+ for (int i = 0; i < n; i++) {
+ /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
+ * cgroups at the end. */
+ if (sorted[i]->swap_usage <= threshold_usage)
+ continue;
+
+ r = oomd_cgroup_kill(sorted[i]->path, /* recurse= */ true, /* dry_run= */ dry_run);
+ if (r == -ENOMEM)
+ return r; /* Treat oom as a hard error */
+ if (r < 0) {
+ if (ret == 0)
+ ret = r;
+ continue; /* Try to find something else to kill */
+ }
+
+ /* Make sure the dump reaches at least as far as the cgroup that was selected. */
+ dump_until = MAX(dump_until, i + 1);
+ char *selected = strdup(sorted[i]->path);
+ if (!selected)
+ return -ENOMEM;
+ *ret_selected = selected;
+ ret = r;
+ break;
+ }
+
+ dump_kill_candidates(sorted, n, dump_until, oomd_dump_swap_cgroup_context);
+
+ return ret;
+}
+
+/* Allocates a new OomdCGroupContext for cgroup `path` and fills it with a fresh sample of its memory
+ * pressure and memory usage.
+ *
+ * For the root cgroup only the pressure and the procfs-derived memory usage are read; for any other
+ * cgroup memory.current, memory.min, memory.low, memory.swap.current and the pgscan counter from
+ * memory.stat are read as well.
+ *
+ * On success stores the context in *ret (the caller owns it) and returns 0; returns a negative error
+ * otherwise. */
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
+ _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+ _cleanup_free_ char *p = NULL, *val = NULL;
+ bool is_root;
+ int r;
+
+ assert(path);
+ assert(ret);
+
+ ctx = new0(OomdCGroupContext, 1);
+ if (!ctx)
+ return -ENOMEM;
+
+ is_root = empty_or_root(path);
+ ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
+
+ r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
+ if (r < 0)
+ return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+
+ if (is_root) {
+ r = procfs_memory_get_used(&ctx->current_memory_usage);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory used from procfs: %m");
+ } else {
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
+
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
+
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
+
+ /* A missing memory.swap.current is tolerated: swap_usage then keeps its zero-initialized value. */
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
+ if (r == -ENODATA)
+ /* The kernel can be compiled without support for memory.swap.* files,
+ * or it can be disabled with boot param 'swapaccount=0' */
+ log_once(LOG_WARNING, "No kernel support for memory.swap.current from %s (try boot param swapaccount=1), ignoring.", path);
+ else if (r < 0)
+ return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
+
+ r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
+
+ r = safe_atou64(val, &ctx->pgscan);
+ if (r < 0)
+ return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
+ }
+
+ /* An empty path is stored in its normalized root form. */
+ ctx->path = strdup(empty_to_root(path));
+ if (!ctx->path)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(ctx);
+ return 0;
+}
+
+/* Reads MemTotal, MemAvailable, SwapTotal and SwapFree from the /proc/meminfo-style file at
+ * `proc_meminfo_path` and derives the system-wide memory and swap usage from them.
+ *
+ * On success stores the result in *ret and returns 0. *ret is only written on success. Returns -errno if
+ * the file cannot be opened, -EINVAL if any of the four fields is missing or if an "available"/"free"
+ * value exceeds its total, or the negative error from reading or converting a line. */
+int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret) {
+        _cleanup_fclose_ FILE *f = NULL;
+        unsigned field_filled = 0;
+        OomdSystemContext ctx = {};
+        uint64_t mem_available = 0, swap_free = 0;
+        int r;
+
+        enum {
+                MEM_TOTAL = 1U << 0,
+                MEM_AVAILABLE = 1U << 1,
+                SWAP_TOTAL = 1U << 2,
+                SWAP_FREE = 1U << 3,
+                ALL = MEM_TOTAL|MEM_AVAILABLE|SWAP_TOTAL|SWAP_FREE,
+        };
+
+        assert(proc_meminfo_path);
+        assert(ret);
+
+        f = fopen(proc_meminfo_path, "re");
+        if (!f)
+                return -errno;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                char *word;
+
+                r = read_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        /* End of file before all fields were seen: leave the loop so that the check
+                         * below logs which file is incomplete instead of failing silently. */
+                        break;
+
+                if ((word = startswith(line, "MemTotal:"))) {
+                        field_filled |= MEM_TOTAL;
+                        r = convert_meminfo_value_to_uint64_bytes(word, &ctx.mem_total);
+                } else if ((word = startswith(line, "MemAvailable:"))) {
+                        field_filled |= MEM_AVAILABLE;
+                        r = convert_meminfo_value_to_uint64_bytes(word, &mem_available);
+                } else if ((word = startswith(line, "SwapTotal:"))) {
+                        field_filled |= SWAP_TOTAL;
+                        r = convert_meminfo_value_to_uint64_bytes(word, &ctx.swap_total);
+                } else if ((word = startswith(line, "SwapFree:"))) {
+                        field_filled |= SWAP_FREE;
+                        r = convert_meminfo_value_to_uint64_bytes(word, &swap_free);
+                } else
+                        continue;
+
+                if (r < 0)
+                        return log_debug_errno(r, "Error converting '%s' from %s to uint64_t: %m", line, proc_meminfo_path);
+
+                /* Stop reading as soon as everything we need has been collected. */
+                if (field_filled == ALL)
+                        break;
+        }
+
+        if (field_filled != ALL)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "%s is missing expected fields", proc_meminfo_path);
+
+        if (mem_available > ctx.mem_total)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "MemAvailable (%" PRIu64 ") cannot be greater than MemTotal (%" PRIu64 ") %m",
+                                       mem_available,
+                                       ctx.mem_total);
+
+        if (swap_free > ctx.swap_total)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "SwapFree (%" PRIu64 ") cannot be greater than SwapTotal (%" PRIu64 ") %m",
+                                       swap_free,
+                                       ctx.swap_total);
+
+        ctx.mem_used = ctx.mem_total - mem_available;
+        ctx.swap_used = ctx.swap_total - swap_free;
+
+        *ret = ctx;
+        return 0;
+}
+
+/* Acquires a fresh OomdCGroupContext for `path` and inserts it into `new_h`, keyed by the path string
+ * owned by the context itself.
+ *
+ * If `old_h` holds a context for the same path, its pgscan value becomes the new context's last_pgscan
+ * and its pressure limit, limit-hit timestamp and last-reclaim timestamp are carried over.
+ * last_had_mem_reclaim is then set to the current monotonic time if the pgscan rate is non-zero.
+ *
+ * Returns 0 on success or a negative error; on error nothing is inserted into `new_h`. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
+ _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
+ OomdCGroupContext *old_ctx;
+ int r;
+
+ assert(new_h);
+ assert(path);
+
+ path = empty_to_root(path);
+
+ r = oomd_cgroup_context_acquire(path, &curr_ctx);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
+
+ assert_se(streq(path, curr_ctx->path));
+
+ old_ctx = hashmap_get(old_h, path);
+ if (old_ctx) {
+ curr_ctx->last_pgscan = old_ctx->pgscan;
+ curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
+ curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
+ curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+ }
+
+ if (oomd_pgscan_rate(curr_ctx) > 0)
+ curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+
+ r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
+ if (r < 0)
+ return r;
+
+ /* The hashmap now references the context, so disarm the cleanup handler. */
+ TAKE_PTR(curr_ctx);
+ return 0;
+}
+
+/* For every context in `curr_h` that also has an entry in `old_h` under the same path: take the old
+ * pgscan value as last_pgscan, carry over the pressure limit, limit-hit timestamp and last-reclaim
+ * timestamp, and stamp last_had_mem_reclaim with the current monotonic time if the resulting pgscan rate
+ * is non-zero. Contexts without an old counterpart are left untouched. */
+void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) {
+        OomdCGroupContext *curr;
+
+        assert(old_h);
+        assert(curr_h);
+
+        HASHMAP_FOREACH(curr, curr_h) {
+                OomdCGroupContext *prev = hashmap_get(old_h, curr->path);
+
+                if (prev) {
+                        curr->last_pgscan = prev->pgscan;
+                        curr->mem_pressure_limit = prev->mem_pressure_limit;
+                        curr->mem_pressure_limit_hit_start = prev->mem_pressure_limit_hit_start;
+                        curr->last_had_mem_reclaim = prev->last_had_mem_reclaim;
+
+                        if (oomd_pgscan_rate(curr) > 0)
+                                curr->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+                }
+        }
+}
+
+/* Writes the path and the swap usage of `ctx` to `f`, each line prefixed with `prefix` (NULL means no
+ * prefix). For the root cgroup no number is printed; the reader is pointed to the system context
+ * instead. */
+void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
+        assert(ctx);
+        assert(f);
+
+        if (empty_or_root(ctx->path)) {
+                fprintf(f,
+                        "%sPath: %s\n"
+                        "%s\tSwap Usage: (see System Context)\n",
+                        strempty(prefix), ctx->path,
+                        strempty(prefix));
+                return;
+        }
+
+        fprintf(f,
+                "%sPath: %s\n"
+                "%s\tSwap Usage: %s\n",
+                strempty(prefix), ctx->path,
+                strempty(prefix), FORMAT_BYTES(ctx->swap_usage));
+}
+
+/* Writes the path, pressure limit, pressure averages and current memory usage of `ctx` to `f`, each line
+ * prefixed with `prefix` (NULL means no prefix). For non-root cgroups memory.min, memory.low and both
+ * pgscan samples are printed as well. */
+void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
+ assert(ctx);
+ assert(f);
+
+ fprintf(f,
+ "%sPath: %s\n"
+ "%s\tMemory Pressure Limit: %lu.%02lu%%\n"
+ "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
+ "%s\tCurrent Memory Usage: %s\n",
+ strempty(prefix), ctx->path,
+ strempty(prefix), LOADAVG_INT_SIDE(ctx->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(ctx->mem_pressure_limit),
+ strempty(prefix),
+ LOADAVG_INT_SIDE(ctx->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg10),
+ LOADAVG_INT_SIDE(ctx->memory_pressure.avg60), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg60),
+ LOADAVG_INT_SIDE(ctx->memory_pressure.avg300), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg300),
+ FORMAT_TIMESPAN(ctx->memory_pressure.total, USEC_PER_SEC),
+ strempty(prefix), FORMAT_BYTES(ctx->current_memory_usage));
+
+ /* These values are not collected for the root cgroup (see oomd_cgroup_context_acquire()). */
+ if (!empty_or_root(ctx->path))
+ fprintf(f,
+ "%s\tMemory Min: %s\n"
+ "%s\tMemory Low: %s\n"
+ "%s\tPgscan: %" PRIu64 "\n"
+ "%s\tLast Pgscan: %" PRIu64 "\n",
+ strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_min),
+ strempty(prefix), FORMAT_BYTES_CGROUP_PROTECTION(ctx->memory_low),
+ strempty(prefix), ctx->pgscan,
+ strempty(prefix), ctx->last_pgscan);
+}
+
+/* Writes the used and total amounts of memory and swap from `ctx` to `f`, one line each, prefixed with
+ * `prefix` (NULL means no prefix). */
+void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) {
+ assert(ctx);
+ assert(f);
+
+ fprintf(f,
+ "%sMemory: Used: %s Total: %s\n"
+ "%sSwap: Used: %s Total: %s\n",
+ strempty(prefix),
+ FORMAT_BYTES(ctx->mem_used),
+ FORMAT_BYTES(ctx->mem_total),
+ strempty(prefix),
+ FORMAT_BYTES(ctx->swap_used),
+ FORMAT_BYTES(ctx->swap_total));
+}
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
new file mode 100644
index 0000000..7fd9e92
--- /dev/null
+++ b/src/oom/oomd-util.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "cgroup-util.h"
+#include "hashmap.h"
+#include "psi-util.h"
+
+#define DUMP_ON_KILL_COUNT 10
+#define GROWING_SIZE_PERCENTILE 80
+
+extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
+
+typedef struct OomdCGroupContext OomdCGroupContext;
+typedef struct OomdSystemContext OomdSystemContext;
+
+typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *);
+
+/* One sample of the memory-related state of a single cgroup, plus the bookkeeping that is carried over
+ * from the previous sample. */
+struct OomdCGroupContext {
+ /* cgroup path; owned by the context and released in oomd_cgroup_context_free(). */
+ char *path;
+
+ ResourcePressure memory_pressure;
+
+ uint64_t current_memory_usage;
+
+ /* The following are only read for non-root cgroups. */
+ uint64_t memory_min;
+ uint64_t memory_low;
+ uint64_t swap_usage;
+
+ /* pgscan is the current counter value, last_pgscan the value from the previous sample. */
+ uint64_t last_pgscan;
+ uint64_t pgscan;
+
+ ManagedOOMPreference preference;
+
+ /* These are only used for acting on high memory pressure. */
+ loadavg_t mem_pressure_limit;
+ usec_t mem_pressure_limit_hit_start;
+ usec_t last_had_mem_reclaim;
+};
+
+/* System-wide memory and swap figures as filled in by oomd_system_context_acquire(): the totals come
+ * straight from the parsed file, the "used" values are total minus available/free. */
+struct OomdSystemContext {
+ uint64_t mem_total;
+ uint64_t mem_used;
+ uint64_t swap_total;
+ uint64_t swap_used;
+};
+
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
+
+/* All hashmaps used with these functions are expected to be of the form
+ * key: cgroup paths -> value: OomdCGroupContext. */
+
+/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
+ * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
+ * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
+ * below the limit.
+ * Returns 0 and sets `ret` to NULL if no entries exceeded limits for `duration`.
+ * Returns -ENOMEM for allocation errors. */
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
+
+/* Returns true if the amount of memory available (see proc(5)) is below the permyriad of memory specified by `threshold_permyriad`. */
+bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad);
+
+/* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
+bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
+
+/* Returns pgscan - last_pgscan, accounting for corner cases. */
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c);
+
+/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
+ * (after the smallest values). */
+/* Orders by preference first (ascending), then by pgscan rate (descending), and finally by current
+ * memory usage (descending). */
+static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+        const OomdCGroupContext *a, *b;
+        uint64_t rate_a, rate_b;
+        int r;
+
+        assert(c1);
+        assert(c2);
+
+        a = *c1;
+        b = *c2;
+
+        r = CMP(a->preference, b->preference);
+        if (r != 0)
+                return r;
+
+        rate_a = oomd_pgscan_rate(a);
+        rate_b = oomd_pgscan_rate(b);
+
+        /* Operands are swapped on purpose: larger values sort first. */
+        r = CMP(rate_b, rate_a);
+        if (r != 0)
+                return r;
+
+        return CMP(b->current_memory_usage, a->current_memory_usage);
+}
+
+/* Orders by preference first (ascending), then by swap usage (descending). */
+static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+        const OomdCGroupContext *a, *b;
+        int r;
+
+        assert(c1);
+        assert(c2);
+
+        a = *c1;
+        b = *c2;
+
+        r = CMP(a->preference, b->preference);
+        if (r == 0)
+                /* Operands are swapped on purpose: larger values sort first. */
+                r = CMP(b->swap_usage, a->swap_usage);
+
+        return r;
+}
+
+/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`.
+ * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted.
+ * Returns the number of sorted items; negative on error. */
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret);
+
+/* If the cgroups represented by `ctx` and `prefix` are owned by the same user,
+ * or the cgroup represented by `ctx` is owned by root, then set `ctx->preference`
+ * using the `user.oomd_avoid` and `user.oomd_omit` xattrs. Otherwise, set
+ * `ctx->preference` to MANAGED_OOM_PREFERENCE_NONE.
+ *
+ * If `prefix` is NULL or the empty string, it is treated as root. If `prefix`
+ * does not specify an ancestor cgroup of `ctx`, -EINVAL is returned. Returns
+ * negative on all other errors. */
+int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix);
+
+/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
+
+/* The following oomd_kill_by_* functions return 1 if processes were killed, 0 if nothing was killed
+ * (which includes dry runs), or negative on error. */
+/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
+ * everything in `h` is a candidate.
+ * Returns the killed cgroup in ret_selected. */
+int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
+/* Fills `ret` from the meminfo-style file at `proc_meminfo_path` (MemTotal, MemAvailable, SwapTotal, SwapFree). */
+int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret);
+
+/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`.
+ *
+ * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there
+ * was no prior data to reference. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path);
+
+/* Update each OomdCGroupContext in `curr_h` with prior interval information from `old_h`. */
+void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h);
+
+void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix);
diff --git a/src/oom/oomd.c b/src/oom/oomd.c
new file mode 100644
index 0000000..1ccbed1
--- /dev/null
+++ b/src/oom/oomd.c
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+
+#include "bus-log-control-api.h"
+#include "bus-object.h"
+#include "cgroup-util.h"
+#include "conf-parser.h"
+#include "daemon-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "main-func.h"
+#include "oomd-manager-bus.h"
+#include "oomd-manager.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "psi-util.h"
+#include "signal-util.h"
+
+static bool arg_dry_run = false;
+static int arg_swap_used_limit_permyriad = -1;
+static int arg_mem_pressure_limit_permyriad = -1;
+static usec_t arg_mem_pressure_usec = 0;
+
+/* Loads the [OOM] section of oomd.conf and its oomd.conf.d drop-ins into the arg_* variables.
+ * Returns the result of config_parse_many_nulstr(). */
+static int parse_config(void) {
+ static const ConfigTableItem items[] = {
+ { "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &arg_swap_used_limit_permyriad },
+ { "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad },
+ { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec },
+ {}
+ };
+
+ return config_parse_many_nulstr(PKGSYSCONFDIR "/oomd.conf",
+ CONF_PATHS_NULSTR("systemd/oomd.conf.d"),
+ "OOM\0",
+ config_item_table_lookup,
+ items,
+ CONFIG_PARSE_WARN,
+ NULL,
+ NULL);
+}
+
+/* Prints the command line usage text to stdout, including a link to the systemd-oomd(8) man page.
+ * Returns 0, or the result of log_oom() if the link cannot be generated. */
+static int help(void) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ r = terminal_urlify_man("systemd-oomd", "8", &link);
+ if (r < 0)
+ return log_oom();
+
+ printf("%s [OPTIONS...]\n\n"
+ "Run the userspace out-of-memory (OOM) killer.\n\n"
+ " -h --help Show this help\n"
+ " --version Show package version\n"
+ " --dry-run Only print destructive actions instead of doing them\n"
+ " --bus-introspect=PATH Write D-Bus XML introspection data\n"
+ "\nSee the %s for details.\n",
+ program_invocation_short_name,
+ link);
+
+ return 0;
+}
+
+/* Parses the command line.
+ *
+ * Returns 1 if the daemon should go on starting up. --help, --version and --bus-introspect= are handled
+ * right here and their result is returned instead; run() stops whenever that result is <= 0. Returns
+ * -EINVAL for unknown options or stray positional arguments. */
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ ARG_VERSION = 0x100,
+ ARG_DRY_RUN,
+ ARG_BUS_INTROSPECT,
+ };
+
+ static const struct option options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "dry-run", no_argument, NULL, ARG_DRY_RUN },
+ { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
+ {}
+ };
+
+ int c;
+
+ assert(argc >= 0);
+ assert(argv);
+
+ while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+ switch (c) {
+
+ case 'h':
+ return help();
+
+ case ARG_VERSION:
+ return version();
+
+ case ARG_DRY_RUN:
+ arg_dry_run = true;
+ break;
+
+ case ARG_BUS_INTROSPECT:
+ return bus_introspect_implementations(
+ stdout,
+ optarg,
+ BUS_IMPLEMENTATIONS(&manager_object,
+ &log_control_object));
+
+ case '?':
+ return -EINVAL;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Anything left over after option parsing is a positional argument, which is not accepted. */
+ if (optind < argc)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "This program takes no arguments.");
+
+ return 1;
+}
+
+/* Daemon entry point: parses the command line and configuration, verifies the runtime requirements
+ * (PSI support, unified cgroup hierarchy, memory controller), then creates and starts the manager and
+ * runs its event loop. Returns 0 on clean exit or a negative error. */
+static int run(int argc, char *argv[]) {
+ _unused_ _cleanup_(notify_on_cleanup) const char *notify_msg = NULL;
+ _cleanup_(manager_freep) Manager *m = NULL;
+ _cleanup_free_ char *swap = NULL;
+ unsigned long long s = 0;
+ CGroupMask mask;
+ int r;
+
+ log_setup();
+
+ r = parse_argv(argc, argv);
+ if (r <= 0)
+ return r;
+
+ r = parse_config();
+ if (r < 0)
+ return r;
+
+ /* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other
+ * requirements do not have a reliable means to check for in code. */
+
+ /* At most one passed-in file descriptor is accepted; it is handed on to manager_start(). */
+ int n = sd_listen_fds(0);
+ if (n > 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Received too many file descriptors");
+
+ int fd = n == 1 ? SD_LISTEN_FDS_START : -1;
+
+ /* SwapTotal is always available in /proc/meminfo and defaults to 0, even on swap-disabled kernels. */
+ r = get_proc_field("/proc/meminfo", "SwapTotal", WHITESPACE, &swap);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m");
+
+ /* Missing swap only triggers a warning; startup continues. */
+ r = safe_atollu(swap, &s);
+ if (r < 0 || s == 0)
+ log_warning("No swap; memory pressure usage will be degraded");
+
+ if (!is_pressure_supported())
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported");
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the unified cgroups hierarchy");
+
+ r = cg_mask_supported(&mask);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get supported cgroup controllers: %m");
+
+ if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the cgroup memory controller.");
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+
+ /* 0 means "not configured"; any explicitly configured duration must be at least one second. */
+ if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s");
+
+ r = manager_new(&m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create manager: %m");
+
+ r = manager_start(
+ m,
+ arg_dry_run,
+ arg_swap_used_limit_permyriad,
+ arg_mem_pressure_limit_permyriad,
+ arg_mem_pressure_usec,
+ fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to start up daemon: %m");
+
+ notify_msg = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+ log_debug("systemd-oomd started%s.", arg_dry_run ? " in dry run mode" : "");
+
+ r = sd_event_loop(m->event);
+ if (r < 0)
+ return log_error_errno(r, "Event loop failed: %m");
+
+ return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf
new file mode 100644
index 0000000..b3a457f
--- /dev/null
+++ b/src/oom/oomd.conf
@@ -0,0 +1,20 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file, or by creating "drop-ins" in
+# the oomd.conf.d/ subdirectory. The latter is generally recommended.
+# Defaults can be restored by simply deleting this file and all drop-ins.
+#
+# Use 'systemd-analyze cat-config systemd/oomd.conf' to display the full config.
+#
+# See oomd.conf(5) for details.
+
+[OOM]
+#SwapUsedLimit=90%
+#DefaultMemoryPressureLimit=60%
+#DefaultMemoryPressureDurationSec=30s
diff --git a/src/oom/org.freedesktop.oom1.conf b/src/oom/org.freedesktop.oom1.conf
new file mode 100644
index 0000000..d00bdcd
--- /dev/null
+++ b/src/oom/org.freedesktop.oom1.conf
@@ -0,0 +1,47 @@
+<?xml version="1.0"?> <!--*-nxml-*-->
+<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd">
+
+<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
+
+<busconfig>
+
+ <policy user="systemd-oom">
+ <allow own="org.freedesktop.oom1"/>
+ <allow send_destination="org.freedesktop.oom1"/>
+ <allow receive_sender="org.freedesktop.oom1"/>
+ </policy>
+
+ <policy user="root">
+ <allow send_destination="org.freedesktop.oom1"/>
+ </policy>
+
+ <policy context="default">
+ <deny send_destination="org.freedesktop.oom1"/>
+
+ <!-- Generic interfaces -->
+
+ <allow send_destination="org.freedesktop.oom1"
+ send_interface="org.freedesktop.DBus.Introspectable"/>
+
+ <allow send_destination="org.freedesktop.oom1"
+ send_interface="org.freedesktop.DBus.Peer"/>
+
+ <allow send_destination="org.freedesktop.oom1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="Get"/>
+
+ <allow send_destination="org.freedesktop.oom1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="GetAll"/>
+
+ <!-- Manager interface -->
+
+ <allow send_destination="org.freedesktop.oom1"
+ send_interface="org.freedesktop.oom1.Manager"
+ send_member="DumpByFileDescriptor"/>
+
+ <allow receive_sender="org.freedesktop.oom1"/>
+ </policy>
+
+</busconfig>
diff --git a/src/oom/org.freedesktop.oom1.service b/src/oom/org.freedesktop.oom1.service
new file mode 100644
index 0000000..4fd5138
--- /dev/null
+++ b/src/oom/org.freedesktop.oom1.service
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.oom1
+Exec=/bin/false
+User=root
+SystemdService=dbus-org.freedesktop.oom1.service
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
new file mode 100644
index 0000000..1d12045
--- /dev/null
+++ b/src/oom/test-oomd-util.c
@@ -0,0 +1,513 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+
+/* Forks off a child that naps in one-second steps until somebody kills it. If nobody does so within
+ * sleep_min minutes, the child logs an error and aborts. The parent receives the child's PID. */
+static int fork_and_sleep(unsigned sleep_min) {
+        usec_t current, limit, started;
+        pid_t child;
+
+        child = fork();
+        assert_se(child >= 0);
+
+        /* Parent: hand the PID back to the caller right away. */
+        if (child != 0)
+                return child;
+
+        /* Child: this loop is only ever left through abort() or by being killed. */
+        limit = sleep_min * USEC_PER_MINUTE;
+        started = now(CLOCK_MONOTONIC);
+
+        for (;;) {
+                current = now(CLOCK_MONOTONIC);
+
+                if (started + limit < current) {
+                        log_error("Child timed out waiting to be killed");
+                        abort();
+                }
+
+                sleep(1);
+        }
+}
+
+/* Checks that oomd_cgroup_kill() empties a cgroup and that the "user.oomd_ooms" and "user.oomd_kill"
+ * xattrs on that cgroup count up across repeated kills. The test is skipped when not running as root,
+ * when the unified hierarchy is not in use, or when user xattrs cannot be set on the cgroup. */
+static void test_oomd_cgroup_kill(void) {
+        _cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL;
+        int children[2];
+        int r;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        if (cg_all_unified() <= 0)
+                return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+        assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0);
+
+        /* The forked processes are placed in a child cgroup owned by this test, so that pid1 does not
+         * remove it before the xattrs have been read back. */
+        cgroup = path_join(cgroup_root, "oomdkilltest");
+        assert_se(cgroup);
+        assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, cgroup) >= 0);
+
+        /* Failing to set xattrs most likely means we run in a userns or lack capabilities */
+        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "test", 4, 0);
+        if (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))
+                return (void) log_tests_skipped("Cannot set user xattrs");
+
+        /* Two rounds, so that the incrementing of the xattr counters is exercised too */
+        for (int round = 0; round < 2; round++) {
+                _cleanup_free_ char *value = NULL;
+                const char *expected_ooms, *expected_kills;
+
+                if (round == 0) {
+                        expected_ooms = "1";
+                        expected_kills = "2";
+                } else {
+                        expected_ooms = "2";
+                        expected_kills = "4";
+                }
+
+                for (int k = 0; k < 2; k++) {
+                        children[k] = fork_and_sleep(5);
+                        assert_se(cg_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, children[k]) >= 0);
+                }
+
+                r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */);
+                if (r <= 0) {
+                        log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup);
+                        abort();
+                }
+
+                assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_ooms", &value) >= 0);
+                assert_se(streq(value, expected_ooms));
+                value = mfree(value);
+
+                /* Give the killed processes a moment to actually go away. */
+                sleep(2);
+                assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true);
+
+                assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &value) >= 0);
+                assert_se(streq(value, expected_kills));
+        }
+}
+
+/* Acquires cgroup contexts for the cgroup the test runs in and for the root cgroup, then checks that
+ * oomd_insert_cgroup_context() refuses duplicates and carries selected fields over from an older
+ * hashmap into a newer one. Skipped without root, pressure support, the unified hierarchy or the
+ * memory controller. */
+static void test_oomd_cgroup_context_acquire_and_insert(void) {
+        _cleanup_hashmap_free_ Hashmap *older = NULL, *newer = NULL;
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+        _cleanup_free_ char *cgroup = NULL;
+        OomdCGroupContext *from_older, *from_newer;
+        CGroupMask mask;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        if (!is_pressure_supported())
+                return (void) log_tests_skipped("system does not support pressure");
+
+        if (cg_all_unified() <= 0)
+                return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+        assert_se(cg_mask_supported(&mask) >= 0);
+
+        if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
+                return (void) log_tests_skipped("cgroup memory controller is not available");
+
+        /* Context of the cgroup we are running in */
+        assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+        assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+
+        assert_se(streq(ctx->path, cgroup));
+        assert_se(ctx->current_memory_usage > 0);
+        assert_se(ctx->memory_min == 0);
+        assert_se(ctx->memory_low == 0);
+        assert_se(ctx->swap_usage == 0);
+        assert_se(ctx->last_pgscan == 0);
+        assert_se(ctx->pgscan == 0);
+        ctx = oomd_cgroup_context_free(ctx);
+
+        /* An empty path is reported back as the root cgroup */
+        assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
+        assert_se(streq(ctx->path, "/"));
+        assert_se(ctx->current_memory_usage > 0);
+
+        /* Inserting the same cgroup twice must fail with -EEXIST */
+        older = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+        assert_se(older);
+        assert_se(oomd_insert_cgroup_context(NULL, older, cgroup) == 0);
+        from_older = hashmap_get(older, cgroup);
+        assert_se(from_older);
+        assert_se(oomd_insert_cgroup_context(NULL, older, cgroup) == -EEXIST);
+
+        /* Plant recognizable values in the older entry and verify they show up in the newer map */
+        from_older->pgscan = UINT64_MAX;
+        from_older->mem_pressure_limit = 6789;
+        from_older->mem_pressure_limit_hit_start = 42;
+        from_older->last_had_mem_reclaim = 888;
+
+        newer = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+        assert_se(newer);
+        assert_se(oomd_insert_cgroup_context(older, newer, cgroup) == 0);
+
+        from_older = hashmap_get(older, cgroup);
+        from_newer = hashmap_get(newer, cgroup);
+        assert_se(from_older);
+        assert_se(from_newer);
+        assert_se(from_older != from_newer);
+
+        assert_se(from_newer->last_pgscan == UINT64_MAX);
+        assert_se(from_newer->mem_pressure_limit == 6789);
+        assert_se(from_newer->mem_pressure_limit_hit_start == 42);
+        assert_se(from_newer->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
+}
+
+/* Feeds two hand-built hashmaps, keyed by the same two cgroup paths, through
+ * oomd_update_cgroup_contexts_between_hashmaps() and checks which fields of the new contexts were
+ * filled in from the old ones. */
+static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
+        _cleanup_hashmap_free_ Hashmap *prev_map = NULL, *next_map = NULL;
+        OomdCGroupContext *prev, *next;
+        char **paths = STRV_MAKE("/0.slice",
+                                 "/1.slice");
+
+        OomdCGroupContext prev_ctx[2] = {
+                { .path = paths[0],
+                  .mem_pressure_limit = 5,
+                  .mem_pressure_limit_hit_start = 777,
+                  .last_had_mem_reclaim = 888,
+                  .pgscan = 57 },
+                { .path = paths[1],
+                  .mem_pressure_limit = 6,
+                  .mem_pressure_limit_hit_start = 888,
+                  .last_had_mem_reclaim = 888,
+                  .pgscan = 42 },
+        };
+
+        OomdCGroupContext next_ctx[2] = {
+                { .path = paths[0],
+                  .pgscan = 57 },
+                { .path = paths[1],
+                  .pgscan = 101 },
+        };
+
+        prev_map = hashmap_new(&string_hash_ops);
+        assert_se(prev_map);
+        next_map = hashmap_new(&string_hash_ops);
+        assert_se(next_map);
+
+        for (int i = 0; i < 2; i++) {
+                assert_se(hashmap_put(prev_map, paths[i], &prev_ctx[i]) >= 0);
+                assert_se(hashmap_put(next_map, paths[i], &next_ctx[i]) >= 0);
+        }
+
+        oomd_update_cgroup_contexts_between_hashmaps(prev_map, next_map);
+
+        /* "/0.slice": pgscan is the same (57) in both maps, and last_had_mem_reclaim is carried over */
+        prev = hashmap_get(prev_map, "/0.slice");
+        assert_se(prev);
+        next = hashmap_get(next_map, "/0.slice");
+        assert_se(next);
+        assert_se(prev->pgscan == next->last_pgscan);
+        assert_se(prev->mem_pressure_limit == next->mem_pressure_limit);
+        assert_se(prev->mem_pressure_limit_hit_start == next->mem_pressure_limit_hit_start);
+        assert_se(prev->last_had_mem_reclaim == next->last_had_mem_reclaim);
+
+        /* "/1.slice": pgscan went from 42 to 101, and last_had_mem_reclaim must have advanced */
+        prev = hashmap_get(prev_map, "/1.slice");
+        assert_se(prev);
+        next = hashmap_get(next_map, "/1.slice");
+        assert_se(next);
+        assert_se(prev->pgscan == next->last_pgscan);
+        assert_se(prev->mem_pressure_limit == next->mem_pressure_limit);
+        assert_se(prev->mem_pressure_limit_hit_start == next->mem_pressure_limit_hit_start);
+        assert_se(next->last_had_mem_reclaim > prev->last_had_mem_reclaim);
+}
+
+/* Runs oomd_system_context_acquire() against a missing file, an empty file, malformed contents and
+ * finally a well-formed meminfo-style file, checking the error codes and the parsed byte values.
+ * Skipped when not running as root. */
+static void test_oomd_system_context_acquire(void) {
+        _cleanup_(unlink_tempfilep) char path[] = "/tmp/oomdgetsysctxtestXXXXXX";
+        _cleanup_close_ int fd = -1;
+        OomdSystemContext parsed;
+        const char *garbage, *trailing_junk, *well_formed;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        garbage = "some\nwords\nacross\nmultiple\nlines";
+
+        trailing_junk = "MemTotal: 32495256 kB trailing\n"
+                        "MemFree: 9880512 kB data\n"
+                        "SwapTotal: 8388604 kB is\n"
+                        "SwapFree: 7604 kB bad\n";
+
+        well_formed = "MemTotal: 32495256 kB\n"
+                      "MemFree: 9880512 kB\n"
+                      "MemAvailable: 21777088 kB\n"
+                      "Buffers: 5968 kB\n"
+                      "Cached: 14344796 kB\n"
+                      "Unevictable: 740004 kB\n"
+                      "Mlocked: 4484 kB\n"
+                      "SwapTotal: 8388604 kB\n"
+                      "SwapFree: 7604 kB\n";
+
+        fd = mkostemp_safe(path);
+        assert_se(fd >= 0);
+
+        /* A path that does not exist */
+        assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &parsed) == -ENOENT);
+
+        /* The freshly created, still empty temporary file */
+        assert_se(oomd_system_context_acquire(path, &parsed) == -EINVAL);
+
+        /* Text that contains none of the expected fields */
+        assert_se(write_string_file(path, garbage, WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &parsed) == -EINVAL);
+
+        /* Expected fields, but with extra words after the unit */
+        assert_se(write_string_file(path, trailing_junk, WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &parsed) == -EINVAL);
+
+        /* Well-formed input: the kB figures come back as bytes */
+        assert_se(write_string_file(path, well_formed, WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &parsed) == 0);
+        assert_se(parsed.mem_total == 33275142144);
+        assert_se(parsed.mem_used == 10975404032);
+        assert_se(parsed.swap_total == 8589930496);
+        assert_se(parsed.swap_used == 8582144000);
+}
+
+/* Exercises oomd_pressure_above() with one cgroup context whose memory pressure averages (99.99) lie
+ * above the 80.0 limit and one whose averages (1.11) lie below it, first in separate hashmaps and
+ * then together in one. */
+static void test_oomd_pressure_above(void) {
+        _cleanup_hashmap_free_ Hashmap *hot_map = NULL, *calm_map = NULL;
+        _cleanup_set_free_ Set *hot_hits = NULL, *calm_hits = NULL, *mixed_hits = NULL;
+        OomdCGroupContext ctx[2] = {}, *found;
+        loadavg_t threshold;
+
+        assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0);
+
+        /* ctx[0] plays /herp.slice: far above the limit */
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg10)) == 0);
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0);
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0);
+        ctx[0].mem_pressure_limit = threshold;
+
+        /* ctx[1] plays /derp.slice: far below the limit */
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0);
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0);
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0);
+        ctx[1].mem_pressure_limit = threshold;
+
+        /* Only the high-pressure cgroup: it is reported and its hit-start timestamp gets set */
+        hot_map = hashmap_new(&string_hash_ops);
+        assert_se(hot_map);
+        assert_se(hashmap_put(hot_map, "/herp.slice", &ctx[0]) >= 0);
+        assert_se(oomd_pressure_above(hot_map, 0 /* duration */, &hot_hits) == 1);
+        assert_se(set_contains(hot_hits, &ctx[0]));
+        found = hashmap_get(hot_map, "/herp.slice");
+        assert_se(found);
+        assert_se(found->mem_pressure_limit_hit_start > 0);
+
+        /* Only the low-pressure cgroup: nothing is reported and no timestamp is set */
+        calm_map = hashmap_new(&string_hash_ops);
+        assert_se(calm_map);
+        assert_se(hashmap_put(calm_map, "/derp.slice", &ctx[1]) >= 0);
+        assert_se(oomd_pressure_above(calm_map, 0 /* duration */, &calm_hits) == 0);
+        assert_se(!calm_hits);
+        found = hashmap_get(calm_map, "/derp.slice");
+        assert_se(found);
+        assert_se(found->mem_pressure_limit_hit_start == 0);
+
+        /* Both cgroups in one map: only the high-pressure one ends up in the result set */
+        assert_se(hashmap_put(hot_map, "/derp.slice", &ctx[1]) >= 0);
+        assert_se(oomd_pressure_above(hot_map, 0 /* duration */, &mixed_hits) == 1);
+        assert_se(set_contains(mixed_hits, &ctx[0]));
+        assert_se(set_size(mixed_hits) == 1);
+        found = hashmap_get(hot_map, "/herp.slice");
+        assert_se(found);
+        assert_se(found->mem_pressure_limit_hit_start > 0);
+        found = hashmap_get(hot_map, "/derp.slice");
+        assert_se(found);
+        assert_se(found->mem_pressure_limit_hit_start == 0);
+}
+
+/* Checks oomd_mem_available_below() and oomd_swap_free_below() with a threshold argument of 2000
+ * against three system snapshots: swap nearly exhausted, memory nearly exhausted, and all zeroes. */
+static void test_oomd_mem_and_swap_free_below(void) {
+        OomdSystemContext swap_exhausted = {
+                .mem_total = UINT64_C(20971512) * 1024U,
+                .mem_used = UINT64_C(3310136) * 1024U,
+                .swap_total = UINT64_C(20971512) * 1024U,
+                .swap_used = UINT64_C(20971440) * 1024U,
+        };
+        OomdSystemContext mem_exhausted = {
+                .mem_total = UINT64_C(20971512) * 1024U,
+                .mem_used = UINT64_C(20971440) * 1024U,
+                .swap_total = UINT64_C(20971512) * 1024U,
+                .swap_used = UINT64_C(3310136) * 1024U,
+        };
+        OomdSystemContext all_zero = {
+                .mem_total = 0,
+                .mem_used = 0,
+                .swap_total = 0,
+                .swap_used = 0,
+        };
+
+        /* Plenty of memory left, swap almost full */
+        assert_se(oomd_mem_available_below(&swap_exhausted, 2000) == false);
+        assert_se(oomd_swap_free_below(&swap_exhausted, 2000) == true);
+
+        /* Memory almost full, plenty of swap left */
+        assert_se(oomd_mem_available_below(&mem_exhausted, 2000) == true);
+        assert_se(oomd_swap_free_below(&mem_exhausted, 2000) == false);
+
+        /* Zero totals must not be reported as being below the threshold */
+        assert_se(oomd_mem_available_below(&all_zero, 2000) == false);
+        assert_se(oomd_swap_free_below(&all_zero, 2000) == false);
+}
+
+/* Builds seven hand-crafted cgroup contexts, one of them marked OMIT and one AVOID, and checks the
+ * count and order returned by oomd_sort_cgroup_contexts() for the swap-usage comparator, for the
+ * pgscan-rate/memory-usage comparator, and for the latter restricted to a path prefix. */
+static void test_oomd_sort_cgroups(void) {
+        _cleanup_hashmap_free_ Hashmap *h = NULL;
+        /* Must start out as NULL: the cleanup handler frees whatever this holds whenever the function
+         * is left, including before the first successful sort. */
+        _cleanup_free_ OomdCGroupContext **sorted_cgroups = NULL;
+        char **paths = STRV_MAKE("/herp.slice",
+                                 "/herp.slice/derp.scope",
+                                 "/herp.slice/derp.scope/sheep.service",
+                                 "/zupa.slice",
+                                 "/boop.slice",
+                                 "/omitted.slice",
+                                 "/avoid.slice");
+
+        OomdCGroupContext ctx[7] = {
+                { .path = paths[0],
+                  .swap_usage = 20,
+                  .last_pgscan = 0,
+                  .pgscan = 33,
+                  .current_memory_usage = 10 },
+                { .path = paths[1],
+                  .swap_usage = 60,
+                  .last_pgscan = 33,
+                  .pgscan = 1,
+                  .current_memory_usage = 20 },
+                { .path = paths[2],
+                  .swap_usage = 40,
+                  .last_pgscan = 1,
+                  .pgscan = 33,
+                  .current_memory_usage = 40 },
+                { .path = paths[3],
+                  .swap_usage = 10,
+                  .last_pgscan = 33,
+                  .pgscan = 2,
+                  .current_memory_usage = 10 },
+                { .path = paths[4],
+                  .swap_usage = 11,
+                  .last_pgscan = 33,
+                  .pgscan = 33,
+                  .current_memory_usage = 10 },
+                { .path = paths[5],
+                  .swap_usage = 90,
+                  .last_pgscan = 0,
+                  .pgscan = UINT64_MAX,
+                  .preference = MANAGED_OOM_PREFERENCE_OMIT },
+                { .path = paths[6],
+                  .swap_usage = 99,
+                  .last_pgscan = 0,
+                  .pgscan = UINT64_MAX,
+                  .preference = MANAGED_OOM_PREFERENCE_AVOID },
+        };
+
+        assert_se(h = hashmap_new(&string_hash_ops));
+
+        assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0);
+        assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
+        assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
+        assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+        assert_se(hashmap_put(h, "/boop.slice", &ctx[4]) >= 0);
+        assert_se(hashmap_put(h, "/omitted.slice", &ctx[5]) >= 0);
+        assert_se(hashmap_put(h, "/avoid.slice", &ctx[6]) >= 0);
+
+        /* By swap usage: six of the seven contexts come back (the OMIT one is left out), in descending
+         * swap_usage order, with the AVOID one last despite having the largest swap_usage. */
+        assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 6);
+        assert_se(sorted_cgroups[0] == &ctx[1]);
+        assert_se(sorted_cgroups[1] == &ctx[2]);
+        assert_se(sorted_cgroups[2] == &ctx[0]);
+        assert_se(sorted_cgroups[3] == &ctx[4]);
+        assert_se(sorted_cgroups[4] == &ctx[3]);
+        assert_se(sorted_cgroups[5] == &ctx[6]);
+        sorted_cgroups = mfree(sorted_cgroups);
+
+        /* By pgscan rate and memory usage: again six results, with the AVOID context last. */
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, NULL, &sorted_cgroups) == 6);
+        assert_se(sorted_cgroups[0] == &ctx[0]);
+        assert_se(sorted_cgroups[1] == &ctx[2]);
+        assert_se(sorted_cgroups[2] == &ctx[3]);
+        assert_se(sorted_cgroups[3] == &ctx[1]);
+        assert_se(sorted_cgroups[4] == &ctx[4]);
+        assert_se(sorted_cgroups[5] == &ctx[6]);
+        sorted_cgroups = mfree(sorted_cgroups);
+
+        /* Restricted to a prefix: only the two contexts at or below "/herp.slice/derp.scope" are
+         * returned. The remaining slots of the returned array are expected to be NULL. */
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
+        assert_se(sorted_cgroups[0] == &ctx[2]);
+        assert_se(sorted_cgroups[1] == &ctx[1]);
+        assert_se(sorted_cgroups[2] == NULL);
+        assert_se(sorted_cgroups[3] == NULL);
+        assert_se(sorted_cgroups[4] == NULL);
+        assert_se(sorted_cgroups[5] == NULL);
+        assert_se(sorted_cgroups[6] == NULL);
+        sorted_cgroups = mfree(sorted_cgroups);
+}
+
+/* Checks how oomd_fetch_cgroup_oom_preference() derives a context's preference from the
+ * "user.oomd_omit" and "user.oomd_avoid" xattrs: on the cgroup the test runs in, on the root cgroup,
+ * with a prefix the cgroup is not below, and after the cgroup has been handed to another owner.
+ * Skipped without root, pressure support, the unified hierarchy or the memory controller. The
+ * xattr-dependent parts are skipped if xattrs cannot be set on the cgroup. */
+static void test_oomd_fetch_cgroup_oom_preference(void) {
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+        _cleanup_free_ char *cgroup = NULL;
+        ManagedOOMPreference root_pref;
+        CGroupMask mask;
+        bool test_xattrs;
+        int root_xattrs, r;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        if (!is_pressure_supported())
+                return (void) log_tests_skipped("system does not support pressure");
+
+        if (cg_all_unified() <= 0)
+                return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+        assert_se(cg_mask_supported(&mask) >= 0);
+
+        if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
+                return (void) log_tests_skipped("cgroup memory controller is not available");
+
+        assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+        assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+
+        /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
+         * so skip the xattr portions of the test. */
+        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
+        test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
+
+        if (test_xattrs) {
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0);
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+
+                /* omit takes precedence over avoid when both are set to true */
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0);
+                assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT);
+        } else {
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) < 0);
+                assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+        }
+        ctx = oomd_cgroup_context_free(ctx);
+
+        /* also check when only avoid is set to true */
+        if (test_xattrs) {
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0);
+                assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+                assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0);
+                assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
+                ctx = oomd_cgroup_context_free(ctx);
+        }
+
+        /* Test the root cgroup */
+        /* Root cgroup is live and not made on demand like the cgroup the test runs in. It can have varying
+         * xattrs set already so let's read in the booleans first to get the final preference value.
+         * The expectation has to honour the same precedence as asserted above: avoid is evaluated
+         * first and omit, if set, overrides it. */
+        assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
+        root_pref = MANAGED_OOM_PREFERENCE_NONE;
+        root_xattrs = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, "", "user.oomd_avoid");
+        if (root_xattrs > 0)
+                root_pref = MANAGED_OOM_PREFERENCE_AVOID;
+        root_xattrs = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, "", "user.oomd_omit");
+        if (root_xattrs > 0)
+                root_pref = MANAGED_OOM_PREFERENCE_OMIT;
+        assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0);
+        assert_se(ctx->preference == root_pref);
+
+        /* A prefix that the context's path does not start with is rejected */
+        assert_se(oomd_fetch_cgroup_oom_preference(ctx, "/herp.slice/derp.scope") == -EINVAL);
+
+        /* Assert that avoid/omit are not set if the cgroup and prefix are not
+         * owned by the same user. */
+        if (test_xattrs && !empty_or_root(cgroup)) {
+                ctx = oomd_cgroup_context_free(ctx);
+                assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 61183, 0) >= 0);
+                assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+
+                /* The default prefix has a different owner than the cgroup: preference stays unset */
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, NULL) == 0);
+                assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+
+                /* With the cgroup as its own prefix the owners match and the xattr is honoured again */
+                assert_se(oomd_fetch_cgroup_oom_preference(ctx, ctx->path) == 0);
+                assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
+        }
+}
+
+/* Test entry point: runs the tests that work purely on in-memory data first, then moves into a test
+ * cgroup scope and runs the tests that need live cgroups. If that scope cannot be entered the
+ * remaining tests are reported as skipped. */
+int main(void) {
+        int rc;
+
+        test_setup_logging(LOG_DEBUG);
+
+        /* Tests operating on hand-built data only */
+        test_oomd_update_cgroup_contexts_between_hashmaps();
+        test_oomd_system_context_acquire();
+        test_oomd_pressure_above();
+        test_oomd_mem_and_swap_free_below();
+        test_oomd_sort_cgroups();
+
+        /* Everything below operates on live cgroups */
+        rc = enter_cgroup_root(NULL);
+        if (rc < 0)
+                return log_tests_skipped_errno(rc, "failed to enter a test cgroup scope");
+
+        test_oomd_cgroup_kill();
+        test_oomd_cgroup_context_acquire_and_insert();
+        test_oomd_fetch_cgroup_oom_preference();
+
+        return 0;
+}