summary refs log tree commit diff stats
path: root/src/core/cgroup.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/cgroup.h')
-rw-r--r-- src/core/cgroup.h 429
1 files changed, 429 insertions, 0 deletions
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
new file mode 100644
index 0000000..f1b674b
--- /dev/null
+++ b/src/core/cgroup.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "cpu-set-util.h"
+#include "firewall-util.h"
+#include "list.h"
+#include "pidref.h"
+#include "time-util.h"
+
+typedef struct CGroupTasksMax {
+ /* If scale == 0, just use value; otherwise, value / scale.
+ * See cgroup_tasks_max_resolve(). */
+ uint64_t value;
+ uint64_t scale;
+} CGroupTasksMax;
+
+#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })
+
+static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
+ return tasks_max->value != UINT64_MAX || tasks_max->scale != 0; /* i.e. differs from CGROUP_TASKS_MAX_UNSET */
+}
+
+uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
+
+typedef struct CGroupContext CGroupContext;
+typedef struct CGroupDeviceAllow CGroupDeviceAllow;
+typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
+typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
+typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
+typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
+typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
+typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
+typedef struct CGroupSocketBindItem CGroupSocketBindItem;
+
+typedef enum CGroupDevicePolicy {
+ /* When devices are listed, allow those plus the built-in ones; if none are listed, allow
+ * everything. */
+ CGROUP_DEVICE_POLICY_AUTO,
+
+ /* Everything forbidden, except built-in ones and listed ones. */
+ CGROUP_DEVICE_POLICY_CLOSED,
+
+ /* Everything forbidden, except for the listed devices. */
+ CGROUP_DEVICE_POLICY_STRICT,
+
+ _CGROUP_DEVICE_POLICY_MAX,
+ _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
+} CGroupDevicePolicy;
+
+typedef enum FreezerAction {
+ FREEZER_FREEZE,
+ FREEZER_THAW,
+
+ _FREEZER_ACTION_MAX,
+ _FREEZER_ACTION_INVALID = -EINVAL,
+} FreezerAction;
+
+typedef enum CGroupDevicePermissions {
+ /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
+ CGROUP_DEVICE_MKNOD = 1 << 0,
+ CGROUP_DEVICE_READ = 1 << 1,
+ CGROUP_DEVICE_WRITE = 1 << 2,
+ _CGROUP_DEVICE_PERMISSIONS_MAX = 1 << 3,
+ _CGROUP_DEVICE_PERMISSIONS_ALL = _CGROUP_DEVICE_PERMISSIONS_MAX - 1, /* mask of all bits below _MAX, i.e. MKNOD|READ|WRITE */
+ _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
+} CGroupDevicePermissions;
+
+struct CGroupDeviceAllow {
+ LIST_FIELDS(CGroupDeviceAllow, device_allow);
+ char *path;
+ CGroupDevicePermissions permissions;
+};
+
+struct CGroupIODeviceWeight {
+ LIST_FIELDS(CGroupIODeviceWeight, device_weights);
+ char *path;
+ uint64_t weight;
+};
+
+struct CGroupIODeviceLimit {
+ LIST_FIELDS(CGroupIODeviceLimit, device_limits);
+ char *path;
+ uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+};
+
+struct CGroupIODeviceLatency {
+ LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
+ char *path;
+ usec_t target_usec;
+};
+
+struct CGroupBlockIODeviceWeight {
+ LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
+ char *path;
+ uint64_t weight;
+};
+
+struct CGroupBlockIODeviceBandwidth {
+ LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
+ char *path;
+ uint64_t rbps;
+ uint64_t wbps;
+};
+
+struct CGroupBPFForeignProgram {
+ LIST_FIELDS(CGroupBPFForeignProgram, programs);
+ uint32_t attach_type;
+ char *bpffs_path;
+};
+
+struct CGroupSocketBindItem {
+ LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
+ int address_family;
+ int ip_protocol;
+ uint16_t nr_ports; /* NOTE(review): presumably the number of consecutive ports starting at port_min — confirm */
+ uint16_t port_min;
+};
+
+typedef enum CGroupPressureWatch {
+ CGROUP_PRESSURE_WATCH_OFF, /* → tells the service payload explicitly not to watch for memory pressure */
+ CGROUP_PRESSURE_WATCH_AUTO, /* → on if memory accounting is on anyway for the unit, otherwise off */
+ CGROUP_PRESSURE_WATCH_ON, /* → on, regardless of whether memory accounting is on */
+ CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
+ _CGROUP_PRESSURE_WATCH_MAX,
+ _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
+} CGroupPressureWatch;
+
+struct CGroupContext {
+ bool cpu_accounting;
+ bool io_accounting;
+ bool blockio_accounting;
+ bool memory_accounting;
+ bool tasks_accounting;
+ bool ip_accounting;
+
+ /* Configures the memory.oom.group attribute (on the unified hierarchy) */
+ bool memory_oom_group;
+
+ bool delegate;
+ CGroupMask delegate_controllers;
+ CGroupMask disable_controllers;
+ char *delegate_subgroup;
+
+ /* For unified hierarchy */
+ uint64_t cpu_weight;
+ uint64_t startup_cpu_weight;
+ usec_t cpu_quota_per_sec_usec;
+ usec_t cpu_quota_period_usec;
+
+ CPUSet cpuset_cpus;
+ CPUSet startup_cpuset_cpus;
+ CPUSet cpuset_mems;
+ CPUSet startup_cpuset_mems;
+
+ uint64_t io_weight;
+ uint64_t startup_io_weight;
+ LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
+ LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
+ LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
+
+ uint64_t default_memory_min;
+ uint64_t default_memory_low;
+ uint64_t default_startup_memory_low;
+ uint64_t memory_min;
+ uint64_t memory_low;
+ uint64_t startup_memory_low;
+ uint64_t memory_high;
+ uint64_t startup_memory_high;
+ uint64_t memory_max;
+ uint64_t startup_memory_max;
+ uint64_t memory_swap_max;
+ uint64_t startup_memory_swap_max;
+ uint64_t memory_zswap_max;
+ uint64_t startup_memory_zswap_max;
+
+ bool default_memory_min_set:1;
+ bool default_memory_low_set:1;
+ bool default_startup_memory_low_set:1;
+ bool memory_min_set:1;
+ bool memory_low_set:1;
+ bool startup_memory_low_set:1;
+ bool startup_memory_high_set:1;
+ bool startup_memory_max_set:1;
+ bool startup_memory_swap_max_set:1;
+ bool startup_memory_zswap_max_set:1;
+
+ Set *ip_address_allow;
+ Set *ip_address_deny;
+ /* These two flags indicate that redundant entries have been removed from
+ * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
+ bool ip_address_allow_reduced;
+ bool ip_address_deny_reduced;
+
+ char **ip_filters_ingress;
+ char **ip_filters_egress;
+ LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
+
+ Set *restrict_network_interfaces;
+ bool restrict_network_interfaces_is_allow_list;
+
+ /* For legacy hierarchies */
+ uint64_t cpu_shares;
+ uint64_t startup_cpu_shares;
+
+ uint64_t blockio_weight;
+ uint64_t startup_blockio_weight;
+ LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
+ LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
+
+ uint64_t memory_limit;
+
+ CGroupDevicePolicy device_policy;
+ LIST_HEAD(CGroupDeviceAllow, device_allow);
+
+ LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
+ LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
+
+ /* Common */
+ CGroupTasksMax tasks_max;
+
+ /* Settings for systemd-oomd */
+ ManagedOOMMode moom_swap;
+ ManagedOOMMode moom_mem_pressure;
+ uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
+ ManagedOOMPreference moom_preference;
+
+ /* Memory pressure logic */
+ CGroupPressureWatch memory_pressure_watch;
+ usec_t memory_pressure_threshold_usec;
+ /* NB: For now we don't make the period configurable, nor the type, nor do we allow multiple
+ * triggers, nor triggers for non-memory pressure. We might add that later. */
+
+ NFTSetContext nft_set_context;
+
+ /* Forward coredumps for processes that crash within this cgroup.
+ * Requires 'delegate' to also be true. */
+ bool coredump_receive;
+};
+
+/* Used when querying IP accounting data */
+typedef enum CGroupIPAccountingMetric {
+ CGROUP_IP_INGRESS_BYTES,
+ CGROUP_IP_INGRESS_PACKETS,
+ CGROUP_IP_EGRESS_BYTES,
+ CGROUP_IP_EGRESS_PACKETS,
+ _CGROUP_IP_ACCOUNTING_METRIC_MAX,
+ _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIPAccountingMetric;
+
+/* Used when querying IO accounting data */
+typedef enum CGroupIOAccountingMetric {
+ CGROUP_IO_READ_BYTES,
+ CGROUP_IO_WRITE_BYTES,
+ CGROUP_IO_READ_OPERATIONS,
+ CGROUP_IO_WRITE_OPERATIONS,
+ _CGROUP_IO_ACCOUNTING_METRIC_MAX,
+ _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIOAccountingMetric;
+
+typedef enum CGroupMemoryAccountingMetric { /* Used when querying memory accounting data */
+ CGROUP_MEMORY_PEAK,
+ CGROUP_MEMORY_SWAP_PEAK,
+ /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
+ * when systemd-run exits. */
+ _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK,
+
+ /* These attributes are transient, so no need for caching. */
+ CGROUP_MEMORY_SWAP_CURRENT,
+ CGROUP_MEMORY_ZSWAP_CURRENT,
+
+ _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,
+ _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupMemoryAccountingMetric;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+typedef enum ManagerState ManagerState;
+
+uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
+
+void cgroup_context_init(CGroupContext *c);
+void cgroup_context_done(CGroupContext *c);
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
+
+static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
+ assert(c);
+
+ return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON || /* explicitly enabled, or … */
+ (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting); /* … auto, with memory accounting on */
+}
+
+int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
+
+void unit_modify_nft_set(Unit *u, bool add);
+
+CGroupMask unit_get_own_mask(Unit *u);
+CGroupMask unit_get_delegate_mask(Unit *u);
+CGroupMask unit_get_members_mask(Unit *u);
+CGroupMask unit_get_siblings_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
+
+CGroupMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_enable_mask(Unit *u);
+
+void unit_invalidate_cgroup_members_masks(Unit *u);
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u);
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
+int unit_default_cgroup_path(const Unit *u, char **ret);
+int unit_set_cgroup_path(Unit *u, const char *path);
+int unit_pick_cgroup_path(Unit *u);
+
+int unit_realize_cgroup(Unit *u);
+void unit_prune_cgroup(Unit *u);
+int unit_watch_cgroup(Unit *u);
+int unit_watch_cgroup_memory(Unit *u);
+void unit_add_to_cgroup_realize_queue(Unit *u);
+
+void unit_release_cgroup(Unit *u);
+/* Releases the cgroup only if it is recursively empty.
+ * Returns true if the cgroup was released, false otherwise. */
+bool unit_maybe_release_cgroup(Unit *u);
+
+void unit_add_to_cgroup_empty_queue(Unit *u);
+int unit_check_oomd_kill(Unit *u);
+int unit_check_oom(Unit *u);
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+
+int manager_setup_cgroup(Manager *m);
+void manager_shutdown_cgroup(Manager *m, bool delete);
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
+
+Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
+Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
+
+uint64_t unit_get_ancestor_memory_min(Unit *u);
+uint64_t unit_get_ancestor_memory_low(Unit *u);
+uint64_t unit_get_ancestor_startup_memory_low(Unit *u);
+
+int unit_search_main_pid(Unit *u, PidRef *ret);
+int unit_watch_all_pids(Unit *u);
+
+int unit_synthesize_cgroup_empty_event(Unit *u);
+
+int unit_get_memory_available(Unit *u, uint64_t *ret);
+int unit_get_memory_current(Unit *u, uint64_t *ret);
+int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret);
+int unit_get_tasks_current(Unit *u, uint64_t *ret);
+int unit_get_cpu_usage(Unit *u, nsec_t *ret);
+int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
+int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
+
+int unit_reset_cpu_accounting(Unit *u);
+void unit_reset_memory_accounting_last(Unit *u);
+int unit_reset_ip_accounting(Unit *u);
+void unit_reset_io_accounting_last(Unit *u);
+int unit_reset_io_accounting(Unit *u);
+int unit_reset_accounting(Unit *u);
+
+#define UNIT_CGROUP_BOOL(u, name) \
+ ({ \
+ CGroupContext *cc = unit_get_cgroup_context(u); \
+ cc ? cc->name : false; /* false when unit_get_cgroup_context() yields NULL */ \
+ })
+
+bool manager_owns_host_root_cgroup(Manager *m);
+bool unit_has_host_root_cgroup(Unit *u);
+
+bool unit_has_startup_cgroup_constraints(Unit *u);
+
+int manager_notify_cgroup_empty(Manager *m, const char *group);
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m);
+void unit_invalidate_cgroup_bpf(Unit *u);
+
+void manager_invalidate_startup_units(Manager *m);
+
+const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
+CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
+
+void unit_cgroup_catchup(Unit *u);
+
+bool unit_cgroup_delegate(Unit *u);
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
+
+const char* freezer_action_to_string(FreezerAction a) _const_;
+FreezerAction freezer_action_from_string(const char *s) _pure_;
+
+const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
+CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
+
+const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
+CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;
+
+const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
+CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
+CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
+CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;