diff options
Diffstat (limited to 'src/core/cgroup.h')
-rw-r--r-- | src/core/cgroup.h | 132 |
1 files changed, 128 insertions, 4 deletions
diff --git a/src/core/cgroup.h b/src/core/cgroup.h index f1b674b..72fe275 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -3,7 +3,10 @@ #include <stdbool.h> -#include "bpf-lsm.h" +#include "sd-event.h" + +#include "bpf-program.h" +#include "bpf-restrict-fs.h" #include "cgroup-util.h" #include "cpu-set-util.h" #include "firewall-util.h" @@ -35,6 +38,7 @@ typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram; typedef struct CGroupSocketBindItem CGroupSocketBindItem; +typedef struct CGroupRuntime CGroupRuntime; typedef enum CGroupDevicePolicy { /* When devices listed, will allow those, plus built-in ones, if none are listed will allow @@ -53,7 +57,9 @@ typedef enum CGroupDevicePolicy { typedef enum FreezerAction { FREEZER_FREEZE, + FREEZER_PARENT_FREEZE, FREEZER_THAW, + FREEZER_PARENT_THAW, _FREEZER_ACTION_MAX, _FREEZER_ACTION_INVALID = -EINVAL, @@ -129,6 +135,9 @@ typedef enum CGroupPressureWatch { _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, } CGroupPressureWatch; +/* The user-supplied cgroup-related configuration options. This remains mostly immutable while the service + * manager is running (except for an occasional SetProperty() configuration change), outside of reload + * cycles. When adding members make sure to update cgroup_context_copy() accordingly. 
*/ struct CGroupContext { bool cpu_accounting; bool io_accounting; @@ -188,6 +197,8 @@ struct CGroupContext { bool startup_memory_swap_max_set:1; bool startup_memory_zswap_max_set:1; + bool memory_zswap_writeback; + Set *ip_address_allow; Set *ip_address_deny; /* These two flags indicate that redundant entries have been removed from @@ -276,6 +287,95 @@ typedef enum CGroupMemoryAccountingMetric { _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL, } CGroupMemoryAccountingMetric; +/* Used for limits whose value sets have an infimum */ +typedef enum CGroupLimitType { + CGROUP_LIMIT_MEMORY_MAX, + CGROUP_LIMIT_MEMORY_HIGH, + CGROUP_LIMIT_TASKS_MAX, + _CGROUP_LIMIT_TYPE_MAX, + _CGROUP_LIMIT_INVALID = -EINVAL, +} CGroupLimitType; + +/* The dynamic, regularly updated information about a unit that has a realized cgroup. This is only allocated when a unit is first realized */ +typedef struct CGroupRuntime { + /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */ + nsec_t cpu_usage_base; + nsec_t cpu_usage_last; /* the most recently read value */ + + /* Most recently read value of memory accounting metrics */ + uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1]; + + /* The current counter of OOM kills initiated by systemd-oomd */ + uint64_t managed_oom_kill_last; + + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + uint64_t oom_kill_last; + + /* Where the io.stat data was at the time the unit was started */ + uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; + uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */ + + /* Counterparts in the cgroup filesystem */ + char *cgroup_path; + uint64_t cgroup_id; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? 
(only relevant on cgroup v1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */ + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ + + /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */ + int cgroup_control_inotify_wd; + int cgroup_memory_inotify_wd; + + /* Device Controller BPF program */ + BPFProgram *bpf_device_control_installed; + + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed; + BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed; + + Set *ip_bpf_custom_ingress; + Set *ip_bpf_custom_ingress_installed; + Set *ip_bpf_custom_egress; + Set *ip_bpf_custom_egress_installed; + + /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd, + * attached to unit cgroup by provided program fd and attach type. */ + Hashmap *bpf_foreign_by_key; + + FDSet *initial_socket_bind_link_fds; +#if BPF_FRAMEWORK + /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and + * responsible for allowing or denying a unit to bind(2) to a socket + * address. 
*/ + struct bpf_link *ipv4_socket_bind_link; + struct bpf_link *ipv6_socket_bind_link; +#endif + + FDSet *initial_restrict_ifaces_link_fds; +#if BPF_FRAMEWORK + struct bpf_link *restrict_ifaces_ingress_bpf_link; + struct bpf_link *restrict_ifaces_egress_bpf_link; +#endif + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; + + /* Reset cgroup accounting next time we fork something off */ + bool reset_accounting:1; + + /* Whether we warned about clamping the CPU quota period */ + bool warned_clamping_cpu_quota_period:1; +} CGroupRuntime; + typedef struct Unit Unit; typedef struct Manager Manager; typedef enum ManagerState ManagerState; @@ -285,6 +385,7 @@ uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state); usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period); void cgroup_context_init(CGroupContext *c); +int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src); void cgroup_context_done(CGroupContext *c); void cgroup_context_dump(Unit *u, FILE* f, const char *prefix); void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f); @@ -309,6 +410,17 @@ static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) { int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p); int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path); +static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext *c, const CGroupBPFForeignProgram *p) { + return cgroup_context_add_bpf_foreign_program(c, p->attach_type, p->bpffs_path); +} +int cgroup_context_add_io_device_limit_dup(CGroupContext *c, const CGroupIODeviceLimit *l); +int cgroup_context_add_io_device_weight_dup(CGroupContext *c, const CGroupIODeviceWeight *w); +int 
cgroup_context_add_io_device_latency_dup(CGroupContext *c, const CGroupIODeviceLatency *l); +int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, const CGroupBlockIODeviceWeight *w); +int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, const CGroupBlockIODeviceBandwidth *b); +int cgroup_context_add_device_allow_dup(CGroupContext *c, const CGroupDeviceAllow *a); +int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, const CGroupSocketBindItem *i); +int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, const CGroupSocketBindItem *i); void unit_modify_nft_set(Unit *u, bool add); @@ -336,6 +448,7 @@ int unit_watch_cgroup(Unit *u); int unit_watch_cgroup_memory(Unit *u); void unit_add_to_cgroup_realize_queue(Unit *u); +int unit_cgroup_is_empty(Unit *u); void unit_release_cgroup(Unit *u); /* Releases the cgroup only if it is recursively empty. * Returns true if the cgroup was released, false otherwise. */ @@ -353,9 +466,9 @@ void manager_shutdown_cgroup(Manager *m, bool delete); unsigned manager_dispatch_cgroup_realize_queue(Manager *m); Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup); -Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid); -Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid); -Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid); +Unit *manager_get_unit_by_pidref_cgroup(Manager *m, const PidRef *pid); +Unit *manager_get_unit_by_pidref_watching(Manager *m, const PidRef *pid); +Unit* manager_get_unit_by_pidref(Manager *m, const PidRef *pid); Unit* manager_get_unit_by_pid(Manager *m, pid_t pid); uint64_t unit_get_ancestor_memory_min(Unit *u); @@ -374,6 +487,7 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret); int unit_get_cpu_usage(Unit *u, nsec_t *ret); int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret); int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); +int 
unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret); int unit_reset_cpu_accounting(Unit *u); void unit_reset_memory_accounting_last(Unit *u); @@ -413,6 +527,13 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action); const char* freezer_action_to_string(FreezerAction a) _const_; FreezerAction freezer_action_from_string(const char *s) _pure_; +CGroupRuntime *cgroup_runtime_new(void); +CGroupRuntime *cgroup_runtime_free(CGroupRuntime *crt); +DEFINE_TRIVIAL_CLEANUP_FUNC(CGroupRuntime*, cgroup_runtime_free); + +int cgroup_runtime_serialize(Unit *u, FILE *f, FDSet *fds); +int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, FDSet *fds); + const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_; CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_; @@ -425,5 +546,8 @@ CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_; CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_; +const char* cgroup_effective_limit_type_to_string(CGroupLimitType m) _const_; +CGroupLimitType cgroup_effective_limit_type_from_string(const char *s) _pure_; + const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_; CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_; |