/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once

#include <stdbool.h>

#include "bpf-lsm.h"
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "firewall-util.h"
#include "list.h"
#include "pidref.h"
#include "time-util.h"

typedef struct CGroupTasksMax {
        /* If scale == 0, just use value; otherwise, use the fraction value / scale.
         * See cgroup_tasks_max_resolve(). */
        uint64_t value;
        uint64_t scale;
} CGroupTasksMax;

#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })

static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
        return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}

uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
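
/* Illustrative sketch only, not the actual implementation (which lives in cgroup.c): if .scale is
 * zero the value is taken literally, otherwise value/scale is assumed to be a fraction applied to
 * some system-wide task maximum. system_tasks_max_example() is a hypothetical helper, used here
 * only to keep the sketch self-contained; overflow handling is omitted:
 *
 *   uint64_t resolve_sketch(const CGroupTasksMax *t) {
 *           if (t->scale == 0)
 *                   return t->value;
 *           return system_tasks_max_example() * t->value / t->scale;
 *   }
 */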

typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
typedef struct CGroupSocketBindItem CGroupSocketBindItem;

typedef enum CGroupDevicePolicy {
        /* When devices are listed, allow those plus the built-in ones; if none are listed,
         * allow everything. */
        CGROUP_DEVICE_POLICY_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_DEVICE_POLICY_CLOSED,

        /* Everything forbidden, except for the listed devices. */
        CGROUP_DEVICE_POLICY_STRICT,

        _CGROUP_DEVICE_POLICY_MAX,
        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
} CGroupDevicePolicy;
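
/* For illustration only: configuration parsing can be expected to map the DevicePolicy= unit
 * setting onto this enum via cgroup_device_policy_from_string() (declared below), which returns
 * _CGROUP_DEVICE_POLICY_INVALID (i.e. a negative value) for unknown strings:
 *
 *   CGroupDevicePolicy p = cgroup_device_policy_from_string("closed");
 *   if (p < 0)
 *           ... reject the unknown policy string ...
 */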

typedef enum FreezerAction {
        FREEZER_FREEZE,
        FREEZER_THAW,

        _FREEZER_ACTION_MAX,
        _FREEZER_ACTION_INVALID = -EINVAL,
} FreezerAction;

typedef enum CGroupDevicePermissions {
        /* We reuse the same bit meanings as the kernel's BPF_DEVCG_ACC_xyz definitions */
        CGROUP_DEVICE_MKNOD                = 1 << 0,
        CGROUP_DEVICE_READ                 = 1 << 1,
        CGROUP_DEVICE_WRITE                = 1 << 2,
        _CGROUP_DEVICE_PERMISSIONS_MAX     = 1 << 3,
        _CGROUP_DEVICE_PERMISSIONS_ALL     = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
} CGroupDevicePermissions;
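
/* Since these are bit flags, they compose with |. For illustration (mirroring the r/w/m notation
 * used by DeviceAllow= in unit files; variable names are example values only):
 *
 *   CGroupDevicePermissions rw  = CGROUP_DEVICE_READ | CGROUP_DEVICE_WRITE;
 *   CGroupDevicePermissions rwm = _CGROUP_DEVICE_PERMISSIONS_ALL;   (equals rw | CGROUP_DEVICE_MKNOD)
 *
 * A matching DeviceAllow= entry could then be registered with, e.g.,
 * cgroup_context_add_device_allow(c, "/dev/net/tun", rw) (see the declaration below).
 */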

struct CGroupDeviceAllow {
        LIST_FIELDS(CGroupDeviceAllow, device_allow);
        char *path;
        CGroupDevicePermissions permissions;
};

struct CGroupIODeviceWeight {
        LIST_FIELDS(CGroupIODeviceWeight, device_weights);
        char *path;
        uint64_t weight;
};

struct CGroupIODeviceLimit {
        LIST_FIELDS(CGroupIODeviceLimit, device_limits);
        char *path;
        uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
};

struct CGroupIODeviceLatency {
        LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
        char *path;
        usec_t target_usec;
};

struct CGroupBlockIODeviceWeight {
        LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
        char *path;
        uint64_t weight;
};

struct CGroupBlockIODeviceBandwidth {
        LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
        char *path;
        uint64_t rbps;
        uint64_t wbps;
};

struct CGroupBPFForeignProgram {
        LIST_FIELDS(CGroupBPFForeignProgram, programs);
        uint32_t attach_type;
        char *bpffs_path;
};

struct CGroupSocketBindItem {
        LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
        int address_family;
        int ip_protocol;
        uint16_t nr_ports;
        uint16_t port_min;
};

typedef enum CGroupPressureWatch {
        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory accounting is enabled for the unit anyway, otherwise off */
        CGROUP_PRESSURE_WATCH_ON,
        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up a memory pressure watch, but also doesn't explicitly tell the payload to avoid it */
        _CGROUP_PRESSURE_WATCH_MAX,
        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
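
/* The AUTO case is resolved at runtime; cgroup_context_want_memory_pressure() below boils down to
 * roughly:
 *
 *   watch == CGROUP_PRESSURE_WATCH_ON ||
 *           (watch == CGROUP_PRESSURE_WATCH_AUTO && memory_accounting)
 */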

struct CGroupContext {
        bool cpu_accounting;
        bool io_accounting;
        bool blockio_accounting;
        bool memory_accounting;
        bool tasks_accounting;
        bool ip_accounting;

        /* Configures the memory.oom.group attribute (on the unified hierarchy) */
        bool memory_oom_group;

        bool delegate;
        CGroupMask delegate_controllers;
        CGroupMask disable_controllers;
        char *delegate_subgroup;

        /* For unified hierarchy */
        uint64_t cpu_weight;
        uint64_t startup_cpu_weight;
        usec_t cpu_quota_per_sec_usec;
        usec_t cpu_quota_period_usec;

        CPUSet cpuset_cpus;
        CPUSet startup_cpuset_cpus;
        CPUSet cpuset_mems;
        CPUSet startup_cpuset_mems;

        uint64_t io_weight;
        uint64_t startup_io_weight;
        LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
        LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
        LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);

        uint64_t default_memory_min;
        uint64_t default_memory_low;
        uint64_t default_startup_memory_low;
        uint64_t memory_min;
        uint64_t memory_low;
        uint64_t startup_memory_low;
        uint64_t memory_high;
        uint64_t startup_memory_high;
        uint64_t memory_max;
        uint64_t startup_memory_max;
        uint64_t memory_swap_max;
        uint64_t startup_memory_swap_max;
        uint64_t memory_zswap_max;
        uint64_t startup_memory_zswap_max;

        bool default_memory_min_set:1;
        bool default_memory_low_set:1;
        bool default_startup_memory_low_set:1;
        bool memory_min_set:1;
        bool memory_low_set:1;
        bool startup_memory_low_set:1;
        bool startup_memory_high_set:1;
        bool startup_memory_max_set:1;
        bool startup_memory_swap_max_set:1;
        bool startup_memory_zswap_max_set:1;
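
        /* The *_set:1 bits above record whether the corresponding limit was configured explicitly,
         * which presumably lets callers fall back to a default otherwise (cf.
         * unit_get_ancestor_memory_min() and related helpers declared below), roughly:
         *
         *   uint64_t min = c->memory_min_set ? c->memory_min : <some inherited default>;
         */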

        Set *ip_address_allow;
        Set *ip_address_deny;
        /* These two flags indicate that redundant entries have been removed from
         * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
        bool ip_address_allow_reduced;
        bool ip_address_deny_reduced;

        char **ip_filters_ingress;
        char **ip_filters_egress;
        LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);

        Set *restrict_network_interfaces;
        bool restrict_network_interfaces_is_allow_list;

        /* For legacy hierarchies */
        uint64_t cpu_shares;
        uint64_t startup_cpu_shares;

        uint64_t blockio_weight;
        uint64_t startup_blockio_weight;
        LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
        LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);

        uint64_t memory_limit;

        CGroupDevicePolicy device_policy;
        LIST_HEAD(CGroupDeviceAllow, device_allow);

        LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
        LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);

        /* Common */
        CGroupTasksMax tasks_max;

        /* Settings for systemd-oomd */
        ManagedOOMMode moom_swap;
        ManagedOOMMode moom_mem_pressure;
        uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
        ManagedOOMPreference moom_preference;
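
        /* Given the 2^32-1 == 100% normalization above, a configured limit of e.g. 60% would be
         * stored as (illustrative arithmetic only):
         *
         *   moom_mem_pressure_limit = (uint32_t) (UINT32_MAX * 60ULL / 100);
         */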

        /* Memory pressure logic */
        CGroupPressureWatch memory_pressure_watch;
        usec_t memory_pressure_threshold_usec;
        /* NB: For now we don't make the period configurable, nor the type, nor do we allow multiple
         * triggers, nor triggers for non-memory pressure. We might add that later. */

        NFTSetContext nft_set_context;

        /* Forward coredumps for processes that crash within this cgroup.
         * Requires 'delegate' to also be true. */
        bool coredump_receive;
};

/* Used when querying IP accounting data */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIPAccountingMetric;

/* Used when querying IO accounting data */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupIOAccountingMetric;

typedef enum CGroupMemoryAccountingMetric {
        CGROUP_MEMORY_PEAK,
        CGROUP_MEMORY_SWAP_PEAK,
        /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
         * when systemd-run exits. */
        _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK,

        /* These attributes are transient, so no need for caching. */
        CGROUP_MEMORY_SWAP_CURRENT,
        CGROUP_MEMORY_ZSWAP_CURRENT,

        _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,
        _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
} CGroupMemoryAccountingMetric;
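
/* For illustration, the _CACHED_LAST marker allows iterating over just the cached metrics, e.g.
 * (hypothetical loop, 'u' being a Unit*):
 *
 *   for (CGroupMemoryAccountingMetric m = 0; m <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; m++) {
 *           uint64_t v;
 *           (void) unit_get_memory_accounting(u, m, &v);
 *   }
 */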

typedef struct Unit Unit;
typedef struct Manager Manager;
typedef enum ManagerState ManagerState;

uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);

usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);

void cgroup_context_init(CGroupContext *c);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);

void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);

static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
        assert(c);

        return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON ||
                (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting);
}

int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);

void unit_modify_nft_set(Unit *u, bool add);

CGroupMask unit_get_own_mask(Unit *u);
CGroupMask unit_get_delegate_mask(Unit *u);
CGroupMask unit_get_members_mask(Unit *u);
CGroupMask unit_get_siblings_mask(Unit *u);
CGroupMask unit_get_ancestor_disable_mask(Unit *u);

CGroupMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_enable_mask(Unit *u);

void unit_invalidate_cgroup_members_masks(Unit *u);

void unit_add_family_to_cgroup_realize_queue(Unit *u);

const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
int unit_default_cgroup_path(const Unit *u, char **ret);
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_pick_cgroup_path(Unit *u);

int unit_realize_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
int unit_watch_cgroup_memory(Unit *u);
void unit_add_to_cgroup_realize_queue(Unit *u);

void unit_release_cgroup(Unit *u);
/* Releases the cgroup only if it is recursively empty.
 * Returns true if the cgroup was released, false otherwise. */
bool unit_maybe_release_cgroup(Unit *u);

void unit_add_to_cgroup_empty_queue(Unit *u);
int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);

int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);

int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);

unsigned manager_dispatch_cgroup_realize_queue(Manager *m);

Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);

uint64_t unit_get_ancestor_memory_min(Unit *u);
uint64_t unit_get_ancestor_memory_low(Unit *u);
uint64_t unit_get_ancestor_startup_memory_low(Unit *u);

int unit_search_main_pid(Unit *u, PidRef *ret);
int unit_watch_all_pids(Unit *u);

int unit_synthesize_cgroup_empty_event(Unit *u);

int unit_get_memory_available(Unit *u, uint64_t *ret);
int unit_get_memory_current(Unit *u, uint64_t *ret);
int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret);
int unit_get_tasks_current(Unit *u, uint64_t *ret);
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);

int unit_reset_cpu_accounting(Unit *u);
void unit_reset_memory_accounting_last(Unit *u);
int unit_reset_ip_accounting(Unit *u);
void unit_reset_io_accounting_last(Unit *u);
int unit_reset_io_accounting(Unit *u);
int unit_reset_accounting(Unit *u);

#define UNIT_CGROUP_BOOL(u, name)                       \
        ({                                              \
        CGroupContext *cc = unit_get_cgroup_context(u); \
        cc ? cc->name : false;                          \
        })
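
/* Example usage (illustrative; 'delegate' is one of the bool members of CGroupContext):
 *
 *   if (UNIT_CGROUP_BOOL(u, delegate))
 *           ...
 *
 * i.e. the macro safely evaluates to false when the unit has no cgroup context at all. */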

bool manager_owns_host_root_cgroup(Manager *m);
bool unit_has_host_root_cgroup(Unit *u);

bool unit_has_startup_cgroup_constraints(Unit *u);

int manager_notify_cgroup_empty(Manager *m, const char *group);

void unit_invalidate_cgroup(Unit *u, CGroupMask m);
void unit_invalidate_cgroup_bpf(Unit *u);

void manager_invalidate_startup_units(Manager *m);

const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;

void unit_cgroup_catchup(Unit *u);

bool unit_cgroup_delegate(Unit *u);

int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
int unit_cgroup_freezer_action(Unit *u, FreezerAction action);

const char* freezer_action_to_string(FreezerAction a) _const_;
FreezerAction freezer_action_from_string(const char *s) _pure_;

const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;

const char* cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;

const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;

const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;

const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;