src/nsresourced/userns-restrict.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346

/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include "userns-restrict.h"

#if HAVE_VMLINUX_H

#include <sched.h>

#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "fd-util.h"
#include "fs-util.h"
#include "lsm-util.h"
#include "missing_mount.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"

/* Sizing limits for the BPF maps: USERNS_MAX is used as the entry count of the outer userns_mnt_id_hash
 * map (and, multiplied by sizeof(unsigned int), as the size of the userns ring buffer); MOUNTS_MAX is the
 * entry count of each inner hash map created by make_inner_hash_map(). */
#define USERNS_MAX (16U*1024U)
#define MOUNTS_MAX 4096U

/* Directories below which the program links and the maps are pinned when pinning is requested. */
#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"

/* Destroys the given BPF skeleton object and always returns NULL, so that callers can write
 * "obj = userns_restrict_bpf_free(obj);". Passing NULL is permitted. */
struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
        (void) userns_restrict_bpf__destroy(obj); /* this call is fine with NULL */
        return NULL;
}

/* Allocates a new BPF hash map suitable for use as an inner map of the userns_mnt_id_hash map-of-maps:
 * int-sized keys (mount IDs), uint32_t-sized values, and room for up to MOUNTS_MAX entries.
 *
 * Returns the file descriptor of the new map on success (the caller owns it and must close it), or a
 * negative errno-style error code on failure. */
static int make_inner_hash_map(void) {
        int fd;

        fd = compat_bpf_map_create(
                        BPF_MAP_TYPE_HASH,
                        NULL,
                        sizeof(int),
                        sizeof(uint32_t),
                        MOUNTS_MAX,
                        NULL);
        if (fd < 0)
                return log_debug_errno(errno, "Failed to allocate inner BPF map: %m");

        return fd;
}

/* Opens, configures, loads and attaches the userns-restrict BPF-LSM programs.
 *
 * pin: if true, the maps are pinned below MAP_LINK_PREFIX and the program links below
 *      PROGRAM_LINK_PREFIX; program links that are already pinned there are reused instead of
 *      attaching the program a second time.
 * ret: if non-NULL, receives the loaded skeleton object on success.
 *
 * Returns 0 on success, or a negative errno-style error code on failure (-EOPNOTSUPP if the "bpf" LSM
 * is not supported or libbpf lacks bpf_object__next_map()). */
int userns_restrict_install(
                bool pin,
                struct userns_restrict_bpf **ret) {

        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
        _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
        int r;

        /* Refuse early if the "bpf" LSM is not available */
        r = lsm_supported("bpf");
        if (r < 0)
                return r;
        if (r == 0)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");

        r = dlopen_bpf();
        if (r < 0)
                return r;

        /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
        if (!sym_bpf_object__next_map)
                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");

        obj = userns_restrict_bpf__open();
        if (!obj)
                return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");

        if (pin) {
                struct bpf_map *map;

                /* libbpf will only create one level of dirs. Let's create the rest */
                (void) mkdir_p(MAP_LINK_PREFIX, 0755);
                (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);

                /* Assign each map of the object a pin path below MAP_LINK_PREFIX, named after the map */
                map = sym_bpf_object__next_map(obj->obj, NULL);
                while (map) {
                        _cleanup_free_ char *fn = NULL;

                        fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
                        if (!fn)
                                return log_oom();

                        r = sym_bpf_map__set_pin_path(map, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);

                        map = sym_bpf_object__next_map(obj->obj, map);
                }
        }

        /* Apply the final sizes to the maps before the object is loaded */
        r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
        if (r < 0)
                return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");

        r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
        if (r < 0)
                return log_error_errno(r, "Failed to size userns ring buffer: %m");

        /* Dummy map to satisfy the verifier */
        dummy_mnt_id_hash_fd = make_inner_hash_map();
        if (dummy_mnt_id_hash_fd < 0)
                return dummy_mnt_id_hash_fd;

        r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to set inner BPF map: %m");

        r = userns_restrict_bpf__load(obj);
        if (r < 0)
                return log_error_errno(r, "Failed to load BPF object: %m");

        /* Attach every program of the skeleton, reusing an already pinned link if there is one */
        for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
                _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
                struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
                _cleanup_free_ char *fn = NULL;
                bool linked = false;
                const char *e;

                /* All programs are expected to carry the "userns_restrict_" prefix; the remainder of the
                 * name is used as the file name of the pinned link. */
                e = startswith(ps->name, "userns_restrict_");
                assert(e);

                if (pin) {
                        fn = path_join(PROGRAM_LINK_PREFIX, e);
                        if (!fn)
                                return log_oom();

                        /* Try to pick up a previously pinned link; -ENOENT just means there is none yet */
                        link = sym_bpf_link__open(fn);
                        r = bpf_get_error_translated(link);
                        if (r < 0) {
                                if (r != -ENOENT)
                                        return log_error_errno(r, "Unable to open pinned program link: %m");
                                link = NULL;
                        } else {
                                linked = true;
                                log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
                        }
                }

                /* No pinned link found (or pinning not requested): attach the program now */
                if (!link) {
                        link = sym_bpf_program__attach(*ps->prog);
                        r = bpf_get_error_translated(link);
                        if (r < 0)
                                return log_error_errno(r, "Failed to attach LSM BPF program: %m");

                        log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
                }

                /* Only pin links we created ourselves; a link opened from its pin is pinned already */
                if (pin && !linked) {
                        assert(fn);

                        r = sym_bpf_link__pin(link, fn);
                        if (r < 0)
                                return log_error_errno(r, "Failed to pin LSM attachment: %m");
                }

                /* Store the link in the skeleton's slot for this program */
                *ps->link = TAKE_PTR(link);
        }

        if (pin) {
                /* A NULL path is passed here; the per-map pin paths were configured above */
                r = sym_bpf_object__pin_maps(obj->obj, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to pin BPF maps: %m");
        }

        if (ret)
                *ret = TAKE_PTR(obj);

        return 0;
}

/* Allows the mounts referenced by mount_fds[] for the user namespace identified by userns_inode, by
 * adding their mount IDs to the inner hash map stored under that inode in the userns_mnt_id_hash
 * map-of-maps.
 *
 * obj:          the loaded userns-restrict BPF skeleton
 * userns_inode: inode number of the user namespace; must be non-zero and fit into 32 bits
 * replace:      if true, a fresh inner map is installed for the inode (added if missing, replacing any
 *               existing one); if false, an existing inner map is extended, and one is created only if
 *               none exists yet
 * mount_fds:    file descriptors whose mount IDs shall be allowed (may be NULL if n_mount_fds is 0)
 * n_mount_fds:  number of entries in mount_fds[]
 *
 * Returns 0 on success, or a negative errno-style error code: -EINVAL if the inode does not fit into 32
 * bits, -ENOMEM on allocation failure, -EEXIST if creating the entry kept racing for 10 attempts, or
 * whatever the underlying mount ID/BPF calls report. */
int userns_restrict_put_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t userns_inode,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        _cleanup_close_ int inner_map_fd = -EBADF;
        _cleanup_free_ int *mnt_ids = NULL;
        int r, outer_map_fd;
        uint32_t ino;

        assert(obj);
        assert(userns_inode != 0);
        assert(n_mount_fds == 0 || mount_fds);

        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
         * inode let's refuse early, we can't support this with the current BPF code for now. */
        if (userns_inode > UINT32_MAX)
                return -EINVAL;

        /* The key handed to the kernel must really be a 32bit object: pointing it at a 64bit variable
         * would make big-endian machines read the (always zero) upper half instead of the inode number.
         * This also matches the key type userns_restrict_reset_by_inode() uses for deletion. */
        ino = (uint32_t) userns_inode;

        mnt_ids = new(int, n_mount_fds);
        if (!mnt_ids)
                return -ENOMEM;

        /* Resolve all mount IDs up front, so that we don't touch the maps if any of the fds is bad */
        for (size_t i = 0; i < n_mount_fds; i++) {
                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
                if (r < 0)
                        return log_debug_errno(r, "Failed to get mount ID: %m");
        }

        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (outer_map_fd < 0)
                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");

        if (replace) {
                /* Add if missing, replace if already exists */
                inner_map_fd = make_inner_hash_map();
                if (inner_map_fd < 0)
                        return inner_map_fd;

                r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(r, "Failed to replace map in inode hash: %m");
        } else {
                /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
                 * might race against each other, hence we try a couple of times */
                for (size_t n_try = 10;; n_try--) {
                        uint32_t innermap_id;

                        if (n_try == 0)
                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
                                                       "Still can't create inode entry in BPF map after 10 tries.");

                        /* Looking up an entry in the outer map yields the ID of the inner map */
                        r = sym_bpf_map_lookup_elem(outer_map_fd, &ino, &innermap_id);
                        if (r >= 0) {
                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
                                if (inner_map_fd < 0)
                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");

                                break;
                        }
                        if (errno != ENOENT)
                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");

                        /* No entry for this user namespace yet. Let's create one */
                        inner_map_fd = make_inner_hash_map();
                        if (inner_map_fd < 0)
                                return inner_map_fd;

                        r = sym_bpf_map_update_elem(outer_map_fd, &ino, &inner_map_fd, BPF_NOEXIST);
                        if (r >= 0)
                                break;
                        if (errno != EEXIST)
                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");

                        /* Somebody else added an entry in the meantime. Release the map we just created
                         * before the next iteration overwrites the variable, otherwise we'd leak one fd
                         * per lost race. */
                        inner_map_fd = safe_close(inner_map_fd);
                }
        }

        /* Finally, mark every mount ID as allowed in the inner map. The value is irrelevant, only the
         * presence of the key is what we store here. */
        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
                uint32_t dummy_value = 1;

                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
                if (r < 0)
                        return log_debug_errno(r, "Failed to add mount ID to map: %m");

                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, userns_inode);
        }

        return 0;
}

/* Same as userns_restrict_put_by_inode(), but identifies the user namespace by a file descriptor
 * instead of by inode number: verifies that userns_fd refers to a user namespace, determines its inode
 * number via fstat(), and forwards the remaining arguments unchanged.
 *
 * Returns a negative errno-style error code if the fd cannot be checked or stat'ed, -EBADF if it is not
 * a user namespace fd, and otherwise whatever userns_restrict_put_by_inode() returns. */
int userns_restrict_put_by_fd(
                struct userns_restrict_bpf *obj,
                int userns_fd,
                bool replace,
                const int mount_fds[],
                size_t n_mount_fds) {

        struct stat ns_stat;
        int is_userns;

        assert(obj);
        assert(userns_fd >= 0);
        assert(n_mount_fds == 0 || mount_fds);

        /* Insist that what we got passed is really a user namespace */
        is_userns = fd_is_ns(userns_fd, CLONE_NEWUSER);
        if (is_userns < 0)
                return log_debug_errno(is_userns, "Failed to determine if file descriptor is user namespace: %m");
        if (is_userns == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");

        /* The inode number of the namespace fd is what identifies the namespace in the BPF map */
        if (fstat(userns_fd, &ns_stat) < 0)
                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");

        return userns_restrict_put_by_inode(obj, ns_stat.st_ino, replace, mount_fds, n_mount_fds);
}

/* Drops the entry for the user namespace with the given inode number from the userns_mnt_id_hash outer
 * map.
 *
 * Returns 0 on success, and also when the inode number does not fit into 32 bits (such an inode cannot
 * be in the map). Returns a negative errno-style error code if the map fd cannot be acquired or the
 * deletion fails. */
int userns_restrict_reset_by_inode(
                struct userns_restrict_bpf *obj,
                uint64_t ino) {

        int map_fd, r;
        uint32_t key;

        assert(obj);
        assert(ino != 0);

        /* Keys of the outer map are 32bit wide, hence anything larger can't possibly be stored in it:
         * nothing to do then. */
        if (ino > UINT32_MAX)
                return 0;

        /* Safe to narrow now that we know the value fits */
        key = (uint32_t) ino;

        map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
        if (map_fd < 0)
                return log_debug_errno(map_fd, "Failed to get outer BPF map fd: %m");

        r = sym_bpf_map_delete_elem(map_fd, &key);
        if (r < 0)
                return log_debug_errno(r, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);

        return 0;
}

#else
/* Fallback for builds without HAVE_VMLINUX_H: logs an error and fails with a synthetic EOPNOTSUPP. */
int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}

/* Fallback for builds without HAVE_VMLINUX_H: there is nothing to release, simply returns NULL. */
struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
        return NULL;
}

/* Fallback for builds without HAVE_VMLINUX_H: logs an error and fails with a synthetic EOPNOTSUPP. */
int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}

/* Fallback for builds without HAVE_VMLINUX_H: logs an error and fails with a synthetic EOPNOTSUPP. */
int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}

/* Fallback for builds without HAVE_VMLINUX_H: logs an error and fails with a synthetic EOPNOTSUPP. */
int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
}
#endif