summaryrefslogtreecommitdiffstats
path: root/src/nsresourced/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'src/nsresourced/bpf')
-rw-r--r--src/nsresourced/bpf/userns_restrict/meson.build25
-rw-r--r--src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h17
-rw-r--r--src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c179
3 files changed, 221 insertions, 0 deletions
diff --git a/src/nsresourced/bpf/userns_restrict/meson.build b/src/nsresourced/bpf/userns_restrict/meson.build
new file mode 100644
index 0000000..d773c75
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/meson.build
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('HAVE_VMLINUX_H') != 1 # nothing below is built unless HAVE_VMLINUX_H is set to 1
+ subdir_done()
+endif
+
+userns_restrict_bpf_o_unstripped = custom_target( # step 1: run bpf_o_unstripped_cmd (defined elsewhere) on the BPF C source
+ 'userns-restrict.bpf.unstripped.o',
+ input : 'userns-restrict.bpf.c',
+ output : 'userns-restrict.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd,
+ depends : vmlinux_h_dependency) # the source includes "vmlinux.h", so that has to be available first
+
+userns_restrict_bpf_o = custom_target( # step 2: run bpf_o_cmd (defined elsewhere) on the unstripped object; presumably strips it, confirm against bpf_o_cmd
+ 'userns-restrict.bpf.o',
+ input : userns_restrict_bpf_o_unstripped,
+ output : 'userns-restrict.bpf.o',
+ command : bpf_o_cmd)
+
+userns_restrict_skel_h = custom_target( # step 3: run skel_h_cmd (defined elsewhere) on the final object to produce the skeleton header
+ 'userns-restrict.skel.h',
+ input : userns_restrict_bpf_o,
+ output : 'userns-restrict.skel.h',
+ command : skel_h_cmd,
+ capture : true) # the command's stdout is what gets written to the output file
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h
new file mode 100644
index 0000000..271caf4
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "bpf-dlopen.h"
+
+/* libbpf is used via dlopen(), so rename the skeleton API symbols to their sym_ counterparts before the generated skeleton header below is included */
+#define bpf_object__attach_skeleton sym_bpf_object__attach_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+
+#include "bpf/userns_restrict/userns-restrict.skel.h"
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c
new file mode 100644
index 0000000..126422b
--- /dev/null
+++ b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* If offsetof() is implemented via __builtin_offsetof() then it doesn't work on current compilers, since the
+ * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to provide
+ * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0
+ * it does not.) */
+#undef offsetof
+#undef container_of
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <errno.h>
+
+#ifndef bpf_core_cast
+/* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with
+ * the definition of the bpf_core_cast macro. So use the presence of that macro
+ * to tell whether we still have to declare a prototype for bpf_rdonly_cast ourselves. */
+void *bpf_rdonly_cast(void *, __u32) __ksym;
+#endif
+
+/* BPF module that implements an allowlist of mounts (identified by mount ID) for user namespaces (identified
+ * by their inode number in nsfs) that restricts creation of inodes (which would inherit the caller's UID/GID)
+ * or changing of ownership (similar).
+ *
+ * This hooks into the various path-based LSM entrypoints that control inode creation as well as chown(), and
+ * then looks up the calling process' user namespace in a global map of namespaces, which points us to
+ * another map that is simply a list of allowed mnt_ids. */
+
+// FIXME: ACL adjustments are currently not blocked. There's no path-based LSM hook available in the kernel
+// for setting xattrs or ACLs, hence we cannot easily block them, even though we want that. We can get away
+// with ignoring this for now, as ACLs never define ownership, but purely access: i.e. ACLs never allow
+// taking possession of an object, but only control access to it. Thus, things like suid access modes should
+// not be reachable through it. It still sucks though that a user can persistently add an ACL entry to a file
+// with their transient UIDs/GIDs.
+
+/* The kernel currently enforces a maximum user namespace nesting depth of 32, see create_user_ns() in the kernel sources */
+#define USER_NAMESPACE_DEPTH_MAX 32U
+
+struct mnt_id_map { /* layout of the inner maps stored in userns_mnt_id_hash: a hash map from int to int */
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */
+ __type(key, int); /* looked up with a mount ID, see validate_inode_on_mount() */
+ __type(value, int); /* never read in this file, only the presence of the key is checked */
+};
+
+struct { /* outer map: user namespace inode number -> inner map holding the mount IDs allowed for that namespace */
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */
+ __type(key, unsigned); /* userns inode */
+ __array(values, struct mnt_id_map); /* every value is a map shaped like struct mnt_id_map */
+} userns_mnt_id_hash SEC(".maps");
+
+struct { /* ring buffer that userns_restrict_free_user_ns() writes the inode numbers of freed user namespaces to */
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 4096); /* for a ring buffer map this is the buffer size in bytes */
+} userns_ringbuf SEC(".maps");
+
+static inline struct mount *real_mount(struct vfsmount *mnt) { /* get from an embedded struct vfsmount to the struct mount that contains it as member 'mnt' */
+ return container_of(mnt, struct mount, mnt);
+}
+
+static int validate_inode_on_mount(struct inode *inode, struct vfsmount *v) { /* returns 0 to allow, -EPERM to refuse; 'inode' is not used in the body */
+ struct user_namespace *mount_userns, *task_userns, *p;
+ unsigned task_userns_inode;
+ struct task_struct *task;
+ void *mnt_id_map;
+ struct mount *m;
+ int mnt_id;
+
+ /* Get the user namespace that owns the mount namespace this vfsmount is part of */
+ m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
+ mount_userns = m->mnt_ns->user_ns;
+
+ /* Get the user namespace from the credentials of the current task */
+ task = (struct task_struct*) bpf_get_current_task_btf();
+ task_userns = task->cred->user_ns;
+
+ /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+ * yes immediately. We check by following the ->parent chain upwards, starting at the mount's namespace. */
+ p = mount_userns;
+ for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+ if (p == task_userns)
+ return 0; /* our task's user namespace (or a child thereof) owns the mount namespace of this mount: allow! */
+
+ p = p->parent;
+ if (!p)
+ break;
+ }
+
+ /* If p is still set we used up all iterations without reaching the top of the chain. Something is fishy
+ * if there's more than 32 levels of namespaces involved. Let's better be safe than sorry, and refuse. */
+ if (p)
+ return -EPERM;
+
+ /* This is a mount foreign to our task's user namespace, let's consult our allow list */
+ task_userns_inode = task_userns->ns.inum;
+
+ mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode);
+ if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
+ return 0;
+
+ mnt_id = m->mnt_id;
+
+ /* Otherwise, say yes if the mount ID is allowlisted */
+ if (bpf_map_lookup_elem(mnt_id_map, &mnt_id))
+ return 0;
+
+ return -EPERM; /* rules exist for this userns, but this mount is not among them */
+}
+
+static int validate_path(const struct path *path, int ret) { /* shared body of the LSM programs in this file; returns 'ret' if non-zero, else the verdict for the path's mount */
+ struct inode *inode;
+ struct vfsmount *v;
+
+ if (ret != 0) /* propagate earlier error */
+ return ret;
+
+ inode = path->dentry->d_inode; /* only passed along, validate_inode_on_mount() does not look at it */
+ v = path->mnt;
+
+ return validate_inode_on_mount(inode, v);
+}
+
+SEC("lsm/path_chown") /* ownership change: checks the mount of the affected path; uid and gid are ignored */
+int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) {
+ return validate_path(path, ret);
+}
+
+SEC("lsm/path_mkdir") /* directory creation: checks the mount of 'dir'; dentry and mode are ignored */
+int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) {
+ return validate_path(dir, ret);
+}
+
+SEC("lsm/path_mknod") /* node creation: checks the mount of 'dir'; dentry, mode and dev are ignored */
+int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev, int ret) {
+ return validate_path(dir, ret);
+}
+
+SEC("lsm/path_symlink") /* symlink creation: checks the mount of 'dir'; dentry and old_name are ignored */
+int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) {
+ return validate_path(dir, ret);
+}
+
+SEC("lsm/path_link") /* hard link creation: checks the mount of 'new_dir' only; old_dentry and new_dentry are ignored */
+int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) {
+ return validate_path(new_dir, ret);
+}
+
+SEC("kprobe/free_user_ns") /* runs on entry to free_user_ns(), whose argument is the work item embedded in the user namespace */
+void BPF_KPROBE(userns_restrict_free_user_ns, struct work_struct *work) {
+ struct user_namespace *userns;
+ unsigned inode;
+ void *mnt_id_map; /* only tested against NULL below, never dereferenced */
+
+ /* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
+ * user namespaces being deleted than using kprobes, but couldn't find any. */
+
+ userns = bpf_rdonly_cast(container_of(work, struct user_namespace, work),
+ bpf_core_type_id_kernel(struct user_namespace));
+
+ inode = userns->ns.inum; /* same kind of key that validate_inode_on_mount() uses for userns_mnt_id_hash */
+
+ mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
+ if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
+ return;
+
+ bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0); /* the record is just the inode number; the return value is not checked */
+}
+
+static const char _license[] SEC("license") = "GPL"; /* declared as GPL towards the kernel, see the licensing note at the top of this file */