diff options
Diffstat (limited to 'src/nsresourced/bpf')
3 files changed, 221 insertions, 0 deletions
diff --git a/src/nsresourced/bpf/userns_restrict/meson.build b/src/nsresourced/bpf/userns_restrict/meson.build new file mode 100644 index 0000000..d773c75 --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('HAVE_VMLINUX_H') != 1 + subdir_done() +endif + +userns_restrict_bpf_o_unstripped = custom_target( + 'userns-restrict.bpf.unstripped.o', + input : 'userns-restrict.bpf.c', + output : 'userns-restrict.bpf.unstripped.o', + command : bpf_o_unstripped_cmd, + depends : vmlinux_h_dependency) + +userns_restrict_bpf_o = custom_target( + 'userns-restrict.bpf.o', + input : userns_restrict_bpf_o_unstripped, + output : 'userns-restrict.bpf.o', + command : bpf_o_cmd) + +userns_restrict_skel_h = custom_target( + 'userns-restrict.skel.h', + input : userns_restrict_bpf_o, + output : 'userns-restrict.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h new file mode 100644 index 0000000..271caf4 --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. 
+ */ + +#include "bpf-dlopen.h" + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__attach_skeleton sym_bpf_object__attach_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton + +#include "bpf/userns_restrict/userns-restrict.skel.h" diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c new file mode 100644 index 0000000..126422b --- /dev/null +++ b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* If offsetof() is implemented via __builtin_offsetof() then it doesn't work on current compilers, since the + * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to define + * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0 + * it does not.) */ +#undef offsetof +#undef container_of + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <errno.h> + +#ifndef bpf_core_cast +/* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with + * the definition of a bpf_core_cast macro. 
So use that one to avoid + * defining a prototype for bpf_rdonly_cast */ +void *bpf_rdonly_cast(void *, __u32) __ksym; +#endif + +/* BPF module that implements an allowlist of mounts (identified by mount ID) for user namespaces (identified + * by their inode number in nsfs) that restricts creation of inodes (which would inherit the caller's UID/GID) + * or changing of ownership (similar). + * + * This hooks into the various path-based LSM entrypoints that control inode creation as well as chown(), and + * then looks up the calling process' user namespace in a global map of namespaces, which points us to + * another map that is simply a list of allowed mnt_ids. */ + +// FIXME: ACL adjustments are currently not blocked. There's no path-based LSM hook available in the kernel +// for setting xattrs or ACLs, hence we cannot easily block them, even though we want that. We can get away +// with ignoring this for now, as ACLs never define ownership, but purely access: i.e. ACLs never allow +// taking possession of an object, but only control access to it. Thus, things like suid access modes should +// not be reachable through it. It still sucks though that a user can persistently add an ACL entry to a file +// with their transient UIDs/GIDs. 
+ +/* kernel currently enforces a maximum usernamespace nesting depth of 32, see create_user_ns() in the kernel sources */ +#define USER_NAMESPACE_DEPTH_MAX 32U + +struct mnt_id_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */ + __type(key, int); + __type(value, int); +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); /* placeholder, configured otherwise by nsresourced */ + __type(key, unsigned); /* userns inode */ + __array(values, struct mnt_id_map); +} userns_mnt_id_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} userns_ringbuf SEC(".maps"); + +static inline struct mount *real_mount(struct vfsmount *mnt) { + return container_of(mnt, struct mount, mnt); +} + +static int validate_inode_on_mount(struct inode *inode, struct vfsmount *v) { + struct user_namespace *mount_userns, *task_userns, *p; + unsigned task_userns_inode; + struct task_struct *task; + void *mnt_id_map; + struct mount *m; + int mnt_id; + + /* Get user namespace from vfsmount */ + m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount)); + mount_userns = m->mnt_ns->user_ns; + + /* Get user namespace from task */ + task = (struct task_struct*) bpf_get_current_task_btf(); + task_userns = task->cred->user_ns; + + /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say + * yes immediately. */ + p = mount_userns; + for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) { + if (p == task_userns) + return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */ + + p = p->parent; + if (!p) + break; + } + + /* Hmm, something is fishy if there's more than 32 levels of namespaces involved. Let's better be + * safe than sorry, and refuse. 
*/ + if (p) + return -EPERM; + + /* This is a mount foreign to our task's user namespace, let's consult our allow list */ + task_userns_inode = task_userns->ns.inum; + + mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode); + if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */ + return 0; + + mnt_id = m->mnt_id; + + /* Otherwise, say yes if the mount ID is allowlisted */ + if (bpf_map_lookup_elem(mnt_id_map, &mnt_id)) + return 0; + + return -EPERM; +} + +static int validate_path(const struct path *path, int ret) { + struct inode *inode; + struct vfsmount *v; + + if (ret != 0) /* propagate earlier error */ + return ret; + + inode = path->dentry->d_inode; + v = path->mnt; + + return validate_inode_on_mount(inode, v); +} + +SEC("lsm/path_chown") +int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) { + return validate_path(path, ret); +} + +SEC("lsm/path_mkdir") +int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_mknod") +int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_symlink") +int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) { + return validate_path(dir, ret); +} + +SEC("lsm/path_link") +int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) { + return validate_path(new_dir, ret); +} + +SEC("kprobe/free_user_ns") +void BPF_KPROBE(userns_restrict_free_user_ns, struct work_struct *work) { + struct user_namespace *userns; + unsigned inode; + void *mnt_id_map; + + /* Inform userspace that a user namespace just went away. 
I wish there was a nicer way to hook into + * user namespaces being deleted than using kprobes, but couldn't find any. */ + + userns = bpf_rdonly_cast(container_of(work, struct user_namespace, work), + bpf_core_type_id_kernel(struct user_namespace)); + + inode = userns->ns.inum; + + mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode); + if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */ + return; + + bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0); +} + +static const char _license[] SEC("license") = "GPL"; |