summary refs log tree commit diff stats
path: root/src/shared/bpf-program.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/shared/bpf-program.c')
-rw-r--r-- src/shared/bpf-program.c 513
1 files changed, 513 insertions, 0 deletions
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
new file mode 100644
index 0000000..bbdd4f6
--- /dev/null
+++ b/src/shared/bpf-program.c
@@ -0,0 +1,513 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-program.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "path-util.h"
+#include "serialize.h"
+#include "string-table.h"
+
+/* Short textual names of the cgroup BPF attach types, indexed by the attach type constant. Slots
+ * without an explicit initializer remain NULL. */
+static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
+        [BPF_CGROUP_INET_INGRESS]     = "ingress",
+        [BPF_CGROUP_INET_EGRESS]      = "egress",
+        [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
+        [BPF_CGROUP_SOCK_OPS]         = "sock_ops",
+        [BPF_CGROUP_DEVICE]           = "device",
+        [BPF_CGROUP_INET4_BIND]       = "bind4",
+        [BPF_CGROUP_INET6_BIND]       = "bind6",
+        [BPF_CGROUP_INET4_CONNECT]    = "connect4",
+        [BPF_CGROUP_INET6_CONNECT]    = "connect6",
+        [BPF_CGROUP_INET4_POST_BIND]  = "post_bind4",
+        [BPF_CGROUP_INET6_POST_BIND]  = "post_bind6",
+        [BPF_CGROUP_UDP4_SENDMSG]     = "sendmsg4",
+        [BPF_CGROUP_UDP6_SENDMSG]     = "sendmsg6",
+        [BPF_CGROUP_SYSCTL]           = "sysctl",
+        [BPF_CGROUP_UDP4_RECVMSG]     = "recvmsg4",
+        [BPF_CGROUP_UDP6_RECVMSG]     = "recvmsg6",
+        [BPF_CGROUP_GETSOCKOPT]       = "getsockopt",
+        [BPF_CGROUP_SETSOCKOPT]       = "setsockopt",
+};
+
+/* Instantiates the lookup helpers for the table above; this file relies on
+ * bpf_cgroup_attach_type_to_string() and bpf_cgroup_attach_type_from_string() further down. */
+DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
+
+/* Hash operations used for sets of BPFProgram objects: keys go through trivial_hash_func() and
+ * trivial_compare_func(), and bpf_program_free() is registered as the key destructor. */
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+                bpf_program_hash_ops,
+                void,
+                trivial_hash_func,
+                trivial_compare_func,
+                bpf_program_free);
+
+/* Destroys a BPFProgram together with everything it owns: a still-tracked cgroup attachment is detached,
+ * the kernel fd is closed and all buffers are released. NULL is accepted. Always returns NULL. */
+BPFProgram *bpf_program_free(BPFProgram *p) {
+        if (p) {
+                /* The kernel does not implicitly detach a BPF program from its cgroup once the last fd
+                 * referring to the program goes away. A process that terminates abnormally after attaching
+                 * a program therefore leaves it pinned, with no way to recover until the cgroup itself is
+                 * removed. That is especially bad for the root cgroup, or for the cgroup of a service that
+                 * cannot be restarted at runtime (e.g. dbus), where only a reboot reclaims the program's
+                 * memory. As a countermeasure we remember which cgroup a program was attached to and detach
+                 * it ourselves whenever we are about to close the BPF fd. Failures are deliberately
+                 * ignored here. */
+                (void) bpf_program_cgroup_detach(p);
+
+                safe_close(p->kernel_fd);
+
+                free(p->attached_path);
+                free(p->instructions);
+                free(p->prog_name);
+                free(p);
+        }
+
+        return NULL;
+}
+
+/* Asks the kernel (BPF_OBJ_GET_INFO_BY_FD) for information about the program behind prog_fd.
+ *
+ * The caller has to initialize *info before the call, because the kernel treats it as both input and
+ * output. info_len is the size of that structure. The raw syscall result is passed through
+ * RET_NERRNO(). */
+static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
+        union bpf_attr query;
+
+        /* Clear the whole union with an explicit memset rather than structured initialization, as some
+         * compilers leave padding bytes non-zero in the latter case.
+         * Refer to https://github.com/systemd/systemd/issues/18164
+         */
+        zero(query);
+        query.info.info = PTR_TO_UINT64(info);
+        query.info.info_len = info_len;
+        query.info.bpf_fd = prog_fd;
+
+        return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &query, sizeof(query)));
+}
+
+/* Allocates an empty, not yet loaded BPFProgram of the given type and stores it in *ret.
+ *
+ * prog_name is optional; when given it is copied and must be shorter than BPF_OBJ_NAME_LEN.
+ * Returns 0 on success, -ENAMETOOLONG for an overlong name and -ENOMEM on allocation failure. */
+int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
+        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+        _cleanup_free_ char *name_copy = NULL;
+
+        if (prog_name && strlen(prog_name) >= BPF_OBJ_NAME_LEN)
+                return -ENAMETOOLONG;
+
+        if (prog_name) {
+                name_copy = strdup(prog_name);
+                if (!name_copy)
+                        return -ENOMEM;
+        }
+
+        prog = new(BPFProgram, 1);
+        if (!prog)
+                return -ENOMEM;
+
+        /* No kernel object exists yet, hence mark the fd as invalid */
+        *prog = (BPFProgram) {
+                .kernel_fd = -EBADF,
+                .prog_type = prog_type,
+                .prog_name = TAKE_PTR(name_copy),
+        };
+
+        *ret = TAKE_PTR(prog);
+        return 0;
+}
+
+/* Creates a BPFProgram object for a program found at the given bpffs path and stores it in *ret.
+ *
+ * The program fd is obtained via bpf_program_load_from_bpf_fs() and the program type is then read back
+ * from the kernel. Returns 0 on success, -ENOMEM on allocation failure, or the negative error of the
+ * failing step. */
+int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
+        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+        struct bpf_prog_info prog_info = {};
+        int r;
+
+        assert(path);
+        assert(ret);
+
+        prog = new(BPFProgram, 1);
+        if (!prog)
+                return -ENOMEM;
+
+        *prog = (BPFProgram) {
+                .kernel_fd = -EBADF,
+                .prog_type = BPF_PROG_TYPE_UNSPEC,
+        };
+
+        r = bpf_program_load_from_bpf_fs(prog, path);
+        if (r >= 0)
+                /* Only now do we have an fd we can query for the real program type */
+                r = bpf_program_get_info_by_fd(prog->kernel_fd, &prog_info, sizeof(prog_info));
+        if (r < 0)
+                return r;
+
+        prog->prog_type = prog_info.type;
+
+        *ret = TAKE_PTR(prog);
+        return 0;
+}
+
+
+/* Appends 'count' instructions taken from 'instructions' to the not yet uploaded program 'p'.
+ *
+ * Returns 0 on success (including the trivial case count == 0), -EBUSY if the program already has a
+ * kernel fd, and -ENOMEM if the instruction buffer cannot be grown or the new instruction count would
+ * not fit into size_t. */
+int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
+
+        assert(p);
+
+        if (p->kernel_fd >= 0) /* don't allow modification after we uploaded things to the kernel */
+                return -EBUSY;
+
+        /* Nothing to append. Return before reaching memcpy(), whose pointer arguments have to be valid even
+         * for a zero length; this way callers may pass instructions == NULL along with count == 0. */
+        if (count == 0)
+                return 0;
+
+        assert(instructions);
+
+        /* Refuse counts whose sum would wrap around: we would otherwise allocate too little and write past
+         * the end of the buffer below. */
+        if (count > SIZE_MAX - p->n_instructions)
+                return -ENOMEM;
+
+        if (!GREEDY_REALLOC(p->instructions, p->n_instructions + count))
+                return -ENOMEM;
+
+        memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count);
+        p->n_instructions += count;
+
+        return 0;
+}
+
+/* Uploads the assembled instructions of 'p' to the kernel (BPF_PROG_LOAD) and stores the resulting fd in
+ * p->kernel_fd.
+ *
+ * log_buf/log_size optionally describe a buffer for the kernel's log; logging is requested only when
+ * log_buf is non-NULL. If the program has been uploaded before, the log buffer is zeroed and 0 is
+ * returned without talking to the kernel. Returns 0 on success, -errno on failure. */
+int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
+        union bpf_attr load;
+
+        assert(p);
+
+        /* Already uploaded: make this idempotent and hand back an emptied log buffer */
+        if (p->kernel_fd >= 0) {
+                memzero(log_buf, log_size);
+                return 0;
+        }
+
+        /* FIXME: Clang doesn't 0-pad with structured initialization, causing
+         * the kernel to reject the bpf_attr as invalid. See:
+         * https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
+         * Ideally it should behave like GCC, so that we can remove these workarounds. */
+        zero(load);
+        load.prog_type = p->prog_type;
+        load.license = PTR_TO_UINT64("GPL");
+        load.insns = PTR_TO_UINT64(p->instructions);
+        load.insn_cnt = p->n_instructions;
+        load.log_level = log_buf ? 1 : 0;
+        load.log_buf = PTR_TO_UINT64(log_buf);
+        load.log_size = log_size;
+
+        /* The union was zeroed above, so copying at most BPF_OBJ_NAME_LEN - 1 bytes leaves the name
+         * NUL-terminated */
+        if (p->prog_name)
+                strncpy(load.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);
+
+        p->kernel_fd = bpf(BPF_PROG_LOAD, &load, sizeof(load));
+
+        return p->kernel_fd < 0 ? -errno : 0;
+}
+
+/* Opens the BPF object found at the given bpffs path (BPF_OBJ_GET) and stores the fd in p->kernel_fd.
+ *
+ * Returns 0 on success, -EBUSY if 'p' already has a kernel fd, -errno if the syscall fails. */
+int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
+        union bpf_attr get;
+
+        assert(p);
+
+        /* don't overwrite an assembled or loaded program */
+        if (p->kernel_fd >= 0)
+                return -EBUSY;
+
+        zero(get);
+        get.pathname = PTR_TO_UINT64(path);
+
+        p->kernel_fd = bpf(BPF_OBJ_GET, &get, sizeof(get));
+
+        return p->kernel_fd < 0 ? -errno : 0;
+}
+
+/* Attaches program 'p' to the cgroup directory 'path' with the given attach type and flags, uploading the
+ * program to the kernel first if that has not happened yet.
+ *
+ * flags must be 0, BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI (-EINVAL otherwise). Only one attachment is
+ * tracked per program: a request that differs from the recorded one in path, type or flags yields
+ * -EBUSY. Returns 0 on success, -ENOMEM on allocation failure, or a negative errno from
+ * loading/opening/attaching. */
+int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
+        _cleanup_free_ char *path_copy = NULL;
+        _cleanup_close_ int cgroup_fd = -EBADF;
+        union bpf_attr attach;
+        int r;
+
+        assert(p);
+        assert(type >= 0);
+        assert(path);
+
+        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
+                return -EINVAL;
+
+        if (p->attached_path) {
+                /* We have to remember the cgroup a program is attached to, and there is room for just one
+                 * such record, hence refuse anything that does not match it right away. */
+                if (!path_equal(p->attached_path, path) ||
+                    p->attached_type != type ||
+                    p->attached_flags != flags)
+                        return -EBUSY;
+
+                /* Shortcut: a program we attached before usually need not be attached again. The exception
+                 * is BPF_F_ALLOW_OVERRIDE mode, where somebody else may have replaced our program in the
+                 * meantime, so we reattach to be safe. With flags == 0 nobody else can replace our
+                 * program, and with BPF_F_ALLOW_MULTI foreign programs get installed in addition to ours,
+                 * so in both of these cases ours is still in effect. */
+                if (flags != BPF_F_ALLOW_OVERRIDE)
+                        return 0;
+        }
+
+        /* Ensure we have a kernel object for this. */
+        r = bpf_program_load_kernel(p, NULL, 0);
+        if (r < 0)
+                return r;
+
+        /* Duplicate the path up front, so that no allocation can fail after the attach succeeded */
+        path_copy = strdup(path);
+        if (!path_copy)
+                return -ENOMEM;
+
+        cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+        if (cgroup_fd < 0)
+                return -errno;
+
+        zero(attach);
+        attach.target_fd = cgroup_fd;
+        attach.attach_bpf_fd = p->kernel_fd;
+        attach.attach_type = type;
+        attach.attach_flags = flags;
+
+        if (bpf(BPF_PROG_ATTACH, &attach, sizeof(attach)) < 0)
+                return -errno;
+
+        /* Record the attachment, so that it can be detached again later */
+        free_and_replace(p->attached_path, path_copy);
+        p->attached_flags = flags;
+        p->attached_type = type;
+
+        return 0;
+}
+
+/* Detaches program 'p' from the cgroup it is recorded as attached to and forgets that record.
+ *
+ * Returns 0 on success (also when the cgroup directory no longer exists), -EUNATCH if no attachment is
+ * recorded, and -errno if opening the cgroup or the detach syscall fails; in the failure cases the
+ * record is kept. */
+int bpf_program_cgroup_detach(BPFProgram *p) {
+        _cleanup_close_ int cgroup_fd = -EBADF;
+
+        assert(p);
+
+        if (!p->attached_path)
+                return -EUNATCH;
+
+        cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+        if (cgroup_fd >= 0) {
+                union bpf_attr detach;
+
+                zero(detach);
+                detach.target_fd = cgroup_fd;
+                detach.attach_bpf_fd = p->kernel_fd;
+                detach.attach_type = p->attached_type;
+
+                if (bpf(BPF_PROG_DETACH, &detach, sizeof(detach)) < 0)
+                        return -errno;
+
+        } else if (errno != ENOENT)
+                return -errno;
+
+        /* If we get here with ENOENT, the cgroup does not exist anymore. In that case nothing has to be
+         * detached explicitly, since the removal did so implicitly, hence don't complain. */
+
+        p->attached_path = mfree(p->attached_path);
+
+        return 0;
+}
+
+/* Creates a BPF map (BPF_MAP_CREATE) with the given type, key/value sizes, capacity and flags.
+ *
+ * 'name' is turned into the map name by truncating it to what fits into attr.map_name and replacing
+ * every character other than alphanumerical ones and "." by "_". The syscall result is passed through
+ * RET_NERRNO(). */
+int bpf_map_new(
+                const char *name,
+                enum bpf_map_type type,
+                size_t key_size,
+                size_t value_size,
+                size_t max_entries,
+                uint32_t flags) {
+
+        union bpf_attr create;
+
+        zero(create);
+        create.map_type = type;
+        create.map_flags = flags;
+        create.max_entries = max_entries;
+        create.key_size = key_size;
+        create.value_size = value_size;
+
+        /* The map name is primarily informational for debugging purposes, and typically too short
+         * to carry the full unit name, hence we employ a trivial lossy escaping to make it fit
+         * (truncation + only alphanumerical, "." and "_" are allowed as per
+         * https://www.kernel.org/doc/html/next/bpf/maps.html#usage-notes). The last byte of the
+         * zeroed name field is never written, so the result stays NUL-terminated. */
+        for (size_t i = 0; i + 1 < sizeof(create.map_name); i++) {
+                char c = name[i];
+
+                if (c == '\0')
+                        break;
+
+                create.map_name[i] = strchr(ALPHANUMERICAL ".", c) ? c : '_';
+        }
+
+        return RET_NERRNO(bpf(BPF_MAP_CREATE, &create, sizeof(create)));
+}
+
+/* Issues BPF_MAP_UPDATE_ELEM on the map referred to by 'fd', passing the memory at 'key' and 'value' to
+ * the kernel. The syscall result is passed through RET_NERRNO(). */
+int bpf_map_update_element(int fd, const void *key, void *value) {
+        union bpf_attr update;
+
+        zero(update);
+        update.value = PTR_TO_UINT64(value);
+        update.key = PTR_TO_UINT64(key);
+        update.map_fd = fd;
+
+        return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &update, sizeof(update)));
+}
+
+/* Issues BPF_MAP_LOOKUP_ELEM on the map referred to by 'fd' for the entry identified by 'key'; 'value'
+ * is the buffer handed to the kernel for the result. The syscall result is passed through
+ * RET_NERRNO(). */
+int bpf_map_lookup_element(int fd, const void *key, void *value) {
+        union bpf_attr lookup;
+
+        zero(lookup);
+        lookup.value = PTR_TO_UINT64(value);
+        lookup.key = PTR_TO_UINT64(key);
+        lookup.map_fd = fd;
+
+        return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &lookup, sizeof(lookup)));
+}
+
+/* Pins the BPF object behind 'prog_fd' at 'bpffs_path' (BPF_OBJ_PIN). The syscall result is passed
+ * through RET_NERRNO(). */
+int bpf_program_pin(int prog_fd, const char *bpffs_path) {
+        union bpf_attr pin;
+
+        zero(pin);
+        pin.bpf_fd = prog_fd;
+        pin.pathname = PTR_TO_UINT64(bpffs_path);
+
+        return RET_NERRNO(bpf(BPF_OBJ_PIN, &pin, sizeof(pin)));
+}
+
+/* Looks up the kernel-assigned ID of the program behind 'prog_fd' and stores it in *ret_id.
+ *
+ * Returns 0 on success, or the negative error of the info query, in which case *ret_id is left
+ * untouched. */
+int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
+        /* Must be zero-initialized: the kernel reads this structure as input, too */
+        struct bpf_prog_info info = {};
+        int r;
+
+        assert(ret_id);
+
+        r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
+        if (r < 0)
+                return r;
+
+        *ret_id = info.id;
+
+        return 0;
+}
+
+/* Writes the cgroup attachment of 'p' to 'f' under 'key', as a record of the form "FD TYPE PATH": the
+ * number of a duplicate of the kernel fd that was added to 'fds', the attach type name, and the
+ * C-escaped cgroup path.
+ *
+ * A NULL or unattached program is skipped and 0 is returned. On success the attachment record of 'p' is
+ * dropped (see below) and 0 is returned; otherwise a negative error is returned. */
+int bpf_program_serialize_attachment(
+                FILE *f,
+                FDSet *fds,
+                const char *key,
+                BPFProgram *p) {
+
+        _cleanup_free_ char *path_escaped = NULL;
+        int fd_copy, r;
+
+        if (!p)
+                return 0;
+        if (!p->attached_path)
+                return 0;
+
+        assert(p->kernel_fd >= 0);
+
+        path_escaped = cescape(p->attached_path);
+        if (!path_escaped)
+                return -ENOMEM;
+
+        fd_copy = fdset_put_dup(fds, p->kernel_fd);
+        if (fd_copy < 0)
+                return log_error_errno(fd_copy, "Failed to add BPF kernel fd to serialize: %m");
+
+        r = serialize_item_format(
+                        f,
+                        key,
+                        "%i %s %s",
+                        fd_copy,
+                        bpf_cgroup_attach_type_to_string(p->attached_type),
+                        path_escaped);
+        if (r < 0)
+                return r;
+
+        /* Now that it is serialized, forget that this program is attached: from here on the attachment is
+         * 'owned' by the serialization rather than by us. This matters because of BPF's less-than-ideal
+         * lifecycle handling: a program is never detached from a cgroup implicitly on close(), it has to
+         * be done explicitly. bpf_program_free() does exactly that whenever attached_path is non-NULL.
+         * Since the attachment is supposed to survive until it is deserialized again, freeing the program
+         * must not detach it, hence reset attached_path to NULL here. */
+        p->attached_path = mfree(p->attached_path);
+
+        return 0;
+}
+
+/* Serializes the attachment of every program contained in 'set' under the same 'key', via
+ * bpf_program_serialize_attachment(). Stops at the first failure and returns its negative error;
+ * returns 0 if all programs were processed. */
+int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
+        BPFProgram *prog;
+
+        SET_FOREACH(prog, set) {
+                int r;
+
+                r = bpf_program_serialize_attachment(f, fds, key, prog);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Parses a serialized attachment record of the form "FD TYPE PATH" (fd number, attach type name,
+ * C-escaped cgroup path) and turns it into a BPFProgram that owns the fd taken out of 'fds'.
+ *
+ * On success the new program is stored in *bpfp, a program previously stored there is freed, and 0 is
+ * returned. On failure a negative error is returned and *bpfp is left untouched: -EINVAL if a field is
+ * missing, -ENOMEM on allocation failure, otherwise the error of the failing parsing/lookup step. */
+int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
+        _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL;
+        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        ssize_t l;
+        int ifd, at, r;
+
+        assert(v);
+        assert(bpfp);
+
+        /* Extract first word: the fd number */
+        r = extract_first_word(&v, &sfd, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        /* Propagate the parse error itself: 'r' is positive at this point and would be mistaken for
+         * success by the caller. */
+        ifd = parse_fd(sfd);
+        if (ifd < 0)
+                return ifd;
+
+        /* Extract second word: the attach type */
+        r = extract_first_word(&v, &sat, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        at = bpf_cgroup_attach_type_from_string(sat);
+        if (at < 0)
+                return at;
+
+        /* The rest is the path */
+        if (isempty(v))
+                return -EINVAL;
+
+        l = cunescape(v, 0, &unescaped);
+        if (l < 0)
+                return l;
+
+        /* Take the fd out of the set only after all parsing succeeded; from here on we own it */
+        fd = fdset_remove(fds, ifd);
+        if (fd < 0)
+                return fd;
+
+        p = new(BPFProgram, 1);
+        if (!p)
+                return -ENOMEM;
+
+        *p = (BPFProgram) {
+                .kernel_fd = TAKE_FD(fd),
+                .prog_type = BPF_PROG_TYPE_UNSPEC,
+                .attached_path = TAKE_PTR(unescaped),
+                .attached_type = at,
+        };
+
+        /* Replace whatever program was stored before; bpf_program_free() copes with NULL */
+        bpf_program_free(*bpfp);
+
+        *bpfp = TAKE_PTR(p);
+        return 0;
+}
+
+/* Deserializes one attachment record from 'v' and adds the resulting program to the set *bpfsetp, using
+ * bpf_program_hash_ops.
+ *
+ * The program is handed to set_ensure_consume(), which presumably takes ownership of it (per its name;
+ * confirm against its definition). Returns 0 on success, a negative error otherwise. */
+int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
+        BPFProgram *prog = NULL;
+        int r;
+
+        assert(v);
+        assert(bpfsetp);
+
+        r = bpf_program_deserialize_attachment(v, fds, &prog);
+        if (r < 0)
+                return r;
+
+        r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);
+
+        return r < 0 ? r : 0;
+}