diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 08:24:23 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 08:24:23 +0000 |
commit | 67c27783d7735af6ba22b9f031d97ca4ea56c29c (patch) | |
tree | 17770fad3c90bf420cb2470e6e51255fcbf31bf9 /src | |
parent | Initial commit. (diff) | |
download | libbpf-upstream.tar.xz libbpf-upstream.zip |
Adding upstream version 1.1.0.upstream/1.1.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src')
41 files changed, 42565 insertions, 0 deletions
diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..0473afb --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,6 @@ +*.o +*.a +/libbpf.pc +/libbpf.so* +/staticobjs +/sharedobjs diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..d535f81 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' "$(1)" "$(2)" "$(if $(3), $(3))"; +endif + +LIBBPF_MAJOR_VERSION := 1 +LIBBPF_MINOR_VERSION := 1 +LIBBPF_PATCH_VERSION := 0 +LIBBPF_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).$(LIBBPF_PATCH_VERSION) +LIBBPF_MAJMIN_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).0 +LIBBPF_MAP_VERSION := $(shell grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | sort -rV | head -n1 | cut -d'_' -f2) +ifneq ($(LIBBPF_MAJMIN_VERSION), $(LIBBPF_MAP_VERSION)) +$(error Libbpf release ($(LIBBPF_VERSION)) and map ($(LIBBPF_MAP_VERSION)) versions are out of sync!) +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +TOPDIR = .. + +INCLUDES := -I. -I$(TOPDIR)/include -I$(TOPDIR)/include/uapi +ALL_CFLAGS := $(INCLUDES) + +SHARED_CFLAGS += -fPIC -fvisibility=hidden -DSHARED + +CFLAGS ?= -g -O2 -Werror -Wall -std=gnu89 +ALL_CFLAGS += $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(EXTRA_CFLAGS) +ALL_LDFLAGS += $(LDFLAGS) $(EXTRA_LDFLAGS) + +ifdef NO_PKG_CONFIG + ALL_LDFLAGS += -lelf -lz +else + PKG_CONFIG ?= pkg-config + ALL_CFLAGS += $(shell $(PKG_CONFIG) --cflags libelf zlib) + ALL_LDFLAGS += $(shell $(PKG_CONFIG) --libs libelf zlib) +endif + +OBJDIR ?= . +SHARED_OBJDIR := $(OBJDIR)/sharedobjs +STATIC_OBJDIR := $(OBJDIR)/staticobjs +OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \ + nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \ + btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \ + relo_core.o usdt.o +SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS)) +STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) + +STATIC_LIBS := $(OBJDIR)/libbpf.a +ifndef BUILD_STATIC_ONLY + SHARED_LIBS := $(OBJDIR)/libbpf.so \ + $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + VERSION_SCRIPT := libbpf.map +endif + +HEADERS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ + bpf_helpers.h bpf_helper_defs.h bpf_tracing.h \ + bpf_endian.h bpf_core_read.h skel_internal.h libbpf_version.h \ + usdt.bpf.h +UAPI_HEADERS := $(addprefix $(TOPDIR)/include/uapi/linux/,\ + bpf.h bpf_common.h btf.h) + +PC_FILE := $(OBJDIR)/libbpf.pc + +INSTALL = install + +DESTDIR ?= + +HOSTARCH = $(firstword $(subst -, ,$(shell $(CC) -dumpmachine))) +ifeq ($(filter-out %64 %64be %64eb %64le %64el s390x, $(HOSTARCH)),) + LIBSUBDIR := lib64 +else + LIBSUBDIR := lib +endif + +# By default let the pc file itself use ${prefix} in includedir/libdir so that +# the prefix can be overridden at runtime (eg: --define-prefix) +ifndef LIBDIR + LIBDIR_PC := $$\{prefix\}/$(LIBSUBDIR) +else + LIBDIR_PC := $(LIBDIR) +endif +PREFIX ?= /usr +LIBDIR ?= $(PREFIX)/$(LIBSUBDIR) +INCLUDEDIR ?= $(PREFIX)/include +UAPIDIR ?= $(PREFIX)/include + +TAGS_PROG := $(if $(shell which etags 2>/dev/null),etags,ctags) + +all: $(STATIC_LIBS) $(SHARED_LIBS) $(PC_FILE) + +$(OBJDIR)/libbpf.a: $(STATIC_OBJS) + $(call msg,AR,$@) + $(Q)$(AR) rcs $@ $^ + +$(OBJDIR)/libbpf.so: $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION): $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_VERSION): $(SHARED_OBJS) + $(call msg,CC,$@) + $(Q)$(CC) -shared -Wl,--version-script=$(VERSION_SCRIPT) \ + -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $^ $(ALL_LDFLAGS) -o $@ + +$(OBJDIR)/libbpf.pc: force + $(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \ + -e "s|@LIBDIR@|$(LIBDIR_PC)|" \ + -e "s|@VERSION@|$(LIBBPF_VERSION)|" \ + < libbpf.pc.template > $@ + +$(STATIC_OBJDIR) $(SHARED_OBJDIR): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +$(STATIC_OBJDIR)/%.o: %.c | $(STATIC_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +$(SHARED_OBJDIR)/%.o: %.c | $(SHARED_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(SHARED_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +define do_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)$(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR)$2' +endef + +# Preserve symlinks at installation. +define do_s_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)cp -fR $1 '$(DESTDIR)$2' +endef + +install: all install_headers install_pkgconfig + $(call do_s_install,$(STATIC_LIBS) $(SHARED_LIBS),$(LIBDIR)) + +install_headers: + $(call do_install,$(HEADERS),$(INCLUDEDIR)/bpf,644) + +# UAPI headers can be installed by a different package so they're not installed +# in by install rule. +install_uapi_headers: + $(call do_install,$(UAPI_HEADERS),$(UAPIDIR)/linux,644) + +install_pkgconfig: $(PC_FILE) + $(call do_install,$(PC_FILE),$(LIBDIR)/pkgconfig,644) + +clean: + $(call msg,CLEAN) + $(Q)rm -rf *.o *.a *.so *.so.* *.pc $(SHARED_OBJDIR) $(STATIC_OBJDIR) + +.PHONY: cscope tags force +cscope: + $(call msg,CSCOPE) + $(Q)ls *.c *.h > cscope.files + $(Q)cscope -b -q -f cscope.out + +tags: + $(call msg,CTAGS) + $(Q)rm -f TAGS tags + $(Q)ls *.c *.h | xargs $(TAGS_PROG) -a + +force: diff --git a/src/bpf.c b/src/bpf.c new file mode 100644 index 0000000..9aff98f --- /dev/null +++ b/src/bpf.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses> + */ + +#include <stdlib.h> +#include <string.h> +#include <memory.h> +#include <unistd.h> +#include <asm/unistd.h> +#include <errno.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <limits.h> +#include <sys/resource.h> +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +/* + * When building perf, unistd.h is overridden. __NR_bpf is + * required to be defined explicitly. + */ +#ifndef __NR_bpf +# if defined(__i386__) +# define __NR_bpf 357 +# elif defined(__x86_64__) +# define __NR_bpf 321 +# elif defined(__aarch64__) +# define __NR_bpf 280 +# elif defined(__sparc__) +# define __NR_bpf 349 +# elif defined(__s390__) +# define __NR_bpf 351 +# elif defined(__arc__) +# define __NR_bpf 280 +# elif defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# else +# error __NR_bpf not defined. libbpf does not support your arch. +# endif +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + return syscall(__NR_bpf, cmd, attr, size); +} + +static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + int fd; + + fd = sys_bpf(cmd, attr, size); + return ensure_good_fd(fd); +} + +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) +{ + int fd; + + do { + fd = sys_bpf_fd(BPF_PROG_LOAD, attr, size); + } while (fd < 0 && errno == EAGAIN && --attempts > 0); + + return fd; +} + +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and progs. This was done in [0]. + * We use the support for bpf_ktime_get_coarse_ns() helper, which was added in + * the same 5.11 Linux release ([1]), to detect memcg-based accounting for BPF. + * + * [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/ + * [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") + */ +int probe_memcg_account(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + struct bpf_insn insns[] = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd; + + /* attempt loading freplace trying to use custom BTF */ + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); + if (prog_fd >= 0) { + close(prog_fd); + return 1; + } + return 0; +} + +static bool memlock_bumped; +static rlim_t memlock_rlim = RLIM_INFINITY; + +int libbpf_set_memlock_rlim(size_t memlock_bytes) +{ + if (memlock_bumped) + return libbpf_err(-EBUSY); + + memlock_rlim = memlock_bytes; + return 0; +} + +int bump_rlimit_memlock(void) +{ + struct rlimit rlim; + + /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ + if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) + return 0; + + memlock_bumped = true; + + /* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */ + if (memlock_rlim == 0) + return 0; + + rlim.rlim_cur = rlim.rlim_max = memlock_rlim; + if (setrlimit(RLIMIT_MEMLOCK, &rlim)) + return -errno; + + return 0; +} + +int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_map_create_opts)) + return libbpf_err(-EINVAL); + + attr.map_type = map_type; + if (map_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + attr.btf_fd = OPTS_GET(opts, btf_fd, 0); + attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0); + attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0); + attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0); + + attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0); + attr.map_flags = OPTS_GET(opts, map_flags, 0); + attr.map_extra = OPTS_GET(opts, map_extra, 0); + attr.numa_node = OPTS_GET(opts, numa_node, 0); + attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); + + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +static void * +alloc_zero_tailing_info(const void *orecord, __u32 cnt, + __u32 actual_rec_size, __u32 expected_rec_size) +{ + __u64 info_len = (__u64)actual_rec_size * cnt; + void *info, *nrecord; + int i; + + info = malloc(info_len); + if (!info) + return NULL; + + /* zero out bytes kernel does not understand */ + nrecord = info; + for (i = 0; i < cnt; i++) { + memcpy(nrecord, orecord, expected_rec_size); + memset(nrecord + expected_rec_size, 0, + actual_rec_size - expected_rec_size); + orecord += actual_rec_size; + nrecord += actual_rec_size; + } + + return info; +} + +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + const struct bpf_prog_load_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, fd_array); + void *finfo = NULL, *linfo = NULL; + const char *func_info, *line_info; + __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; + __u32 func_info_rec_size, line_info_rec_size; + int fd, attempts; + union bpf_attr attr; + char *log_buf; + + bump_rlimit_memlock(); + + if (!OPTS_VALID(opts, bpf_prog_load_opts)) + return libbpf_err(-EINVAL); + + attempts = OPTS_GET(opts, attempts, 0); + if (attempts < 0) + return libbpf_err(-EINVAL); + if (attempts == 0) + attempts = PROG_LOAD_ATTEMPTS; + + memset(&attr, 0, attr_sz); + + attr.prog_type = prog_type; + attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0); + + attr.prog_btf_fd = OPTS_GET(opts, prog_btf_fd, 0); + attr.prog_flags = OPTS_GET(opts, prog_flags, 0); + attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); + attr.kern_version = OPTS_GET(opts, kern_version, 0); + + if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + attr.license = ptr_to_u64(license); + + if (insn_cnt > UINT_MAX) + return libbpf_err(-E2BIG); + + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)insn_cnt; + + attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); + attach_btf_obj_fd = OPTS_GET(opts, attach_btf_obj_fd, 0); + + if (attach_prog_fd && attach_btf_obj_fd) + return libbpf_err(-EINVAL); + + attr.attach_btf_id = OPTS_GET(opts, attach_btf_id, 0); + if (attach_prog_fd) + attr.attach_prog_fd = attach_prog_fd; + else + attr.attach_btf_obj_fd = attach_btf_obj_fd; + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (!!log_buf != !!log_size) + return libbpf_err(-EINVAL); + if (log_level > (4 | 2 | 1)) + return libbpf_err(-EINVAL); + if (log_level && !log_buf) + return libbpf_err(-EINVAL); + + func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); + func_info = OPTS_GET(opts, func_info, NULL); + attr.func_info_rec_size = func_info_rec_size; + attr.func_info = ptr_to_u64(func_info); + attr.func_info_cnt = OPTS_GET(opts, func_info_cnt, 0); + + line_info_rec_size = OPTS_GET(opts, line_info_rec_size, 0); + line_info = OPTS_GET(opts, line_info, NULL); + attr.line_info_rec_size = line_info_rec_size; + attr.line_info = ptr_to_u64(line_info); + attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0); + + attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL)); + + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = log_level; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + if (fd >= 0) + return fd; + + /* After bpf_prog_load, the kernel may modify certain attributes + * to give user space a hint how to deal with loading failure. + * Check to see whether we can make some changes and load again. + */ + while (errno == E2BIG && (!finfo || !linfo)) { + if (!finfo && attr.func_info_cnt && + attr.func_info_rec_size < func_info_rec_size) { + /* try with corrected func info records */ + finfo = alloc_zero_tailing_info(func_info, + attr.func_info_cnt, + func_info_rec_size, + attr.func_info_rec_size); + if (!finfo) { + errno = E2BIG; + goto done; + } + + attr.func_info = ptr_to_u64(finfo); + attr.func_info_rec_size = func_info_rec_size; + } else if (!linfo && attr.line_info_cnt && + attr.line_info_rec_size < line_info_rec_size) { + linfo = alloc_zero_tailing_info(line_info, + attr.line_info_cnt, + line_info_rec_size, + attr.line_info_rec_size); + if (!linfo) { + errno = E2BIG; + goto done; + } + + attr.line_info = ptr_to_u64(linfo); + attr.line_info_rec_size = line_info_rec_size; + } else { + break; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + if (fd >= 0) + goto done; + } + + if (log_level == 0 && log_buf) { + /* log_level == 0 with non-NULL log_buf requires retrying on error + * with log_level == 1 and log_buf/log_buf_size set, to get details of + * failure + */ + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = 1; + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + } +done: + /* free() doesn't affect errno, so we don't need to restore it */ + free(finfo); + free(linfo); + return libbpf_err_errno(fd); +} + +int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_get_next_key(int fd, const void *key, void *next_key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, next_key); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.next_key = ptr_to_u64(next_key); + + ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + ret = sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +static int bpf_map_batch_common(int cmd, int fd, void *in_batch, + void *out_batch, void *keys, void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, batch); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_map_batch_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.batch.map_fd = fd; + attr.batch.in_batch = ptr_to_u64(in_batch); + attr.batch.out_batch = ptr_to_u64(out_batch); + attr.batch.keys = ptr_to_u64(keys); + attr.batch.values = ptr_to_u64(values); + attr.batch.count = *count; + attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); + attr.batch.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(cmd, &attr, attr_sz); + *count = attr.batch.count; + + return libbpf_err_errno(ret); +} + +int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, + NULL, (void *)keys, NULL, count, opts); +} + +int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, in_batch, + out_batch, keys, values, count, opts); +} + +int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH, + fd, in_batch, out_batch, keys, values, + count, opts); +} + +int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, + (void *)keys, (void *)values, count, opts); +} + +int bpf_obj_pin(int fd, const char *pathname) +{ + const size_t attr_sz = offsetofend(union bpf_attr, file_flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.pathname = ptr_to_u64((void *)pathname); + attr.bpf_fd = fd; + + ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_obj_get(const char *pathname) +{ + return bpf_obj_get_opts(pathname, NULL); +} + +int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, file_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_obj_get_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); + + fd = sys_bpf_fd(BPF_OBJ_GET, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = flags, + ); + + return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts); +} + +int bpf_prog_attach_opts(int prog_fd, int target_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_attach_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.attach_flags = OPTS_GET(opts, flags, 0); + attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); + + ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create); + __u32 target_btf_id, iter_info_len; + union bpf_attr attr; + int fd, err; + + if (!OPTS_VALID(opts, bpf_link_create_opts)) + return libbpf_err(-EINVAL); + + iter_info_len = OPTS_GET(opts, iter_info_len, 0); + target_btf_id = OPTS_GET(opts, target_btf_id, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (iter_info_len || target_btf_id) { + if (iter_info_len && target_btf_id) + return libbpf_err(-EINVAL); + if (!OPTS_ZEROED(opts, target_btf_id)) + return libbpf_err(-EINVAL); + } + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + attr.link_create.flags = OPTS_GET(opts, flags, 0); + + if (target_btf_id) { + attr.link_create.target_btf_id = target_btf_id; + goto proceed; + } + + switch (attach_type) { + case BPF_TRACE_ITER: + attr.link_create.iter_info = ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0)); + attr.link_create.iter_info_len = iter_info_len; + break; + case BPF_PERF_EVENT: + attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0); + if (!OPTS_ZEROED(opts, perf_event)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_KPROBE_MULTI: + attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); + attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); + attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); + attr.link_create.kprobe_multi.addrs = ptr_to_u64(OPTS_GET(opts, kprobe_multi.addrs, 0)); + attr.link_create.kprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, kprobe_multi.cookies, 0)); + if (!OPTS_ZEROED(opts, kprobe_multi)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: + attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); + if (!OPTS_ZEROED(opts, tracing)) + return libbpf_err(-EINVAL); + break; + default: + if (!OPTS_ZEROED(opts, flags)) + return libbpf_err(-EINVAL); + break; + } +proceed: + fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz); + if (fd >= 0) + return fd; + /* we'll get EINVAL if LINK_CREATE doesn't support attaching fentry + * and other similar programs + */ + err = -errno; + if (err != -EINVAL) + return libbpf_err(err); + + /* if user used features not supported by + * BPF_RAW_TRACEPOINT_OPEN command, then just give up immediately + */ + if (attr.link_create.target_fd || attr.link_create.target_btf_id) + return libbpf_err(err); + if (!OPTS_ZEROED(opts, sz)) + return libbpf_err(err); + + /* otherwise, for few select kinds of programs that can be + * attached using BPF_RAW_TRACEPOINT_OPEN command, try that as + * a fallback for older kernels + */ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + case BPF_LSM_MAC: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + return bpf_raw_tracepoint_open(NULL, prog_fd); + default: + return libbpf_err(err); + } +} + +int bpf_link_detach(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_detach); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.link_detach.link_fd = link_fd; + + ret = sys_bpf(BPF_LINK_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_update); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_link_update_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_update.link_fd = link_fd; + attr.link_update.new_prog_fd = new_prog_fd; + attr.link_update.flags = OPTS_GET(opts, flags, 0); + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + + ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_iter_create(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, iter_create); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.iter_create.link_fd = link_fd; + + fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, query); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + + attr.query.target_fd = target_fd; + attr.query.attach_type = type; + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + + ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); + + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + + return libbpf_err_errno(ret); +} + +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + LIBBPF_OPTS(bpf_prog_query_opts, opts); + int ret; + + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; + + ret = bpf_prog_query_opts(target_fd, type, &opts); + + if (attach_flags) + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; + + return libbpf_err_errno(ret); +} + +int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, test); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_test_run_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.batch_size = OPTS_GET(opts, batch_size, 0); + attr.test.cpu = OPTS_GET(opts, cpu, 0); + attr.test.flags = OPTS_GET(opts, flags, 0); + attr.test.repeat = OPTS_GET(opts, repeat, 0); + attr.test.duration = OPTS_GET(opts, duration, 0); + attr.test.ctx_size_in = OPTS_GET(opts, ctx_size_in, 0); + attr.test.ctx_size_out = OPTS_GET(opts, ctx_size_out, 0); + attr.test.data_size_in = OPTS_GET(opts, data_size_in, 0); + attr.test.data_size_out = OPTS_GET(opts, data_size_out, 0); + attr.test.ctx_in = ptr_to_u64(OPTS_GET(opts, ctx_in, NULL)); + attr.test.ctx_out = ptr_to_u64(OPTS_GET(opts, ctx_out, NULL)); + attr.test.data_in = ptr_to_u64(OPTS_GET(opts, data_in, NULL)); + attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL)); + + ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, attr_sz); + + OPTS_SET(opts, data_size_out, attr.test.data_size_out); + OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out); + OPTS_SET(opts, duration, attr.test.duration); + OPTS_SET(opts, retval, attr.test.retval); + + return libbpf_err_errno(ret); +} + +static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.start_id = start_id; + + err = sys_bpf(cmd, &attr, attr_sz); + if (!err) + *next_id = attr.next_id; + + return libbpf_err_errno(err); +} + +int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID); +} + +int bpf_map_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID); +} + +int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID); +} + +int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); +} + +int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_get_fd_by_id(__u32 id) +{ + return bpf_prog_get_fd_by_id_opts(id, NULL); +} + +int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.map_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_map_get_fd_by_id(__u32 id) +{ + return bpf_map_get_fd_by_id_opts(id, NULL); +} + +int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.btf_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_get_fd_by_id(__u32 id) +{ + return bpf_btf_get_fd_by_id_opts(id, NULL); +} + +int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_link_get_fd_by_id(__u32 id) +{ + return bpf_link_get_fd_by_id_opts(id, NULL); +} + +int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = bpf_fd; + attr.info.info_len = *info_len; + attr.info.info = ptr_to_u64(info); + + err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); + if (!err) + *info_len = attr.info.info_len; + return libbpf_err_errno(err); +} + +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = ptr_to_u64(name); + attr.raw_tracepoint.prog_fd = prog_fd; + + fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_level); + union bpf_attr attr; + char *log_buf; + size_t log_size; + __u32 log_level; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_btf_load_opts)) + return libbpf_err(-EINVAL); + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (log_size > UINT_MAX) + return libbpf_err(-EINVAL); + if (log_size && !log_buf) + return libbpf_err(-EINVAL); + + attr.btf = ptr_to_u64(btf_data); + attr.btf_size = btf_size; + /* log_level == 0 and log_buf != NULL means "try loading without + * log_buf, but retry with log_buf and log_level=1 on error", which is + * consistent across low-level and high-level BTF and program loading + * APIs within libbpf and provides a sensible behavior in practice + */ + if (log_level) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = log_level; + } + + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + if (fd < 0 && log_buf && log_level == 0) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = 1; + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + } + return libbpf_err_errno(fd); +} + +int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, + __u64 *probe_addr) +{ + const size_t attr_sz = offsetofend(union bpf_attr, task_fd_query); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.task_fd_query.pid = pid; + attr.task_fd_query.fd = fd; + attr.task_fd_query.flags = flags; + attr.task_fd_query.buf = ptr_to_u64(buf); + attr.task_fd_query.buf_len = *buf_len; + + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, attr_sz); + + *buf_len = attr.task_fd_query.buf_len; + *prog_id = attr.task_fd_query.prog_id; + *fd_type = attr.task_fd_query.fd_type; + *probe_offset = attr.task_fd_query.probe_offset; + *probe_addr = attr.task_fd_query.probe_addr; + + return libbpf_err_errno(err); +} + +int bpf_enable_stats(enum bpf_stats_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, enable_stats); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.enable_stats.type = type; + + fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_bind_map); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_bind_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_bind_map.prog_fd = prog_fd; + attr.prog_bind_map.map_fd = map_fd; + attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz); + return libbpf_err_errno(ret); +} diff --git a/src/bpf.h b/src/bpf.h new file mode 100644 index 0000000..7468978 --- /dev/null +++ b/src/bpf.h @@ -0,0 +1,461 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses> + */ +#ifndef __LIBBPF_BPF_H +#define __LIBBPF_BPF_H + +#include <linux/bpf.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "libbpf_common.h" +#include "libbpf_legacy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int libbpf_set_memlock_rlim(size_t memlock_bytes); + +struct bpf_map_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + + __u32 inner_map_fd; + __u32 map_flags; + __u64 map_extra; + + __u32 numa_node; + __u32 map_ifindex; +}; +#define bpf_map_create_opts__last_field map_ifindex + +LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts); + +struct bpf_prog_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* libbpf can retry BPF_PROG_LOAD command if bpf() syscall returns + * -EAGAIN. This field determines how many attempts libbpf has to + * make. If not specified, libbpf will use default value of 5. + */ + int attempts; + + enum bpf_attach_type expected_attach_type; + __u32 prog_btf_fd; + __u32 prog_flags; + __u32 prog_ifindex; + __u32 kern_version; + + __u32 attach_btf_id; + __u32 attach_prog_fd; + __u32 attach_btf_obj_fd; + + const int *fd_array; + + /* .BTF.ext func info data */ + const void *func_info; + __u32 func_info_cnt; + __u32 func_info_rec_size; + + /* .BTF.ext line info data */ + const void *line_info; + __u32 line_info_cnt; + __u32 line_info_rec_size; + + /* verifier log options */ + __u32 log_level; + __u32 log_size; + char *log_buf; +}; +#define bpf_prog_load_opts__last_field log_buf + +LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + const struct bpf_prog_load_opts *opts); + +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + +/* Recommended log buffer size */ +#define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ + +struct bpf_btf_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* kernel log options */ + char *log_buf; + __u32 log_level; + __u32 log_size; +}; +#define bpf_btf_load_opts__last_field log_size + +LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, + const struct bpf_btf_load_opts *opts); + +LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); + +LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, + __u64 flags); +LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, + void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, + void *value, __u64 flags); +LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags); +LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); +LIBBPF_API int bpf_map_freeze(int fd); + +struct bpf_map_batch_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 elem_flags; + __u64 flags; +}; +#define bpf_map_batch_opts__last_field flags + + +/** + * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple + * elements in a BPF map. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param count input and output parameter; on input **count** represents the + * number of elements in the map to delete in batch; + * on output if a non-EFAULT error is returned, **count** represents the number of deleted + * elements if the output **count** value is not equal to the input **count** value + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch deletion works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. + * + * The parameter *in_batch* is the address of the first element in the batch to read. + * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent + * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate + * that the batched lookup starts from the beginning of the map. + * + * The *keys* and *values* are output parameters which must point to memory large enough to + * hold *count* items based on the key and value size of the map *map_fd*. The *keys* + * buffer must be of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * indicate that the batched lookup starts from the beginning of the map. + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array large enough for *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read in batch; on output it's the number of elements that were + * successfully read. + * If a non-EFAULT error is returned, count will be set as the number of elements + * that were read before the error occurred. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch lookup works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion + * of BPF map elements where each element is deleted after being retrieved. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * get address of the first element in *out_batch* + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array of *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read and delete in batch; on output it represents the number of + * elements that were successfully read and deleted + * If a non-**EFAULT** error code is returned and if the output **count** value + * is not equal to the input **count** value, up to **count** elements may + * have been deleted. + * if **EFAULT** is returned up to *count* elements may have been deleted without + * being returned via the *keys* and *values* output parameters. + * @param opts options for configuring the way the batch lookup and delete works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, + void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_update_batch()** updates multiple elements in a map + * by specifying keys and their corresponding values. + * + * The *keys* and *values* parameters must point to memory large enough + * to hold *count* items based on the key and value size of the map. + * + * The *opts* parameter can be used to control how *bpf_map_update_batch()* + * should handle keys that either do or do not already exist in the map. + * In particular the *flags* parameter of *bpf_map_batch_opts* can be + * one of the following: + * + * Note that *count* is an input and output parameter, where on output it + * represents how many elements were successfully updated. Also note that if + * **EFAULT** then *count* should not be trusted to be correct. + * + * **BPF_ANY** + * Create new elements or update existing. + * + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * + * **BPF_EXIST** + * Update existing elements. + * + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param values pointer to an array of *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to update in batch; on output if a non-EFAULT error is returned, + * **count** represents the number of updated elements if the output **count** + * value is not equal to the input **count** value. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch update works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +struct bpf_obj_get_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + + size_t :0; +}; +#define bpf_obj_get_opts__last_field file_flags + +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_obj_get_opts(const char *pathname, + const struct bpf_obj_get_opts *opts); + +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + unsigned int flags; + int replace_prog_fd; +}; +#define bpf_prog_attach_opts__last_field replace_prog_fd + +LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, + enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); +LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, + enum bpf_attach_type type); + +union bpf_iter_link_info; /* defined in up-to-date linux/bpf.h */ +struct bpf_link_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + union bpf_iter_link_info *iter_info; + __u32 iter_info_len; + __u32 target_btf_id; + union { + struct { + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + const char **syms; + const unsigned long *addrs; + const __u64 *cookies; + } kprobe_multi; + struct { + __u64 cookie; + } tracing; + }; + size_t :0; +}; +#define bpf_link_create_opts__last_field kprobe_multi.cookies + +LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts); + +LIBBPF_API int bpf_link_detach(int link_fd); + +struct bpf_link_update_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; /* extra flags */ + __u32 old_prog_fd; /* expected old program FD */ +}; +#define bpf_link_update_opts__last_field old_prog_fd + +LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts); + +LIBBPF_API int bpf_iter_create(int link_fd); + +struct bpf_prog_test_run_attr { + int prog_fd; + int repeat; + const void *data_in; + __u32 data_size_in; + void *data_out; /* optional */ + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out */ + __u32 retval; /* out: return code of the BPF program */ + __u32 duration; /* out: average per repetition in ns */ + const void *ctx_in; /* optional */ + __u32 ctx_size_in; + void *ctx_out; /* optional */ + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out */ +}; + +LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); + +struct bpf_get_fd_by_id_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 open_flags; /* permissions requested for the operation on fd */ + size_t :0; +}; +#define bpf_get_fd_by_id_opts__last_field open_flags + +LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + __u32 prog_cnt; /* input+output argument */ + __u32 *prog_attach_flags; +}; +#define bpf_prog_query_opts__last_field prog_attach_flags + +LIBBPF_API int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); +LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, + __u32 query_flags, __u32 *attach_flags, + __u32 *prog_ids, __u32 *prog_cnt); + +LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); +LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, + __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr); + +#ifdef __cplusplus +/* forward-declaring enums in C++ isn't compatible with pure C enums, so + * instead define bpf_enable_stats() as accepting int as an input + */ +LIBBPF_API int bpf_enable_stats(int type); +#else +enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ +LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +#endif + +struct bpf_prog_bind_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; +}; +#define bpf_prog_bind_opts__last_field flags + +LIBBPF_API int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts); + +struct bpf_test_run_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const void *data_in; /* optional */ + void *data_out; /* optional */ + __u32 data_size_in; + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out + */ + const void *ctx_in; /* optional */ + void *ctx_out; /* optional */ + __u32 ctx_size_in; + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out + */ + __u32 retval; /* out: return code of the BPF program */ + int repeat; + __u32 duration; /* out: average per repetition in ns */ + __u32 flags; + __u32 cpu; + __u32 batch_size; +}; +#define bpf_test_run_opts__last_field batch_size + +LIBBPF_API int bpf_prog_test_run_opts(int prog_fd, + struct bpf_test_run_opts *opts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BPF_H */ diff --git a/src/bpf_core_read.h b/src/bpf_core_read.h new file mode 100644 index 0000000..496e6a8 --- /dev/null +++ b/src/bpf_core_read.h @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_CORE_READ_H__ +#define __BPF_CORE_READ_H__ + +/* + * enum bpf_field_info_kind is passed as a second argument into + * __builtin_preserve_field_info() built-in to get a specific aspect of + * a field, captured as a first argument. __builtin_preserve_field_info(field, + * info_kind) returns __u32 integer and produces BTF field relocation, which + * is understood and processed by libbpf during BPF object loading. See + * selftests/bpf for examples. + */ +enum bpf_field_info_kind { + BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_FIELD_BYTE_SIZE = 1, + BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_FIELD_SIGNED = 3, + BPF_FIELD_LSHIFT_U64 = 4, + BPF_FIELD_RSHIFT_U64 = 5, +}; + +/* second argument to __builtin_btf_type_id() built-in */ +enum bpf_type_id_kind { + BPF_TYPE_ID_LOCAL = 0, /* BTF type ID in local program */ + BPF_TYPE_ID_TARGET = 1, /* BTF type ID in target kernel */ +}; + +/* second argument to __builtin_preserve_type_info() built-in */ +enum bpf_type_info_kind { + BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ + BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ +}; + +/* second argument to __builtin_preserve_enum_value() built-in */ +enum bpf_enum_value_kind { + BPF_ENUMVAL_EXISTS = 0, /* enum value existence in kernel */ + BPF_ENUMVAL_VALUE = 1, /* enum value value relocation */ +}; + +#define __CORE_RELO(src, field, info) \ + __builtin_preserve_field_info((src)->field, BPF_FIELD_##info) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst, \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#else +/* semantics of LSHIFT_64 assumes loading values into low-ordered bytes, so + * for big-endian we need to adjust destination pointer accordingly, based on + * field byte size + */ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst + (8 - __CORE_RELO(src, fld, BYTE_SIZE)), \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#endif + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * All this is done in relocatable manner, so bitfield changes such as + * signedness, bit size, offset changes, this will be handled automatically. + * This version of macro is using bpf_probe_read_kernel() to read underlying + * integer storage. Macro functions as an expression and its return type is + * bpf_probe_read_kernel()'s return value: 0, on success, <0 on error. + */ +#define BPF_CORE_READ_BITFIELD_PROBED(s, field) ({ \ + unsigned long long val = 0; \ + \ + __CORE_BITFIELD_PROBE_READ(&val, s, field); \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * This version of macro is using direct memory reads and should be used from + * BPF program types that support such functionality (e.g., typed raw + * tracepoints). + */ +#define BPF_CORE_READ_BITFIELD(s, field) ({ \ + const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ + unsigned long long val; \ + \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. \ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ + switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ + } \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +#define ___bpf_field_ref1(field) (field) +#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field) +#define ___bpf_field_ref(args...) \ + ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) + +/* + * Convenience macro to check that field actually exists in target kernel's. + * Returns: + * 1, if matching field is present in target kernel; + * 0, if no matching field found. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_exists(p->my_field); + * - field reference through type and field names: + * bpf_core_field_exists(struct my_type, my_field). + */ +#define bpf_core_field_exists(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_EXISTS) + +/* + * Convenience macro to get the byte size of a field. Works for integers, + * struct/unions, pointers, arrays, and enums. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_size(p->my_field); + * - field reference through type and field names: + * bpf_core_field_size(struct my_type, my_field). + */ +#define bpf_core_field_size(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_SIZE) + +/* + * Convenience macro to get field's byte offset. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_offset(p->my_field); + * - field reference through type and field names: + * bpf_core_field_offset(struct my_type, my_field). + */ +#define bpf_core_field_offset(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_OFFSET) + +/* + * Convenience macro to get BTF type ID of a specified type, using a local BTF + * information. Return 32-bit unsigned integer with type ID from program's own + * BTF. Always succeeds. + */ +#define bpf_core_type_id_local(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_LOCAL) + +/* + * Convenience macro to get BTF type ID of a target kernel's type that matches + * specified local type. + * Returns: + * - valid 32-bit unsigned type ID in kernel BTF; + * - 0, if no matching type was found in a target kernel BTF. + */ +#define bpf_core_type_id_kernel(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) exists in a target kernel. + * Returns: + * 1, if such type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_exists(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + +/* + * Convenience macro to get the byte size of a provided named type + * (struct/union/enum/typedef) in a target kernel. + * Returns: + * >= 0 size (in bytes), if type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_size(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_SIZE) + +/* + * Convenience macro to check that provided enumerator value is defined in + * a target kernel. + * Returns: + * 1, if specified enum type and its enumerator value are present in target + * kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) + +/* + * Convenience macro to get the integer value of an enumerator value in + * a target kernel. + * Returns: + * 64-bit value, if specified enum type and its enumerator value are + * present in target kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) + +/* + * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures + * offset relocation for source address using __builtin_preserve_access_index() + * built-in, provided by Clang. + * + * __builtin_preserve_access_index() takes as an argument an expression of + * taking an address of a field within struct/union. It makes compiler emit + * a relocation, which records BTF type ID describing root struct/union and an + * accessor string which describes exact embedded field that was used to take + * an address. See detailed description of this relocation format and + * semantics in comments to struct bpf_field_reloc in libbpf_internal.h. + * + * This relocation allows libbpf to adjust BPF instruction to use correct + * actual field offset, based on target kernel BTF type that matches original + * (local) BTF, used to record relocation. + */ +#define bpf_core_read(dst, sz, src) \ + bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user(dst, sz, src) \ + bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* + * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str() + * additionally emitting BPF CO-RE field relocation for specified source + * argument. + */ +#define bpf_core_read_str(dst, sz, src) \ + bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user_str(dst, sz, src) \ + bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +#define ___concat(a, b) a ## b +#define ___apply(fn, n) ___concat(fn, n) +#define ___nth(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, __11, N, ...) N + +/* + * return number of provided arguments; used for switch-based variadic macro + * definitions (see ___last, ___arrow, etc below) + */ +#define ___narg(...) ___nth(_, ##__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +/* + * return 0 if no arguments are passed, N - otherwise; used for + * recursively-defined macros to specify termination (0) case, and generic + * (N) case (e.g., ___read_ptrs, ___core_read) + */ +#define ___empty(...) ___nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) + +#define ___last1(x) x +#define ___last2(a, x) x +#define ___last3(a, b, x) x +#define ___last4(a, b, c, x) x +#define ___last5(a, b, c, d, x) x +#define ___last6(a, b, c, d, e, x) x +#define ___last7(a, b, c, d, e, f, x) x +#define ___last8(a, b, c, d, e, f, g, x) x +#define ___last9(a, b, c, d, e, f, g, h, x) x +#define ___last10(a, b, c, d, e, f, g, h, i, x) x +#define ___last(...) ___apply(___last, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___nolast2(a, _) a +#define ___nolast3(a, b, _) a, b +#define ___nolast4(a, b, c, _) a, b, c +#define ___nolast5(a, b, c, d, _) a, b, c, d +#define ___nolast6(a, b, c, d, e, _) a, b, c, d, e +#define ___nolast7(a, b, c, d, e, f, _) a, b, c, d, e, f +#define ___nolast8(a, b, c, d, e, f, g, _) a, b, c, d, e, f, g +#define ___nolast9(a, b, c, d, e, f, g, h, _) a, b, c, d, e, f, g, h +#define ___nolast10(a, b, c, d, e, f, g, h, i, _) a, b, c, d, e, f, g, h, i +#define ___nolast(...) ___apply(___nolast, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___arrow1(a) a +#define ___arrow2(a, b) a->b +#define ___arrow3(a, b, c) a->b->c +#define ___arrow4(a, b, c, d) a->b->c->d +#define ___arrow5(a, b, c, d, e) a->b->c->d->e +#define ___arrow6(a, b, c, d, e, f) a->b->c->d->e->f +#define ___arrow7(a, b, c, d, e, f, g) a->b->c->d->e->f->g +#define ___arrow8(a, b, c, d, e, f, g, h) a->b->c->d->e->f->g->h +#define ___arrow9(a, b, c, d, e, f, g, h, i) a->b->c->d->e->f->g->h->i +#define ___arrow10(a, b, c, d, e, f, g, h, i, j) a->b->c->d->e->f->g->h->i->j +#define ___arrow(...) ___apply(___arrow, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___type(...) typeof(___arrow(__VA_ARGS__)) + +#define ___read(read_fn, dst, src_type, src, accessor) \ + read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor) + +/* "recursively" read a sequence of inner pointers using local __t var */ +#define ___rd_first(fn, src, a) ___read(fn, &__t, ___type(src), src, a); +#define ___rd_last(fn, ...) \ + ___read(fn, &__t, ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); +#define ___rd_p1(fn, ...) const void *__t; ___rd_first(fn, __VA_ARGS__) +#define ___rd_p2(fn, ...) ___rd_p1(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p3(fn, ...) ___rd_p2(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p4(fn, ...) ___rd_p3(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p5(fn, ...) ___rd_p4(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p6(fn, ...) ___rd_p5(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p7(fn, ...) ___rd_p6(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p8(fn, ...) ___rd_p7(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p9(fn, ...) ___rd_p8(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___read_ptrs(fn, src, ...) \ + ___apply(___rd_p, ___narg(__VA_ARGS__))(fn, src, __VA_ARGS__) + +#define ___core_read0(fn, fn_ptr, dst, src, a) \ + ___read(fn, dst, ___type(src), src, a); +#define ___core_readN(fn, fn_ptr, dst, src, ...) \ + ___read_ptrs(fn_ptr, src, ___nolast(__VA_ARGS__)) \ + ___read(fn, dst, ___type(src, ___nolast(__VA_ARGS__)), __t, \ + ___last(__VA_ARGS__)); +#define ___core_read(fn, fn_ptr, dst, src, a, ...) \ + ___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \ + src, a, ##__VA_ARGS__) + +/* + * BPF_CORE_READ_INTO() is a more performance-conscious variant of + * BPF_CORE_READ(), in which final field is read into user-provided storage. + * See BPF_CORE_READ() below for more details on general usage. + */ +#define BPF_CORE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_INTO() */ +#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read, bpf_probe_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as + * BPF_CORE_READ() for intermediate pointers, but then executes (and returns + * corresponding error code) bpf_core_read_str() for final string read. + */ +#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_str, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ +#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_str, bpf_probe_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially + * when there are few pointer chasing steps. + * E.g., what in non-BPF world (or in BPF w/ BCC) would be something like: + * int x = s->a.b.c->d.e->f->g; + * can be succinctly achieved using BPF_CORE_READ as: + * int x = BPF_CORE_READ(s, a.b.c, d.e, f, g); + * + * BPF_CORE_READ will decompose above statement into 4 bpf_core_read (BPF + * CO-RE relocatable bpf_probe_read_kernel() wrapper) calls, logically + * equivalent to: + * 1. const void *__t = s->a.b.c; + * 2. __t = __t->d.e; + * 3. __t = __t->f; + * 4. return __t->g; + * + * Equivalence is logical, because there is a heavy type casting/preservation + * involved, as well as all the reads are happening through + * bpf_probe_read_kernel() calls using __builtin_preserve_access_index() to + * emit CO-RE relocations. + * + * N.B. Only up to 9 "field accessors" are supported, which should be more + * than enough for any practical purpose. + */ +#define BPF_CORE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Variant of BPF_CORE_READ() for reading from user-space memory. + * + * NOTE: all the source types involved are still *kernel types* and need to + * exist in kernel (or kernel module) BTF, otherwise CO-RE relocation will + * fail. Custom user types are not relocatable with CO-RE. + * The typical situation in which BPF_CORE_READ_USER() might be used is to + * read kernel UAPI types from the user-space memory passed in as a syscall + * input argument. + */ +#define BPF_CORE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ() */ +#define BPF_PROBE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +#endif + diff --git a/src/bpf_endian.h b/src/bpf_endian.h new file mode 100644 index 0000000..ec9db4f --- /dev/null +++ b/src/bpf_endian.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_ENDIAN__ +#define __BPF_ENDIAN__ + +/* + * Isolate byte #n and put it into byte #m, for __u##b type. + * E.g., moving byte #6 (nnnnnnnn) into byte #1 (mmmmmmmm) for __u64: + * 1) xxxxxxxx nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx + * 2) nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 00000000 + * 3) 00000000 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn + * 4) 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 00000000 + */ +#define ___bpf_mvb(x, b, n, m) ((__u##b)(x) << (b-(n+1)*8) >> (b-8) << (m*8)) + +#define ___bpf_swab16(x) ((__u16)( \ + ___bpf_mvb(x, 16, 0, 1) | \ + ___bpf_mvb(x, 16, 1, 0))) + +#define ___bpf_swab32(x) ((__u32)( \ + ___bpf_mvb(x, 32, 0, 3) | \ + ___bpf_mvb(x, 32, 1, 2) | \ + ___bpf_mvb(x, 32, 2, 1) | \ + ___bpf_mvb(x, 32, 3, 0))) + +#define ___bpf_swab64(x) ((__u64)( \ + ___bpf_mvb(x, 64, 0, 7) | \ + ___bpf_mvb(x, 64, 1, 6) | \ + ___bpf_mvb(x, 64, 2, 5) | \ + ___bpf_mvb(x, 64, 3, 4) | \ + ___bpf_mvb(x, 64, 4, 3) | \ + ___bpf_mvb(x, 64, 5, 2) | \ + ___bpf_mvb(x, 64, 6, 1) | \ + ___bpf_mvb(x, 64, 7, 0))) + +/* LLVM's BPF target selects the endianness of the CPU + * it compiles on, or the user specifies (bpfel/bpfeb), + * respectively. The used __BYTE_ORDER__ is defined by + * the compiler, we cannot rely on __BYTE_ORDER from + * libc headers, since it doesn't reflect the actual + * requested byte order. + * + * Note, LLVM's BPF target has different __builtin_bswapX() + * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE + * in bpfel and bpfeb case, which means below, that we map + * to cpu_to_be16(). We could use it unconditionally in BPF + * case, but better not rely on it, so that this header here + * can be used from application and BPF program side, which + * use different targets. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define __bpf_ntohs(x) __builtin_bswap16(x) +# define __bpf_htons(x) __builtin_bswap16(x) +# define __bpf_constant_ntohs(x) ___bpf_swab16(x) +# define __bpf_constant_htons(x) ___bpf_swab16(x) +# define __bpf_ntohl(x) __builtin_bswap32(x) +# define __bpf_htonl(x) __builtin_bswap32(x) +# define __bpf_constant_ntohl(x) ___bpf_swab32(x) +# define __bpf_constant_htonl(x) ___bpf_swab32(x) +# define __bpf_be64_to_cpu(x) __builtin_bswap64(x) +# define __bpf_cpu_to_be64(x) __builtin_bswap64(x) +# define __bpf_constant_be64_to_cpu(x) ___bpf_swab64(x) +# define __bpf_constant_cpu_to_be64(x) ___bpf_swab64(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define __bpf_ntohs(x) (x) +# define __bpf_htons(x) (x) +# define __bpf_constant_ntohs(x) (x) +# define __bpf_constant_htons(x) (x) +# define __bpf_ntohl(x) (x) +# define __bpf_htonl(x) (x) +# define __bpf_constant_ntohl(x) (x) +# define __bpf_constant_htonl(x) (x) +# define __bpf_be64_to_cpu(x) (x) +# define __bpf_cpu_to_be64(x) (x) +# define __bpf_constant_be64_to_cpu(x) (x) +# define __bpf_constant_cpu_to_be64(x) (x) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#define bpf_htons(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htons(x) : __bpf_htons(x)) +#define bpf_ntohs(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohs(x) : __bpf_ntohs(x)) +#define bpf_htonl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htonl(x) : __bpf_htonl(x)) +#define bpf_ntohl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohl(x) : __bpf_ntohl(x)) +#define bpf_cpu_to_be64(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x)) +#define bpf_be64_to_cpu(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x)) + +#endif /* __BPF_ENDIAN__ */ diff --git a/src/bpf_gen_internal.h b/src/bpf_gen_internal.h new file mode 100644 index 0000000..2233089 --- /dev/null +++ b/src/bpf_gen_internal.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __BPF_GEN_INTERNAL_H +#define __BPF_GEN_INTERNAL_H + +#include "bpf.h" + +struct ksym_relo_desc { + const char *name; + int kind; + int insn_idx; + bool is_weak; + bool is_typeless; +}; + +struct ksym_desc { + const char *name; + int ref; + int kind; + union { + /* used for kfunc */ + int off; + /* used for typeless ksym */ + bool typeless; + }; + int insn; +}; + +struct bpf_gen { + struct gen_loader_opts *opts; + void *data_start; + void *data_cur; + void *insn_start; + void *insn_cur; + ssize_t cleanup_label; + __u32 nr_progs; + __u32 nr_maps; + int log_level; + int error; + struct ksym_relo_desc *relos; + int relo_cnt; + struct bpf_core_relo *core_relos; + int core_relo_cnt; + char attach_target[128]; + int attach_kind; + struct ksym_desc *ksyms; + __u32 nr_ksyms; + int fd_array; + int nr_fd_array; +}; + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps); +void bpf_gen__free(struct bpf_gen *gen); +void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size); +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx); +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx); +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u32 value_size); +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, int kind, int insn_idx); +void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); + +#endif diff --git a/src/bpf_helper_defs.h b/src/bpf_helper_defs.h new file mode 100644 index 0000000..9bcaca1 --- /dev/null +++ b/src/bpf_helper_defs.h @@ -0,0 +1,4718 @@ +/* This is auto-generated file. See bpf_doc.py for details. */ + +/* Forward declarations of BPF structs */ +struct bpf_fib_lookup; +struct bpf_sk_lookup; +struct bpf_perf_event_data; +struct bpf_perf_event_value; +struct bpf_pidns_info; +struct bpf_redir_neigh; +struct bpf_sock; +struct bpf_sock_addr; +struct bpf_sock_ops; +struct bpf_sock_tuple; +struct bpf_spin_lock; +struct bpf_sysctl; +struct bpf_tcp_sock; +struct bpf_tunnel_key; +struct bpf_xfrm_state; +struct linux_binprm; +struct pt_regs; +struct sk_reuseport_md; +struct sockaddr; +struct tcphdr; +struct seq_file; +struct tcp6_sock; +struct tcp_sock; +struct tcp_timewait_sock; +struct tcp_request_sock; +struct udp6_sock; +struct unix_sock; +struct task_struct; +struct cgroup; +struct __sk_buff; +struct sk_msg_md; +struct xdp_md; +struct path; +struct btf_ptr; +struct inode; +struct socket; +struct file; +struct bpf_timer; +struct mptcp_sock; +struct bpf_dynptr; +struct iphdr; +struct ipv6hdr; + +/* + * bpf_map_lookup_elem + * + * Perform a lookup in *map* for an entry associated to *key*. + * + * Returns + * Map value associated to *key*, or **NULL** if no entry was + * found. + */ +static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; + +/* + * bpf_map_update_elem + * + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; + +/* + * bpf_map_delete_elem + * + * Delete entry with *key* from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; + +/* + * bpf_probe_read + * + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; + +/* + * bpf_ktime_get_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; + +/* + * bpf_trace_printk + * + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: <formatted msg> + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ``<formatted msg>`` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; + +/* + * bpf_get_prandom_u32 + * + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * + * Returns + * A random 32-bit unsigned value. + */ +static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; + +/* + * bpf_get_smp_processor_id + * + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * + * Returns + * The SMP id of the processor running the program. + */ +static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; + +/* + * bpf_skb_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; + +/* + * bpf_l3_csum_replace + * + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; + +/* + * bpf_l4_csum_replace + * + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; + +/* + * bpf_tail_call + * + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; + +/* + * bpf_clone_redirect + * + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; + +/* + * bpf_get_current_pid_tgid + * + * Get the current pid and tgid. + * + * Returns + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + */ +static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; + +/* + * bpf_get_current_uid_gid + * + * Get the current uid and gid. + * + * Returns + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + */ +static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; + +/* + * bpf_get_current_comm + * + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; + +/* + * bpf_get_cgroup_classid + * + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * + * Returns + * The classid, or 0 for the default unconfigured classid. + */ +static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; + +/* + * bpf_skb_vlan_push + * + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; + +/* + * bpf_skb_vlan_pop + * + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; + +/* + * bpf_skb_get_tunnel_key + * + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; + +/* + * bpf_skb_set_tunnel_key + * + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; + +/* + * bpf_perf_event_read + * + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * + * Returns + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + */ +static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; + +/* + * bpf_redirect + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * + * Returns + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + */ +static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; + +/* + * bpf_get_route_realm + * + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * + * Returns + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + */ +static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; + +/* + * bpf_perf_event_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; + +/* + * bpf_skb_load_bytes + * + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; + +/* + * bpf_get_stackid + * + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * + * Returns + * The positive or null stack id on success, or a negative error + * in case of failure. + */ +static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; + +/* + * bpf_csum_diff + * + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * + * Returns + * The checksum result, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; + +/* + * bpf_skb_get_tunnel_opt + * + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * + * Returns + * The size of the option data retrieved. + */ +static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; + +/* + * bpf_skb_set_tunnel_opt + * + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; + +/* + * bpf_skb_change_proto + * + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; + +/* + * bpf_skb_change_type + * + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; + +/* + * bpf_skb_under_cgroup + * + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + */ +static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; + +/* + * bpf_get_hash_recalc + * + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * + * Returns + * The 32-bit hash. + */ +static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; + +/* + * bpf_get_current_task + * + * Get the current task. + * + * Returns + * A pointer to the current task struct. + */ +static __u64 (*bpf_get_current_task)(void) = (void *) 35; + +/* + * bpf_probe_write_user + * + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; + +/* + * bpf_current_task_under_cgroup + * + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + */ +static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; + +/* + * bpf_skb_change_tail + * + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; + +/* + * bpf_skb_pull_data + * + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; + +/* + * bpf_csum_update + * + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * + * Returns + * The checksum on success, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; + +/* + * bpf_set_hash_invalid + * + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * Returns + * void. + */ +static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; + +/* + * bpf_get_numa_node_id + * + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * + * Returns + * The id of current NUMA node. + */ +static long (*bpf_get_numa_node_id)(void) = (void *) 42; + +/* + * bpf_skb_change_head + * + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; + +/* + * bpf_xdp_adjust_head + * + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; + +/* + * bpf_probe_read_str + * + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; + +/* + * bpf_get_socket_cookie + * + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * + * Returns + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + */ +static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; + +/* + * bpf_get_socket_uid + * + * Get the owner UID of the socked associated to *skb*. + * + * Returns + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + */ +static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; + +/* + * bpf_set_hash + * + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * + * Returns + * 0 + */ +static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; + +/* + * bpf_setsockopt + * + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; + +/* + * bpf_skb_adjust_room + * + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; + +/* + * bpf_redirect_map + * + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * + * Returns + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + */ +static long (*bpf_redirect_map)(void *map, __u64 key, __u64 flags) = (void *) 51; + +/* + * bpf_sk_redirect_map + * + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; + +/* + * bpf_sock_map_update + * + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; + +/* + * bpf_xdp_adjust_meta + * + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; + +/* + * bpf_perf_event_read_value + * + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; + +/* + * bpf_perf_prog_read_value + * + * For en eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; + +/* + * bpf_getsockopt + * + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; + +/* + * bpf_override_return + * + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * + * Returns + * 0 + */ +static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; + +/* + * bpf_sock_ops_cb_flags_set + * + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * + * Returns + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + */ +static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; + +/* + * bpf_msg_redirect_map + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; + +/* + * bpf_msg_apply_bytes + * + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * + * Returns + * 0 + */ +static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; + +/* + * bpf_msg_cork_bytes + * + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * + * Returns + * 0 + */ +static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; + +/* + * bpf_msg_pull_data + * + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; + +/* + * bpf_bind + * + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; + +/* + * bpf_xdp_adjust_tail + * + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; + +/* + * bpf_skb_get_xfrm_state + * + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; + +/* + * bpf_get_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; + +/* + * bpf_skb_load_bytes_relative + * + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; + +/* + * bpf_fib_lookup + * + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * + * Returns + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + */ +static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; + +/* + * bpf_sock_hash_update + * + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; + +/* + * bpf_msg_redirect_hash + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; + +/* + * bpf_sk_redirect_hash + * + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; + +/* + * bpf_lwt_push_encap + * + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; + +/* + * bpf_lwt_seg6_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; + +/* + * bpf_lwt_seg6_adjust_srh + * + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; + +/* + * bpf_lwt_seg6_action + * + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; + +/* + * bpf_rc_repeat + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; + +/* + * bpf_rc_keydown + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; + +/* + * bpf_skb_cgroup_id + * + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; + +/* + * bpf_get_current_cgroup_id + * + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * + * Returns + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + */ +static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; + +/* + * bpf_get_local_storage + * + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * + * Returns + * A pointer to the local storage area. + */ +static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; + +/* + * bpf_sk_select_reuseport + * + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; + +/* + * bpf_skb_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; + +/* + * bpf_sk_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; + +/* + * bpf_sk_lookup_udp + * + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; + +/* + * bpf_sk_release + * + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_release)(void *sock) = (void *) 86; + +/* + * bpf_map_push_elem + * + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; + +/* + * bpf_map_pop_elem + * + * Pop an element from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; + +/* + * bpf_map_peek_elem + * + * Get an element from *map* without removing it. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; + +/* + * bpf_msg_push_data + * + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; + +/* + * bpf_msg_pop_data + * + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; + +/* + * bpf_rc_pointer_rel + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; + +/* + * bpf_spin_lock + * + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * + * Returns + * 0 + */ +static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; + +/* + * bpf_spin_unlock + * + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * + * Returns + * 0 + */ +static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; + +/* + * bpf_sk_fullsock + * + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; + +/* + * bpf_tcp_sock + * + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Returns + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; + +/* + * bpf_skb_ecn_set_ce + * + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * + * Returns + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + */ +static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; + +/* + * bpf_get_listener_sock + * + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; + +/* + * bpf_skc_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; + +/* + * bpf_tcp_check_syncookie + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + */ +static long (*bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; + +/* + * bpf_sysctl_get_name + * + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + */ +static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; + +/* + * bpf_sysctl_get_current_value + * + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + */ +static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; + +/* + * bpf_sysctl_get_new_value + * + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; + +/* + * bpf_sysctl_set_new_value + * + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * + * Returns + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; + +/* + * bpf_strtol + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; + +/* + * bpf_strtoul + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; + +/* + * bpf_sk_storage_get + * + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * + * Returns + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + */ +static void *(*bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags) = (void *) 107; + +/* + * bpf_sk_storage_delete + * + * Delete a bpf-local-storage from a *sk*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + */ +static long (*bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; + +/* + * bpf_send_signal + * + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + */ +static long (*bpf_send_signal)(__u32 sig) = (void *) 109; + +/* + * bpf_tcp_gen_syncookie + * + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + */ +static __s64 (*bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; + +/* + * bpf_skb_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; + +/* + * bpf_probe_read_user + * + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; + +/* + * bpf_probe_read_kernel + * + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; + +/* + * bpf_probe_read_user_str + * + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * + * Returns + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; + +/* + * bpf_probe_read_kernel_str + * + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * + * Returns + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + */ +static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; + +/* + * bpf_tcp_send_ack + * + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; + +/* + * bpf_send_signal_thread + * + * Send signal *sig* to the thread corresponding to the current task. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + */ +static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; + +/* + * bpf_jiffies64 + * + * Obtain the 64bit jiffies + * + * Returns + * The 64 bit jiffies + */ +static __u64 (*bpf_jiffies64)(void) = (void *) 118; + +/* + * bpf_read_branch_records + * + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + */ +static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; + +/* + * bpf_get_ns_current_pid_tgid + * + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * Returns + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + */ +static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; + +/* + * bpf_xdp_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; + +/* + * bpf_get_netns_cookie + * + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * + * Returns + * A 8-byte long opaque number. + */ +static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; + +/* + * bpf_get_current_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; + +/* + * bpf_sk_assign + * + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + */ +static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; + +/* + * bpf_ktime_get_boot_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; + +/* + * bpf_seq_printf + * + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; + +/* + * bpf_seq_write + * + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; + +/* + * bpf_sk_cgroup_id + * + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_cgroup_id)(void *sk) = (void *) 128; + +/* + * bpf_sk_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void *) 129; + +/* + * bpf_ringbuf_output + * + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; + +/* + * bpf_ringbuf_reserve + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * + * Returns + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + */ +static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; + +/* + * bpf_ringbuf_submit + * + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; + +/* + * bpf_ringbuf_discard + * + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; + +/* + * bpf_ringbuf_query + * + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * + * Returns + * Requested value, or 0, if *flags* are not recognized. + */ +static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; + +/* + * bpf_csum_level + * + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * + * Returns + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + */ +static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; + +/* + * bpf_skc_to_tcp6_sock + * + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; + +/* + * bpf_skc_to_tcp_sock + * + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; + +/* + * bpf_skc_to_tcp_timewait_sock + * + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; + +/* + * bpf_skc_to_tcp_request_sock + * + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; + +/* + * bpf_skc_to_udp6_sock + * + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; + +/* + * bpf_get_task_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; + +/* + * bpf_load_hdr_opt + * + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * + * Returns + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, __u32 len, __u64 flags) = (void *) 142; + +/* + * bpf_store_hdr_opt + * + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, __u32 len, __u64 flags) = (void *) 143; + +/* + * bpf_reserve_hdr_opt + * + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 flags) = (void *) 144; + +/* + * bpf_inode_storage_get + * + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 flags) = (void *) 145; + +/* + * bpf_inode_storage_delete + * + * Delete a bpf_local_storage from an *inode*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static int (*bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; + +/* + * bpf_d_path + * + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147; + +/* + * bpf_copy_from_user + * + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = (void *) 148; + +/* + * bpf_snprintf_btf + * + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * + * Returns + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + */ +static long (*bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, __u32 btf_ptr_size, __u64 flags) = (void *) 149; + +/* + * bpf_seq_printf_btf + * + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * + * Returns + * 0 on success or a negative error in case of failure. + */ +static long (*bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 ptr_size, __u64 flags) = (void *) 150; + +/* + * bpf_skb_cgroup_classid + * + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; + +/* + * bpf_redirect_neigh + * + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + */ +static long (*bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, int plen, __u64 flags) = (void *) 152; + +/* + * bpf_per_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * + * Returns + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + */ +static void *(*bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 153; + +/* + * bpf_this_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * + * Returns + * A pointer pointing to the kernel percpu variable on this cpu. + */ +static void *(*bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; + +/* + * bpf_redirect_peer + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + */ +static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; + +/* + * bpf_task_storage_get + * + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_task_storage_get)(void *map, struct task_struct *task, void *value, __u64 flags) = (void *) 156; + +/* + * bpf_task_storage_delete + * + * Delete a bpf_local_storage from a *task*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_task_storage_delete)(void *map, struct task_struct *task) = (void *) 157; + +/* + * bpf_get_current_task_btf + * + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * + * Returns + * Pointer to the current task. + */ +static struct task_struct *(*bpf_get_current_task_btf)(void) = (void *) 158; + +/* + * bpf_bprm_opts_set + * + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * + * Returns + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + */ +static long (*bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void *) 159; + +/* + * bpf_ktime_get_coarse_ns + * + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_coarse_ns)(void) = (void *) 160; + +/* + * bpf_ima_inode_hash + * + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + */ +static long (*bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161; + +/* + * bpf_sock_from_file + * + * If the given file represents a socket, returns the associated + * socket. + * + * Returns + * A pointer to a struct socket on success or NULL if the file is + * not a socket. + */ +static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; + +/* + * bpf_check_mtu + * + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * + * Returns + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + */ +static long (*bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len_diff, __u64 flags) = (void *) 163; + +/* + * bpf_for_each_map_elem + * + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * + * Returns + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + */ +static long (*bpf_for_each_map_elem)(void *map, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 164; + +/* + * bpf_snprintf + * + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * + * Returns + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + */ +static long (*bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *data, __u32 data_len) = (void *) 165; + +/* + * bpf_sys_bpf + * + * Execute bpf syscall with given arguments. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 166; + +/* + * bpf_btf_find_by_name_kind + * + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * + * Returns + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + */ +static long (*bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, int flags) = (void *) 167; + +/* + * bpf_sys_close + * + * Execute close syscall for given FD. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_close)(__u32 fd) = (void *) 168; + +/* + * bpf_timer_init + * + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * + * Returns + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = (void *) 169; + +/* + * bpf_timer_set_callback + * + * Configure the timer to call *callback_fn* static function. + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn) = (void *) 170; + +/* + * bpf_timer_start + * + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + */ +static long (*bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags) = (void *) 171; + +/* + * bpf_timer_cancel + * + * Cancel the timer and wait for callback_fn to finish if it was running. + * + * Returns + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + */ +static long (*bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; + +/* + * bpf_get_func_ip + * + * Get address of the traced function (for tracing and kprobe programs). + * + * Returns + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). + */ +static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; + +/* + * bpf_get_attach_cookie + * + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * + * Returns + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + */ +static __u64 (*bpf_get_attach_cookie)(void *ctx) = (void *) 174; + +/* + * bpf_task_pt_regs + * + * Get the struct pt_regs associated with **task**. + * + * Returns + * A pointer to struct pt_regs. + */ +static long (*bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; + +/* + * bpf_get_branch_snapshot + * + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + */ +static long (*bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = (void *) 176; + +/* + * bpf_trace_vprintk + * + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 177; + +/* + * bpf_skc_to_unix_sock + * + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct unix_sock *(*bpf_skc_to_unix_sock)(void *sk) = (void *) 178; + +/* + * bpf_kallsyms_lookup_name + * + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * + * Returns + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + */ +static long (*bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags, __u64 *res) = (void *) 179; + +/* + * bpf_find_vma + * + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * + * Returns + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. + */ +static long (*bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 180; + +/* + * bpf_loop + * + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * + * Returns + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + */ +static long (*bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 181; + +/* + * bpf_strncmp + * + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * + * Returns + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + */ +static long (*bpf_strncmp)(const char *s1, __u32 s1_sz, const char *s2) = (void *) 182; + +/* + * bpf_get_func_arg + * + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * + * Returns + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + */ +static long (*bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183; + +/* + * bpf_get_func_ret + * + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * + * Returns + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + */ +static long (*bpf_get_func_ret)(void *ctx, __u64 *value) = (void *) 184; + +/* + * bpf_get_func_arg_cnt + * + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * + * Returns + * The number of argument registers of the traced function. + */ +static long (*bpf_get_func_arg_cnt)(void *ctx) = (void *) 185; + +/* + * bpf_get_retval + * + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Returns + * The BPF program's return value. + */ +static int (*bpf_get_retval)(void) = (void *) 186; + +/* + * bpf_set_retval + * + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static int (*bpf_set_retval)(int retval) = (void *) 187; + +/* + * bpf_xdp_get_buff_len + * + * Get the total size of a given xdp buff (linear and paged area) + * + * Returns + * The total size of a given xdp buffer. + */ +static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; + +/* + * bpf_xdp_load_bytes + * + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; + +/* + * bpf_xdp_store_bytes + * + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; + +/* + * bpf_copy_from_user_task + * + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * + * Returns + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + */ +static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr, struct task_struct *tsk, __u64 flags) = (void *) 191; + +/* + * bpf_skb_set_tstamp + * + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * + * Returns + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + */ +static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) = (void *) 192; + +/* + * bpf_ima_file_hash + * + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + */ +static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193; + +/* + * bpf_kptr_xchg + * + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * + * Returns + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + */ +static void *(*bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194; + +/* + * bpf_map_lookup_percpu_elem + * + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * + * Returns + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + */ +static void *(*bpf_map_lookup_percpu_elem)(void *map, const void *key, __u32 cpu) = (void *) 195; + +/* + * bpf_skc_to_mptcp_sock + * + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct mptcp_sock *(*bpf_skc_to_mptcp_sock)(void *sk) = (void *) 196; + +/* + * bpf_dynptr_from_mem + * + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + */ +static long (*bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197; + +/* + * bpf_ringbuf_reserve_dynptr + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_reserve_dynptr)(void *ringbuf, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 198; + +/* + * bpf_ringbuf_submit_dynptr + * + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 199; + +/* + * bpf_ringbuf_discard_dynptr + * + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 200; + +/* + * bpf_dynptr_read + * + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + */ +static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201; + +/* + * bpf_dynptr_write + * + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not 0. + */ +static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202; + +/* + * bpf_dynptr_data + * + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * Returns + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + */ +static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203; + +/* + * bpf_tcp_raw_gen_syncookie_ipv4 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 204; + +/* + * bpf_tcp_raw_gen_syncookie_ipv6 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 205; + +/* + * bpf_tcp_raw_check_syncookie_ipv4 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th) = (void *) 206; + +/* + * bpf_tcp_raw_check_syncookie_ipv6 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th) = (void *) 207; + +/* + * bpf_ktime_get_tai_ns + * + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_tai_ns)(void) = (void *) 208; + +/* + * bpf_user_ringbuf_drain + * + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * + * Returns + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + */ +static long (*bpf_user_ringbuf_drain)(void *map, void *callback_fn, void *ctx, __u64 flags) = (void *) 209; + +/* + * bpf_cgrp_storage_get + * + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_cgrp_storage_get)(void *map, struct cgroup *cgroup, void *value, __u64 flags) = (void *) 210; + +/* + * bpf_cgrp_storage_delete + * + * Delete a bpf_local_storage from a *cgroup*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_cgrp_storage_delete)(void *map, struct cgroup *cgroup) = (void *) 211; + + diff --git a/src/bpf_helpers.h b/src/bpf_helpers.h new file mode 100644 index 0000000..d37c4fe --- /dev/null +++ b/src/bpf_helpers.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_HELPERS__ +#define __BPF_HELPERS__ + +/* + * Note that bpf programs need to include either + * vmlinux.h (auto-generated from BTF) or linux/types.h + * in advance since bpf_helper_defs.h uses such types + * as __u64. + */ +#include "bpf_helper_defs.h" + +#define __uint(name, val) int (*name)[val] +#define __type(name, val) typeof(val) *name +#define __array(name, val) typeof(val) *name[] + +/* + * Helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. + */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ + +#endif + +/* Avoid 'linux/stddef.h' definition of '__always_inline'. */ +#undef __always_inline +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __noinline +#define __noinline __attribute__((noinline)) +#endif +#ifndef __weak +#define __weak __attribute__((weak)) +#endif + +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) +#endif + +/* + * Helper macros to manipulate data structures + */ +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) +#endif +#ifndef container_of +#define container_of(ptr, type, member) \ + ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* + * Compiler (optimization) barrier. + */ +#ifndef barrier +#define barrier() asm volatile("" ::: "memory") +#endif + +/* Variable-specific compiler (optimization) barrier. It's a no-op which makes + * compiler believe that there is some black box modification of a given + * variable and thus prevents compiler from making extra assumption about its + * value and potential simplifications and optimizations on this variable. + * + * E.g., compiler might often delay or even omit 32-bit to 64-bit casting of + * a variable, making some code patterns unverifiable. Putting barrier_var() + * in place will ensure that cast is performed before the barrier_var() + * invocation, because compiler has to pessimistically assume that embedded + * asm section might perform some extra operations on that variable. + * + * This is a variable-specific variant of more global barrier(). + */ +#ifndef barrier_var +#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) +#endif + +/* + * Helper macro to throw a compilation error if __bpf_unreachable() gets + * built into the resulting code. This works given BPF back end does not + * implement __builtin_trap(). This is useful to assert that certain paths + * of the program code are never used and hence eliminated by the compiler. + * + * For example, consider a switch statement that covers known cases used by + * the program. __bpf_unreachable() can then reside in the default case. If + * the program gets extended such that a case is not covered in the switch + * statement, then it will throw a build error due to the default case not + * being compiled out. + */ +#ifndef __bpf_unreachable +# define __bpf_unreachable() __builtin_trap() +#endif + +/* + * Helper function to perform a tail call with a constant/immediate map slot. + */ +#if __clang_major__ >= 8 && defined(__bpf__) +static __always_inline void +bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) +{ + if (!__builtin_constant_p(slot)) + __bpf_unreachable(); + + /* + * Provide a hard guarantee that LLVM won't optimize setting r2 (map + * pointer) and r3 (constant map index) from _different paths_ ending + * up at the _same_ call insn as otherwise we won't be able to use the + * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel + * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key + * tracking for prog array pokes") for details on verifier tracking. + * + * Note on clobber list: we need to stay in-line with BPF calling + * convention, so even if we don't end up using r0, r4, r5, we need + * to mark them as clobber so that LLVM doesn't end up using them + * before / after the call. + */ + asm volatile("r1 = %[ctx]\n\t" + "r2 = %[map]\n\t" + "r3 = %[slot]\n\t" + "call 12" + :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) + : "r0", "r1", "r2", "r3", "r4", "r5"); +} +#endif + +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +#define __kconfig __attribute__((section(".kconfig"))) +#define __ksym __attribute__((section(".ksyms"))) +#define __kptr __attribute__((btf_type_tag("kptr"))) +#define __kptr_ref __attribute__((btf_type_tag("kptr_ref"))) + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) \ + ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + +/* + * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values + * in a structure. + */ +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) + +#ifdef BPF_NO_GLOBAL_DATA +#define BPF_PRINTK_FMT_MOD +#else +#define BPF_PRINTK_FMT_MOD static const +#endif + +#define __bpf_printk(fmt, ...) \ +({ \ + BPF_PRINTK_FMT_MOD char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +/* + * __bpf_vprintk wraps the bpf_trace_vprintk helper with variadic arguments + * instead of an array of u64. + */ +#define __bpf_vprintk(fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_trace_vprintk(___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args + * Otherwise use __bpf_vprintk + */ +#define ___bpf_pick_printk(...) \ + ___bpf_nth(_, ##__VA_ARGS__, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_printk /*3*/, __bpf_printk /*2*/,\ + __bpf_printk /*1*/, __bpf_printk /*0*/) + +/* Helper macro to print out debug messages */ +#define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) + +#endif diff --git a/src/bpf_prog_linfo.c b/src/bpf_prog_linfo.c new file mode 100644 index 0000000..5c50309 --- /dev/null +++ b/src/bpf_prog_linfo.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include <string.h> +#include <stdlib.h> +#include <linux/err.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "libbpf_internal.h" + +struct bpf_prog_linfo { + void *raw_linfo; + void *raw_jited_linfo; + __u32 *nr_jited_linfo_per_func; + __u32 *jited_linfo_func_idx; + __u32 nr_linfo; + __u32 nr_jited_func; + __u32 rec_size; + __u32 jited_rec_size; +}; + +static int dissect_jited_func(struct bpf_prog_linfo *prog_linfo, + const __u64 *ksym_func, const __u32 *ksym_len) +{ + __u32 nr_jited_func, nr_linfo; + const void *raw_jited_linfo; + const __u64 *jited_linfo; + __u64 last_jited_linfo; + /* + * Index to raw_jited_linfo: + * i: Index for searching the next ksym_func + * prev_i: Index to the last found ksym_func + */ + __u32 i, prev_i; + __u32 f; /* Index to ksym_func */ + + raw_jited_linfo = prog_linfo->raw_jited_linfo; + jited_linfo = raw_jited_linfo; + if (ksym_func[0] != *jited_linfo) + goto errout; + + prog_linfo->jited_linfo_func_idx[0] = 0; + nr_jited_func = prog_linfo->nr_jited_func; + nr_linfo = prog_linfo->nr_linfo; + + for (prev_i = 0, i = 1, f = 1; + i < nr_linfo && f < nr_jited_func; + i++) { + raw_jited_linfo += prog_linfo->jited_rec_size; + last_jited_linfo = *jited_linfo; + jited_linfo = raw_jited_linfo; + + if (ksym_func[f] == *jited_linfo) { + prog_linfo->jited_linfo_func_idx[f] = i; + + /* Sanity check */ + if (last_jited_linfo - ksym_func[f - 1] + 1 > + ksym_len[f - 1]) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[f - 1] = + i - prev_i; + prev_i = i; + + /* + * The ksym_func[f] is found in jited_linfo. + * Look for the next one. + */ + f++; + } else if (*jited_linfo <= last_jited_linfo) { + /* Ensure the addr is increasing _within_ a func */ + goto errout; + } + } + + if (f != nr_jited_func) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[nr_jited_func - 1] = + nr_linfo - prev_i; + + return 0; + +errout: + return -EINVAL; +} + +void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo) +{ + if (!prog_linfo) + return; + + free(prog_linfo->raw_linfo); + free(prog_linfo->raw_jited_linfo); + free(prog_linfo->nr_jited_linfo_per_func); + free(prog_linfo->jited_linfo_func_idx); + free(prog_linfo); +} + +struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info) +{ + struct bpf_prog_linfo *prog_linfo; + __u32 nr_linfo, nr_jited_func; + __u64 data_sz; + + nr_linfo = info->nr_line_info; + + if (!nr_linfo) + return errno = EINVAL, NULL; + + /* + * The min size that bpf_prog_linfo has to access for + * searching purpose. + */ + if (info->line_info_rec_size < + offsetof(struct bpf_line_info, file_name_off)) + return errno = EINVAL, NULL; + + prog_linfo = calloc(1, sizeof(*prog_linfo)); + if (!prog_linfo) + return errno = ENOMEM, NULL; + + /* Copy xlated line_info */ + prog_linfo->nr_linfo = nr_linfo; + prog_linfo->rec_size = info->line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->rec_size; + prog_linfo->raw_linfo = malloc(data_sz); + if (!prog_linfo->raw_linfo) + goto err_free; + memcpy(prog_linfo->raw_linfo, (void *)(long)info->line_info, data_sz); + + nr_jited_func = info->nr_jited_ksyms; + if (!nr_jited_func || + !info->jited_line_info || + info->nr_jited_line_info != nr_linfo || + info->jited_line_info_rec_size < sizeof(__u64) || + info->nr_jited_func_lens != nr_jited_func || + !info->jited_ksyms || + !info->jited_func_lens) + /* Not enough info to provide jited_line_info */ + return prog_linfo; + + /* Copy jited_line_info */ + prog_linfo->nr_jited_func = nr_jited_func; + prog_linfo->jited_rec_size = info->jited_line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->jited_rec_size; + prog_linfo->raw_jited_linfo = malloc(data_sz); + if (!prog_linfo->raw_jited_linfo) + goto err_free; + memcpy(prog_linfo->raw_jited_linfo, + (void *)(long)info->jited_line_info, data_sz); + + /* Number of jited_line_info per jited func */ + prog_linfo->nr_jited_linfo_per_func = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->nr_jited_linfo_per_func) + goto err_free; + + /* + * For each jited func, + * the start idx to the "linfo" and "jited_linfo" array, + */ + prog_linfo->jited_linfo_func_idx = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->jited_linfo_func_idx) + goto err_free; + + if (dissect_jited_func(prog_linfo, + (__u64 *)(long)info->jited_ksyms, + (__u32 *)(long)info->jited_func_lens)) + goto err_free; + + return prog_linfo; + +err_free: + bpf_prog_linfo__free(prog_linfo); + return errno = EINVAL, NULL; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip) +{ + __u32 jited_rec_size, rec_size, nr_linfo, start, i; + const void *raw_jited_linfo, *raw_linfo; + const __u64 *jited_linfo; + + if (func_idx >= prog_linfo->nr_jited_func) + return errno = ENOENT, NULL; + + nr_linfo = prog_linfo->nr_jited_linfo_per_func[func_idx]; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + start = prog_linfo->jited_linfo_func_idx[func_idx] + nr_skip; + jited_rec_size = prog_linfo->jited_rec_size; + raw_jited_linfo = prog_linfo->raw_jited_linfo + + (start * jited_rec_size); + jited_linfo = raw_jited_linfo; + if (addr < *jited_linfo) + return errno = ENOENT, NULL; + + nr_linfo -= nr_skip; + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (start * rec_size); + for (i = 0; i < nr_linfo; i++) { + if (addr < *jited_linfo) + break; + + raw_linfo += rec_size; + raw_jited_linfo += jited_rec_size; + jited_linfo = raw_jited_linfo; + } + + return raw_linfo - rec_size; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip) +{ + const struct bpf_line_info *linfo; + __u32 rec_size, nr_linfo, i; + const void *raw_linfo; + + nr_linfo = prog_linfo->nr_linfo; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (nr_skip * rec_size); + linfo = raw_linfo; + if (insn_off < linfo->insn_off) + return errno = ENOENT, NULL; + + nr_linfo -= nr_skip; + for (i = 0; i < nr_linfo; i++) { + if (insn_off < linfo->insn_off) + break; + + raw_linfo += rec_size; + linfo = raw_linfo; + } + + return raw_linfo - rec_size; +} diff --git a/src/bpf_tracing.h b/src/bpf_tracing.h new file mode 100644 index 0000000..2972dc2 --- /dev/null +++ b/src/bpf_tracing.h @@ -0,0 +1,670 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_TRACING_H__ +#define __BPF_TRACING_H__ + +#include <bpf/bpf_helpers.h> + +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_riscv) + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arc) + #define bpf_target_arc + #define bpf_target_defined +#else + +/* Fall back to what the compiler says */ +#if defined(__x86_64__) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__s390__) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__arm__) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__aarch64__) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__mips__) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__powerpc__) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__sparc__) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__riscv) && __riscv_xlen == 64 + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__arc__) + #define bpf_target_arc + #define bpf_target_defined +#endif /* no compiler target */ + +#endif + +#ifndef __BPF_TARGET_MISSING +#define __BPF_TARGET_MISSING "GCC error \"Must specify a BPF target arch via __TARGET_ARCH_xxx\"" +#endif + +#if defined(bpf_target_x86) + +#if defined(__KERNEL__) || defined(__VMLINUX_H__) + +#define __PT_PARM1_REG di +#define __PT_PARM2_REG si +#define __PT_PARM3_REG dx +#define __PT_PARM4_REG cx +#define __PT_PARM5_REG r8 +#define __PT_RET_REG sp +#define __PT_FP_REG bp +#define __PT_RC_REG ax +#define __PT_SP_REG sp +#define __PT_IP_REG ip +/* syscall uses r10 for PARM4 */ +#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) + +#else + +#ifdef __i386__ + +#define __PT_PARM1_REG eax +#define __PT_PARM2_REG edx +#define __PT_PARM3_REG ecx +/* i386 kernel is built with -mregparm=3 */ +#define __PT_PARM4_REG __unsupported__ +#define __PT_PARM5_REG __unsupported__ +#define __PT_RET_REG esp +#define __PT_FP_REG ebp +#define __PT_RC_REG eax +#define __PT_SP_REG esp +#define __PT_IP_REG eip + +#else /* __i386__ */ + +#define __PT_PARM1_REG rdi +#define __PT_PARM2_REG rsi +#define __PT_PARM3_REG rdx +#define __PT_PARM4_REG rcx +#define __PT_PARM5_REG r8 +#define __PT_RET_REG rsp +#define __PT_FP_REG rbp +#define __PT_RC_REG rax +#define __PT_SP_REG rsp +#define __PT_IP_REG rip +/* syscall uses r10 for PARM4 */ +#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) + +#endif /* __i386__ */ + +#endif /* __KERNEL__ || __VMLINUX_H__ */ + +#elif defined(bpf_target_s390) + +struct pt_regs___s390 { + unsigned long orig_gpr2; +}; + +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const user_pt_regs *)(x)) +#define __PT_PARM1_REG gprs[2] +#define __PT_PARM2_REG gprs[3] +#define __PT_PARM3_REG gprs[4] +#define __PT_PARM4_REG gprs[5] +#define __PT_PARM5_REG gprs[6] +#define __PT_RET_REG grps[14] +#define __PT_FP_REG gprs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG gprs[2] +#define __PT_SP_REG gprs[15] +#define __PT_IP_REG psw.addr +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2) + +#elif defined(bpf_target_arm) + +#define __PT_PARM1_REG uregs[0] +#define __PT_PARM2_REG uregs[1] +#define __PT_PARM3_REG uregs[2] +#define __PT_PARM4_REG uregs[3] +#define __PT_PARM5_REG uregs[4] +#define __PT_RET_REG uregs[14] +#define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG uregs[0] +#define __PT_SP_REG uregs[13] +#define __PT_IP_REG uregs[12] + +#elif defined(bpf_target_arm64) + +struct pt_regs___arm64 { + unsigned long orig_x0; +}; + +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[0] +#define __PT_PARM2_REG regs[1] +#define __PT_PARM3_REG regs[2] +#define __PT_PARM4_REG regs[3] +#define __PT_PARM5_REG regs[4] +#define __PT_RET_REG regs[30] +#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[0] +#define __PT_SP_REG sp +#define __PT_IP_REG pc +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0) + +#elif defined(bpf_target_mips) + +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_RET_REG regs[31] +#define __PT_FP_REG regs[30] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[2] +#define __PT_SP_REG regs[29] +#define __PT_IP_REG cp0_epc + +#elif defined(bpf_target_powerpc) + +#define __PT_PARM1_REG gpr[3] +#define __PT_PARM2_REG gpr[4] +#define __PT_PARM3_REG gpr[5] +#define __PT_PARM4_REG gpr[6] +#define __PT_PARM5_REG gpr[7] +#define __PT_RET_REG regs[31] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG gpr[3] +#define __PT_SP_REG sp +#define __PT_IP_REG nip +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx + +#elif defined(bpf_target_sparc) + +#define __PT_PARM1_REG u_regs[UREG_I0] +#define __PT_PARM2_REG u_regs[UREG_I1] +#define __PT_PARM3_REG u_regs[UREG_I2] +#define __PT_PARM4_REG u_regs[UREG_I3] +#define __PT_PARM5_REG u_regs[UREG_I4] +#define __PT_RET_REG u_regs[UREG_I7] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG u_regs[UREG_I0] +#define __PT_SP_REG u_regs[UREG_FP] +/* Should this also be a bpf_target check for the sparc case? */ +#if defined(__arch64__) +#define __PT_IP_REG tpc +#else +#define __PT_IP_REG pc +#endif + +#elif defined(bpf_target_riscv) + +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG a0 +#define __PT_PARM2_REG a1 +#define __PT_PARM3_REG a2 +#define __PT_PARM4_REG a3 +#define __PT_PARM5_REG a4 +#define __PT_RET_REG ra +#define __PT_FP_REG s0 +#define __PT_RC_REG a0 +#define __PT_SP_REG sp +#define __PT_IP_REG pc +/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx + +#elif defined(bpf_target_arc) + +/* arc provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG scratch.r0 +#define __PT_PARM2_REG scratch.r1 +#define __PT_PARM3_REG scratch.r2 +#define __PT_PARM4_REG scratch.r3 +#define __PT_PARM5_REG scratch.r4 +#define __PT_RET_REG scratch.blink +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG scratch.r0 +#define __PT_SP_REG scratch.sp +#define __PT_IP_REG scratch.ret +/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx + +#endif + +#if defined(bpf_target_defined) + +struct pt_regs; + +/* allow some architecutres to override `struct pt_regs` */ +#ifndef __PT_REGS_CAST +#define __PT_REGS_CAST(x) (x) +#endif + +#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) +#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) +#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) +#define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG) +#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) +#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) +#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) +#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) +#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_REG) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_REG) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_SP_REG) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_IP_REG) + +#if defined(bpf_target_powerpc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#elif defined(bpf_target_sparc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#else + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) + +#endif + +#ifndef PT_REGS_PARM1_SYSCALL +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x) +#endif +#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x) +#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x) +#ifndef PT_REGS_PARM4_SYSCALL +#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x) +#endif +#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x) + +#ifndef PT_REGS_PARM1_CORE_SYSCALL +#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x) +#endif +#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x) +#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x) +#ifndef PT_REGS_PARM4_CORE_SYSCALL +#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x) +#endif +#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x) + +#else /* defined(bpf_target_defined) */ + +#define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#endif /* defined(bpf_target_defined) */ + +/* + * When invoked from a syscall handler kprobe, returns a pointer to a + * struct pt_regs containing syscall arguments and suitable for passing to + * PT_REGS_PARMn_SYSCALL() and PT_REGS_PARMn_CORE_SYSCALL(). + */ +#ifndef PT_REGS_SYSCALL_REGS +/* By default, assume that the arch selects ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ((struct pt_regs *)PT_REGS_PARM1(ctx)) +#endif + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_ctx_cast0() ctx +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) + +/* + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and + * similar kinds of BPF programs, that accept input arguments as a single + * pointer to untyped u64 array, where each u64 can actually be a typed + * pointer or integer of different size. Instead of requring user to write + * manual casts and work with array elements by index, BPF_PROG macro + * allows user to declare a list of named and typed input arguments in the + * same syntax as for normal C function. All the casting is hidden and + * performed transparently, while user code can just assume working with + * function arguments of specified type and name. + * + * Original raw context argument is preserved as well as 'ctx' argument. + * This is useful when using BPF helpers that expect original context + * as one of the parameters (e.g., for bpf_perf_event_output()). + */ +#define BPF_PROG(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_ctx_cast(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args) + +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) \ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))))) + +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) + +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) + +/* + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) + +struct pt_regs; + +#define ___bpf_kprobe_args0() ctx +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for + * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific + * low-level way of getting kprobe input arguments from struct pt_regs, and + * provides a familiar typed and named function arguments syntax and + * semantics of accessing kprobe input paremeters. + * + * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + */ +#define BPF_KPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define ___bpf_kretprobe_args0() ctx +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional + * return value (in addition to `struct pt_regs *ctx`), but no input + * arguments, because they will be clobbered by the time probed function + * returns. + */ +#define BPF_KRETPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kretprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) + +/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ +#define ___bpf_syscall_args0() ctx +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) + +/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ +#define ___bpf_syswrap_args0() ctx +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) + +/* + * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for + * tracing syscall functions, like __x64_sys_close. It hides the underlying + * platform-specific low-level way of getting syscall input arguments from + * struct pt_regs, and provides a familiar typed and named function arguments + * syntax and semantics of accessing syscall input parameters. + * + * Original struct pt_regs * context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + * + * At the moment BPF_KSYSCALL does not transparently handle all the calling + * convention quirks for the following syscalls: + * + * - mmap(): __ARCH_WANT_SYS_OLD_MMAP. + * - clone(): CONFIG_CLONE_BACKWARDS, CONFIG_CLONE_BACKWARDS2 and + * CONFIG_CLONE_BACKWARDS3. + * - socket-related syscalls: __ARCH_WANT_SYS_SOCKETCALL. + * - compat syscalls. + * + * This may or may not change in the future. User needs to take extra measures + * to handle such quirks explicitly, if necessary. + * + * This macro relies on BPF CO-RE support and virtual __kconfig externs. + */ +#define BPF_KSYSCALL(name, args...) \ +name(struct pt_regs *ctx); \ +extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER \ + ? (struct pt_regs *)PT_REGS_PARM1(ctx) \ + : ctx; \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + if (LINUX_HAS_SYSCALL_WRAPPER) \ + return ____##name(___bpf_syswrap_args(args)); \ + else \ + return ____##name(___bpf_syscall_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define BPF_KPROBE_SYSCALL BPF_KSYSCALL + +#endif diff --git a/src/btf.c b/src/btf.c new file mode 100644 index 0000000..b032500 --- /dev/null +++ b/src/btf.c @@ -0,0 +1,5027 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include <byteswap.h> +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/utsname.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/btf.h> +#include <gelf.h> +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "strset.h" + +#define BTF_MAX_NR_TYPES 0x7fffffffU +#define BTF_MAX_STR_OFFSET 0x7fffffffU + +static struct btf_type btf_void; + +struct btf { + /* raw BTF data in native endianness */ + void *raw_data; + /* raw BTF data in non-native endianness */ + void *raw_data_swapped; + __u32 raw_size; + /* whether target endianness differs from the native one */ + bool swapped_endian; + + /* + * When BTF is loaded from an ELF or raw memory it is stored + * in a contiguous memory block. The hdr, type_data, and, strs_data + * point inside that memory region to their respective parts of BTF + * representation: + * + * +--------------------------------+ + * | Header | Types | Strings | + * +--------------------------------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data-+ | + * strs_data------------+ + * + * If BTF data is later modified, e.g., due to types added or + * removed, BTF deduplication performed, etc, this contiguous + * representation is broken up into three independently allocated + * memory regions to be able to modify them independently. + * raw_data is nulled out at that point, but can be later allocated + * and cached again if user calls btf__raw_data(), at which point + * raw_data will contain a contiguous copy of header, types, and + * strings: + * + * +----------+ +---------+ +-----------+ + * | Header | | Types | | Strings | + * +----------+ +---------+ +-----------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data----+ | + * strset__data(strs_set)-----+ + * + * +----------+---------+-----------+ + * | Header | Types | Strings | + * raw_data----->+----------+---------+-----------+ + */ + struct btf_header *hdr; + + void *types_data; + size_t types_data_cap; /* used size stored in hdr->type_len */ + + /* type ID to `struct btf_type *` lookup index + * type_offs[0] corresponds to the first non-VOID type: + * - for base BTF it's type [1]; + * - for split BTF it's the first non-base BTF type. + */ + __u32 *type_offs; + size_t type_offs_cap; + /* number of types in this BTF instance: + * - doesn't include special [0] void type; + * - for split BTF counts number of types added on top of base BTF. + */ + __u32 nr_types; + /* if not NULL, points to the base BTF on top of which the current + * split BTF is based + */ + struct btf *base_btf; + /* BTF type ID of the first type in this BTF instance: + * - for base BTF it's equal to 1; + * - for split BTF it's equal to biggest type ID of base BTF plus 1. + */ + int start_id; + /* logical string offset of this BTF instance: + * - for base BTF it's equal to 0; + * - for split BTF it's equal to total size of base BTF's string section size. + */ + int start_str_off; + + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ + void *strs_data; + /* a set of unique strings */ + struct strset *strs_set; + /* whether strings are already deduplicated */ + bool strs_deduped; + + /* BTF object FD, if loaded into kernel */ + int fd; + + /* Pointer size (in bytes) for a target architecture of this BTF */ + int ptr_sz; +}; + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +/* Ensure given dynamically allocated memory region pointed to by *data* with + * capacity of *cap_cnt* elements each taking *elem_sz* bytes has enough + * memory to accommodate *add_cnt* new elements, assuming *cur_cnt* elements + * are already used. At most *max_cnt* elements can be ever allocated. + * If necessary, memory is reallocated and all existing data is copied over, + * new pointer to the memory region is stored at *data, new memory region + * capacity (in number of elements) is stored in *cap. + * On success, memory pointer to the beginning of unused memory is returned. + * On error, NULL is returned. + */ +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) +{ + size_t new_cnt; + void *new_data; + + if (cur_cnt + add_cnt <= *cap_cnt) + return *data + cur_cnt * elem_sz; + + /* requested more than the set limit */ + if (cur_cnt + add_cnt > max_cnt) + return NULL; + + new_cnt = *cap_cnt; + new_cnt += new_cnt / 4; /* expand by 25% */ + if (new_cnt < 16) /* but at least 16 elements */ + new_cnt = 16; + if (new_cnt > max_cnt) /* but not exceeding a set limit */ + new_cnt = max_cnt; + if (new_cnt < cur_cnt + add_cnt) /* also ensure we have enough memory */ + new_cnt = cur_cnt + add_cnt; + + new_data = libbpf_reallocarray(*data, new_cnt, elem_sz); + if (!new_data) + return NULL; + + /* zero out newly allocated portion of memory */ + memset(new_data + (*cap_cnt) * elem_sz, 0, (new_cnt - *cap_cnt) * elem_sz); + + *data = new_data; + *cap_cnt = new_cnt; + return new_data + cur_cnt * elem_sz; +} + +/* Ensure given dynamically allocated memory region has enough allocated space + * to accommodate *need_cnt* elements of size *elem_sz* bytes each + */ +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +{ + void *p; + + if (need_cnt <= *cap_cnt) + return 0; + + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + if (!p) + return -ENOMEM; + + return 0; +} + +static void *btf_add_type_offs_mem(struct btf *btf, size_t add_cnt) +{ + return libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, add_cnt); +} + +static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) +{ + __u32 *p; + + p = btf_add_type_offs_mem(btf, 1); + if (!p) + return -ENOMEM; + + *p = type_off; + return 0; +} + +static void btf_bswap_hdr(struct btf_header *h) +{ + h->magic = bswap_16(h->magic); + h->hdr_len = bswap_32(h->hdr_len); + h->type_off = bswap_32(h->type_off); + h->type_len = bswap_32(h->type_len); + h->str_off = bswap_32(h->str_off); + h->str_len = bswap_32(h->str_len); +} + +static int btf_parse_hdr(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + __u32 meta_left; + + if (btf->raw_size < sizeof(struct btf_header)) { + pr_debug("BTF header not found\n"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { + btf->swapped_endian = true; + if (bswap_32(hdr->hdr_len) != sizeof(struct btf_header)) { + pr_warn("Can't load BTF with non-native endianness due to unsupported header length %u\n", + bswap_32(hdr->hdr_len)); + return -ENOTSUP; + } + btf_bswap_hdr(hdr); + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF magic: %x\n", hdr->magic); + return -EINVAL; + } + + if (btf->raw_size < hdr->hdr_len) { + pr_debug("BTF header len %u larger than data size %u\n", + hdr->hdr_len, btf->raw_size); + return -EINVAL; + } + + meta_left = btf->raw_size - hdr->hdr_len; + if (meta_left < (long long)hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size: %u\n", btf->raw_size); + return -EINVAL; + } + + if ((long long)hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); + return -EINVAL; + } + + if (hdr->type_off % 4) { + pr_debug("BTF type section is not aligned to 4 bytes\n"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_str_sec(struct btf *btf) +{ + const struct btf_header *hdr = btf->hdr; + const char *start = btf->strs_data; + const char *end = start + btf->hdr->str_len; + + if (btf->base_btf && hdr->str_len == 0) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || end[-1]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + return 0; +} + +static int btf_type_size(const struct btf_type *t) +{ + const int base_size = sizeof(struct btf_type); + __u16 vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return base_size; + case BTF_KIND_INT: + return base_size + sizeof(__u32); + case BTF_KIND_ENUM: + return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); + case BTF_KIND_ARRAY: + return base_size + sizeof(struct btf_array); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return base_size + vlen * sizeof(struct btf_member); + case BTF_KIND_FUNC_PROTO: + return base_size + vlen * sizeof(struct btf_param); + case BTF_KIND_VAR: + return base_size + sizeof(struct btf_var); + case BTF_KIND_DATASEC: + return base_size + vlen * sizeof(struct btf_var_secinfo); + case BTF_KIND_DECL_TAG: + return base_size + sizeof(struct btf_decl_tag); + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static void btf_bswap_type_base(struct btf_type *t) +{ + t->name_off = bswap_32(t->name_off); + t->info = bswap_32(t->info); + t->type = bswap_32(t->type); +} + +static int btf_bswap_type_rest(struct btf_type *t) +{ + struct btf_var_secinfo *v; + struct btf_enum64 *e64; + struct btf_member *m; + struct btf_array *a; + struct btf_param *p; + struct btf_enum *e; + __u16 vlen = btf_vlen(t); + int i; + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return 0; + case BTF_KIND_INT: + *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); + return 0; + case BTF_KIND_ENUM: + for (i = 0, e = btf_enum(t); i < vlen; i++, e++) { + e->name_off = bswap_32(e->name_off); + e->val = bswap_32(e->val); + } + return 0; + case BTF_KIND_ENUM64: + for (i = 0, e64 = btf_enum64(t); i < vlen; i++, e64++) { + e64->name_off = bswap_32(e64->name_off); + e64->val_lo32 = bswap_32(e64->val_lo32); + e64->val_hi32 = bswap_32(e64->val_hi32); + } + return 0; + case BTF_KIND_ARRAY: + a = btf_array(t); + a->type = bswap_32(a->type); + a->index_type = bswap_32(a->index_type); + a->nelems = bswap_32(a->nelems); + return 0; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + for (i = 0, m = btf_members(t); i < vlen; i++, m++) { + m->name_off = bswap_32(m->name_off); + m->type = bswap_32(m->type); + m->offset = bswap_32(m->offset); + } + return 0; + case BTF_KIND_FUNC_PROTO: + for (i = 0, p = btf_params(t); i < vlen; i++, p++) { + p->name_off = bswap_32(p->name_off); + p->type = bswap_32(p->type); + } + return 0; + case BTF_KIND_VAR: + btf_var(t)->linkage = bswap_32(btf_var(t)->linkage); + return 0; + case BTF_KIND_DATASEC: + for (i = 0, v = btf_var_secinfos(t); i < vlen; i++, v++) { + v->type = bswap_32(v->type); + v->offset = bswap_32(v->offset); + v->size = bswap_32(v->size); + } + return 0; + case BTF_KIND_DECL_TAG: + btf_decl_tag(t)->component_idx = bswap_32(btf_decl_tag(t)->component_idx); + return 0; + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static int btf_parse_type_sec(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + void *next_type = btf->types_data; + void *end_type = next_type + hdr->type_len; + int err, type_size; + + while (next_type + sizeof(struct btf_type) <= end_type) { + if (btf->swapped_endian) + btf_bswap_type_base(next_type); + + type_size = btf_type_size(next_type); + if (type_size < 0) + return type_size; + if (next_type + type_size > end_type) { + pr_warn("BTF type [%d] is malformed\n", btf->start_id + btf->nr_types); + return -EINVAL; + } + + if (btf->swapped_endian && btf_bswap_type_rest(next_type)) + return -EINVAL; + + err = btf_add_type_idx_entry(btf, next_type - btf->types_data); + if (err) + return err; + + next_type += type_size; + btf->nr_types++; + } + + if (next_type != end_type) { + pr_warn("BTF types data is malformed\n"); + return -EINVAL; + } + + return 0; +} + +__u32 btf__type_cnt(const struct btf *btf) +{ + return btf->start_id + btf->nr_types; +} + +const struct btf *btf__base_btf(const struct btf *btf) +{ + return btf->base_btf; +} + +/* internal helper returning non-const pointer to a type */ +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id == 0) + return &btf_void; + if (type_id < btf->start_id) + return btf_type_by_id(btf->base_btf, type_id); + return btf->types_data + btf->type_offs[type_id - btf->start_id]; +} + +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id >= btf->start_id + btf->nr_types) + return errno = EINVAL, NULL; + return btf_type_by_id((struct btf *)btf, type_id); +} + +static int determine_ptr_size(const struct btf *btf) +{ + static const char * const long_aliases[] = { + "long", + "long int", + "int long", + "unsigned long", + "long unsigned", + "unsigned long int", + "unsigned int long", + "long unsigned int", + "long int unsigned", + "int unsigned long", + "int long unsigned", + }; + const struct btf_type *t; + const char *name; + int i, j, n; + + if (btf->base_btf && btf->base_btf->ptr_sz > 0) + return btf->base_btf->ptr_sz; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + if (!btf_is_int(t)) + continue; + + if (t->size != 4 && t->size != 8) + continue; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + continue; + + for (j = 0; j < ARRAY_SIZE(long_aliases); j++) { + if (strcmp(name, long_aliases[j]) == 0) + return t->size; + } + } + + return -1; +} + +static size_t btf_ptr_sz(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + return btf->ptr_sz < 0 ? sizeof(void *) : btf->ptr_sz; +} + +/* Return pointer size this BTF instance assumes. The size is heuristically + * determined by looking for 'long' or 'unsigned long' integer type and + * recording its size in bytes. If BTF type information doesn't have any such + * type, this function returns 0. In the latter case, native architecture's + * pointer size is assumed, so will be either 4 or 8, depending on + * architecture that libbpf was compiled for. It's possible to override + * guessed value by using btf__set_pointer_size() API. + */ +size_t btf__pointer_size(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + + if (btf->ptr_sz < 0) + /* not enough BTF type info to guess */ + return 0; + + return btf->ptr_sz; +} + +/* Override or set pointer size in bytes. Only values of 4 and 8 are + * supported. + */ +int btf__set_pointer_size(struct btf *btf, size_t ptr_sz) +{ + if (ptr_sz != 4 && ptr_sz != 8) + return libbpf_err(-EINVAL); + btf->ptr_sz = ptr_sz; + return 0; +} + +static bool is_host_big_endian(void) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return false; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return true; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif +} + +enum btf_endianness btf__endianness(const struct btf *btf) +{ + if (is_host_big_endian()) + return btf->swapped_endian ? BTF_LITTLE_ENDIAN : BTF_BIG_ENDIAN; + else + return btf->swapped_endian ? BTF_BIG_ENDIAN : BTF_LITTLE_ENDIAN; +} + +int btf__set_endianness(struct btf *btf, enum btf_endianness endian) +{ + if (endian != BTF_LITTLE_ENDIAN && endian != BTF_BIG_ENDIAN) + return libbpf_err(-EINVAL); + + btf->swapped_endian = is_host_big_endian() != (endian == BTF_BIG_ENDIAN); + if (!btf->swapped_endian) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } + return 0; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + return t == &btf_void || btf_is_fwd(t); +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +#define MAX_RESOLVE_DEPTH 32 + +__s64 btf__resolve_size(const struct btf *btf, __u32 type_id) +{ + const struct btf_array *array; + const struct btf_type *t; + __u32 nelems = 1; + __s64 size = -1; + int i; + + t = btf__type_by_id(btf, type_id); + for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t); i++) { + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: + size = t->size; + goto done; + case BTF_KIND_PTR: + size = btf_ptr_sz(btf); + goto done; + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + type_id = t->type; + break; + case BTF_KIND_ARRAY: + array = btf_array(t); + if (nelems && array->nelems > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + nelems *= array->nelems; + type_id = array->type; + break; + default: + return libbpf_err(-EINVAL); + } + + t = btf__type_by_id(btf, type_id); + } + +done: + if (size < 0) + return libbpf_err(-EINVAL); + if (nelems && size > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + + return nelems * size; +} + +int btf__align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + __u16 kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FLOAT: + return min(btf_ptr_sz(btf), (size_t)t->size); + case BTF_KIND_PTR: + return btf_ptr_sz(btf); + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf__align_of(btf, t->type); + case BTF_KIND_ARRAY: + return btf__align_of(btf, btf_array(t)->type); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, max_align = 1, align; + + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + if (align <= 0) + return libbpf_err(align); + max_align = max(max_align, align); + + /* if field offset isn't aligned according to field + * type's alignment, then struct must be packed + */ + if (btf_member_bitfield_size(t, i) == 0 && + (m->offset % (8 * align)) != 0) + return 1; + } + + /* if struct/union size isn't a multiple of its alignment, + * then struct must be packed + */ + if ((t->size % max_align) != 0) + return 1; + + return max_align; + } + default: + pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); + return errno = EINVAL, 0; + } +} + +int btf__resolve_type(const struct btf *btf, __u32 type_id) +{ + const struct btf_type *t; + int depth = 0; + + t = btf__type_by_id(btf, type_id); + while (depth < MAX_RESOLVE_DEPTH && + !btf_type_is_void_or_null(t) && + (btf_is_mod(t) || btf_is_typedef(t) || btf_is_var(t))) { + type_id = t->type; + t = btf__type_by_id(btf, type_id); + depth++; + } + + if (depth == MAX_RESOLVE_DEPTH || btf_type_is_void_or_null(t)) + return libbpf_err(-EINVAL); + + return type_id; +} + +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (!strcmp(type_name, "void")) + return 0; + + for (i = 1; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name = btf__name_by_offset(btf, t->name_off); + + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, + const char *type_name, __u32 kind) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) + return 0; + + for (i = start_id; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name; + + if (btf_kind(t) != kind) + continue; + name = btf__name_by_offset(btf, t->name_off); + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, btf->start_id, type_name, kind); +} + +__s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, 1, type_name, kind); +} + +static bool btf_is_modifiable(const struct btf *btf) +{ + return (void *)btf->hdr != btf->raw_data; +} + +void btf__free(struct btf *btf) +{ + if (IS_ERR_OR_NULL(btf)) + return; + + if (btf->fd >= 0) + close(btf->fd); + + if (btf_is_modifiable(btf)) { + /* if BTF was modified after loading, it will have a split + * in-memory representation for header, types, and strings + * sections, so we need to free all of them individually. It + * might still have a cached contiguous raw data present, + * which will be unconditionally freed below. + */ + free(btf->hdr); + free(btf->types_data); + strset__free(btf->strs_set); + } + free(btf->raw_data); + free(btf->raw_data_swapped); + free(btf->type_offs); + free(btf); +} + +static struct btf *btf_new_empty(struct btf *base_btf) +{ + struct btf *btf; + + btf = calloc(1, sizeof(*btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + btf->ptr_sz = sizeof(void *); + btf->swapped_endian = false; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + /* +1 for empty string at offset 0 */ + btf->raw_size = sizeof(struct btf_header) + (base_btf ? 0 : 1); + btf->raw_data = calloc(1, btf->raw_size); + if (!btf->raw_data) { + free(btf); + return ERR_PTR(-ENOMEM); + } + + btf->hdr = btf->raw_data; + btf->hdr->hdr_len = sizeof(struct btf_header); + btf->hdr->magic = BTF_MAGIC; + btf->hdr->version = BTF_VERSION; + + btf->types_data = btf->raw_data + btf->hdr->hdr_len; + btf->strs_data = btf->raw_data + btf->hdr->hdr_len; + btf->hdr->str_len = base_btf ? 0 : 1; /* empty string at offset 0 */ + + return btf; +} + +struct btf *btf__new_empty(void) +{ + return libbpf_ptr(btf_new_empty(NULL)); +} + +struct btf *btf__new_empty_split(struct btf *base_btf) +{ + return libbpf_ptr(btf_new_empty(base_btf)); +} + +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +{ + struct btf *btf; + int err; + + btf = calloc(1, sizeof(struct btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + btf->raw_data = malloc(size); + if (!btf->raw_data) { + err = -ENOMEM; + goto done; + } + memcpy(btf->raw_data, data, size); + btf->raw_size = size; + + btf->hdr = btf->raw_data; + err = btf_parse_hdr(btf); + if (err) + goto done; + + btf->strs_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->str_off; + btf->types_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->type_off; + + err = btf_parse_str_sec(btf); + err = err ?: btf_parse_type_sec(btf); + if (err) + goto done; + +done: + if (err) { + btf__free(btf); + return ERR_PTR(err); + } + + return btf; +} + +struct btf *btf__new(const void *data, __u32 size) +{ + return libbpf_ptr(btf_new(data, size, NULL)); +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) +{ + Elf_Data *btf_data = NULL, *btf_ext_data = NULL; + int err = 0, fd = -1, idx = 0; + struct btf *btf = NULL; + Elf_Scn *scn = NULL; + Elf *elf = NULL; + GElf_Ehdr ehdr; + size_t shstrndx; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", path); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open %s: %s\n", path, strerror(errno)); + return ERR_PTR(err); + } + + err = -LIBBPF_ERRNO__FORMAT; + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_warn("failed to open %s as ELF file\n", path); + goto done; + } + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("failed to get EHDR from %s\n", path); + goto done; + } + + if (elf_getshdrstrndx(elf, &shstrndx)) { + pr_warn("failed to get section names section index for %s\n", + path); + goto done; + } + + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) { + pr_warn("failed to get e_shstrndx from %s\n", path); + goto done; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + char *name; + + idx++; + if (gelf_getshdr(scn, &sh) != &sh) { + pr_warn("failed to get section(%d) header from %s\n", + idx, path); + goto done; + } + name = elf_strptr(elf, shstrndx, sh.sh_name); + if (!name) { + pr_warn("failed to get section(%d) name from %s\n", + idx, path); + goto done; + } + if (strcmp(name, BTF_ELF_SEC) == 0) { + btf_data = elf_getdata(scn, 0); + if (!btf_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } else if (btf_ext && strcmp(name, BTF_EXT_ELF_SEC) == 0) { + btf_ext_data = elf_getdata(scn, 0); + if (!btf_ext_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } + } + + err = 0; + + if (!btf_data) { + pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path); + err = -ENOENT; + goto done; + } + btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); + err = libbpf_get_error(btf); + if (err) + goto done; + + switch (gelf_getclass(elf)) { + case ELFCLASS32: + btf__set_pointer_size(btf, 4); + break; + case ELFCLASS64: + btf__set_pointer_size(btf, 8); + break; + default: + pr_warn("failed to get ELF class (bitness) for %s\n", path); + break; + } + + if (btf_ext && btf_ext_data) { + *btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(*btf_ext); + if (err) + goto done; + } else if (btf_ext) { + *btf_ext = NULL; + } +done: + if (elf) + elf_end(elf); + close(fd); + + if (!err) + return btf; + + if (btf_ext) + btf_ext__free(*btf_ext); + btf__free(btf); + + return ERR_PTR(err); +} + +struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse_elf(path, NULL, btf_ext)); +} + +struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_elf(path, base_btf, NULL)); +} + +static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) +{ + struct btf *btf = NULL; + void *data = NULL; + FILE *f = NULL; + __u16 magic; + int err = 0; + long sz; + + f = fopen(path, "rb"); + if (!f) { + err = -errno; + goto err_out; + } + + /* check BTF magic */ + if (fread(&magic, 1, sizeof(magic), f) < sizeof(magic)) { + err = -EIO; + goto err_out; + } + if (magic != BTF_MAGIC && magic != bswap_16(BTF_MAGIC)) { + /* definitely not a raw BTF */ + err = -EPROTO; + goto err_out; + } + + /* get file size */ + if (fseek(f, 0, SEEK_END)) { + err = -errno; + goto err_out; + } + sz = ftell(f); + if (sz < 0) { + err = -errno; + goto err_out; + } + /* rewind to the start */ + if (fseek(f, 0, SEEK_SET)) { + err = -errno; + goto err_out; + } + + /* pre-alloc memory and read all of BTF data */ + data = malloc(sz); + if (!data) { + err = -ENOMEM; + goto err_out; + } + if (fread(data, 1, sz, f) < sz) { + err = -EIO; + goto err_out; + } + + /* finally parse BTF data */ + btf = btf_new(data, sz, base_btf); + +err_out: + free(data); + if (f) + fclose(f); + return err ? ERR_PTR(err) : btf; +} + +struct btf *btf__parse_raw(const char *path) +{ + return libbpf_ptr(btf_parse_raw(path, NULL)); +} + +struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_raw(path, base_btf)); +} + +static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) +{ + struct btf *btf; + int err; + + if (btf_ext) + *btf_ext = NULL; + + btf = btf_parse_raw(path, base_btf); + err = libbpf_get_error(btf); + if (!err) + return btf; + if (err != -EPROTO) + return ERR_PTR(err); + return btf_parse_elf(path, base_btf, btf_ext); +} + +struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse(path, NULL, btf_ext)); +} + +struct btf *btf__parse_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse(path, base_btf, NULL)); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); + +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + __u32 buf_sz = 0, raw_size; + char *buf = NULL, *tmp; + void *raw_data; + int err = 0; + + if (btf->fd >= 0) + return libbpf_err(-EEXIST); + if (log_sz && !log_buf) + return libbpf_err(-EINVAL); + + /* cache native raw data representation */ + raw_data = btf_get_raw_data(btf, &raw_size, false); + if (!raw_data) { + err = -ENOMEM; + goto done; + } + btf->raw_size = raw_size; + btf->raw_data = raw_data; + +retry_load: + /* if log_level is 0, we won't provide log_buf/log_size to the kernel, + * initially. Only if BTF loading fails, we bump log_level to 1 and + * retry, using either auto-allocated or custom log_buf. This way + * non-NULL custom log_buf provides a buffer just in case, but hopes + * for successful load and no need for log_buf. + */ + if (log_level) { + /* if caller didn't provide custom log_buf, we'll keep + * allocating our own progressively bigger buffers for BTF + * verification log + */ + if (!log_buf) { + buf_sz = max((__u32)BPF_LOG_BUF_SIZE, buf_sz * 2); + tmp = realloc(buf, buf_sz); + if (!tmp) { + err = -ENOMEM; + goto done; + } + buf = tmp; + buf[0] = '\0'; + } + + opts.log_buf = log_buf ? log_buf : buf; + opts.log_size = log_buf ? log_sz : buf_sz; + opts.log_level = log_level; + } + + btf->fd = bpf_btf_load(raw_data, raw_size, &opts); + if (btf->fd < 0) { + /* time to turn on verbose mode and try again */ + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* only retry if caller didn't provide custom log_buf, but + * make sure we can never overflow buf_sz + */ + if (!log_buf && errno == ENOSPC && buf_sz <= UINT_MAX / 2) + goto retry_load; + + err = -errno; + pr_warn("BTF loading error: %d\n", err); + /* don't print out contents of custom log_buf */ + if (!log_buf && buf[0]) + pr_warn("-- BEGIN BTF LOAD LOG ---\n%s\n-- END BTF LOAD LOG --\n", buf); + } + +done: + free(buf); + return libbpf_err(err); +} + +int btf__load_into_kernel(struct btf *btf) +{ + return btf_load_into_kernel(btf, NULL, 0, 0); +} + +int btf__fd(const struct btf *btf) +{ + return btf->fd; +} + +void btf__set_fd(struct btf *btf, int fd) +{ + btf->fd = fd; +} + +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? btf->strs_data : strset__data(btf->strs_set); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) +{ + struct btf_header *hdr = btf->hdr; + struct btf_type *t; + void *data, *p; + __u32 data_sz; + int i; + + data = swap_endian ? btf->raw_data_swapped : btf->raw_data; + if (data) { + *size = btf->raw_size; + return data; + } + + data_sz = hdr->hdr_len + hdr->type_len + hdr->str_len; + data = calloc(1, data_sz); + if (!data) + return NULL; + p = data; + + memcpy(p, hdr, hdr->hdr_len); + if (swap_endian) + btf_bswap_hdr(p); + p += hdr->hdr_len; + + memcpy(p, btf->types_data, hdr->type_len); + if (swap_endian) { + for (i = 0; i < btf->nr_types; i++) { + t = p + btf->type_offs[i]; + /* btf_bswap_type_rest() relies on native t->info, so + * we swap base type info after we swapped all the + * additional information + */ + if (btf_bswap_type_rest(t)) + goto err_out; + btf_bswap_type_base(t); + } + } + p += hdr->type_len; + + memcpy(p, btf_strs_data(btf), hdr->str_len); + p += hdr->str_len; + + *size = data_sz; + return data; +err_out: + free(data); + return NULL; +} + +const void *btf__raw_data(const struct btf *btf_ro, __u32 *size) +{ + struct btf *btf = (struct btf *)btf_ro; + __u32 data_sz; + void *data; + + data = btf_get_raw_data(btf, &data_sz, btf->swapped_endian); + if (!data) + return errno = ENOMEM, NULL; + + btf->raw_size = data_sz; + if (btf->swapped_endian) + btf->raw_data_swapped = data; + else + btf->raw_data = data; + *size = data_sz; + return data; +} + +__attribute__((alias("btf__raw_data"))) +const void *btf__get_raw_data(const struct btf *btf, __u32 *size); + +const char *btf__str_by_offset(const struct btf *btf, __u32 offset) +{ + if (offset < btf->start_str_off) + return btf__str_by_offset(btf->base_btf, offset); + else if (offset - btf->start_str_off < btf->hdr->str_len) + return btf_strs_data(btf) + (offset - btf->start_str_off); + else + return errno = EINVAL, NULL; +} + +const char *btf__name_by_offset(const struct btf *btf, __u32 offset) +{ + return btf__str_by_offset(btf, offset); +} + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) +{ + struct bpf_btf_info btf_info; + __u32 len = sizeof(btf_info); + __u32 last_size; + struct btf *btf; + void *ptr; + int err; + + /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so + * let's start with a sane default - 4KiB here - and resize it only if + * bpf_obj_get_info_by_fd() needs a bigger buffer. + */ + last_size = 4096; + ptr = malloc(last_size); + if (!ptr) + return ERR_PTR(-ENOMEM); + + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + + if (!err && btf_info.btf_size > last_size) { + void *temp_ptr; + + last_size = btf_info.btf_size; + temp_ptr = realloc(ptr, last_size); + if (!temp_ptr) { + btf = ERR_PTR(-ENOMEM); + goto exit_free; + } + ptr = temp_ptr; + + len = sizeof(btf_info); + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + + err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + } + + if (err || btf_info.btf_size > last_size) { + btf = err ? ERR_PTR(-errno) : ERR_PTR(-E2BIG); + goto exit_free; + } + + btf = btf_new(ptr, btf_info.btf_size, base_btf); + +exit_free: + free(ptr); + return btf; +} + +struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf) +{ + struct btf *btf; + int btf_fd; + + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return libbpf_err_ptr(-errno); + + btf = btf_get_from_fd(btf_fd, base_btf); + close(btf_fd); + + return libbpf_ptr(btf); +} + +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + return btf__load_from_kernel_by_id_split(id, NULL); +} + +static void btf_invalidate_raw_data(struct btf *btf) +{ + if (btf->raw_data) { + free(btf->raw_data); + btf->raw_data = NULL; + } + if (btf->raw_data_swapped) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } +} + +/* Ensure BTF is ready to be modified (by splitting into a three memory + * regions for header, types, and strings). Also invalidate cached + * raw_data, if any. + */ +static int btf_ensure_modifiable(struct btf *btf) +{ + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; + + if (btf_is_modifiable(btf)) { + /* any BTF modification invalidates raw_data */ + btf_invalidate_raw_data(btf); + return 0; + } + + /* split raw data into three memory regions */ + hdr = malloc(btf->hdr->hdr_len); + types = malloc(btf->hdr->type_len); + if (!hdr || !types) + goto err_out; + + memcpy(hdr, btf->hdr, btf->hdr->hdr_len); + memcpy(types, btf->types_data, btf->hdr->type_len); + + /* build lookup index for all strings */ + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); + goto err_out; + } + + /* only when everything was successful, update internal state */ + btf->hdr = hdr; + btf->types_data = types; + btf->types_data_cap = btf->hdr->type_len; + btf->strs_data = NULL; + btf->strs_set = set; + /* if BTF was created from scratch, all strings are guaranteed to be + * unique and deduplicated + */ + if (btf->hdr->str_len == 0) + btf->strs_deduped = true; + if (!btf->base_btf && btf->hdr->str_len == 1) + btf->strs_deduped = true; + + /* invalidate raw_data representation */ + btf_invalidate_raw_data(btf); + + return 0; + +err_out: + strset__free(set); + free(hdr); + free(types); + return err; +} + +/* Find an offset in BTF string section that corresponds to a given string *s*. + * Returns: + * - >0 offset into string section, if string is found; + * - -ENOENT, if string is not in the string section; + * - <0, on any other error. + */ +int btf__find_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + /* BTF needs to be in a modifiable state to build string lookup index */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + return btf->start_str_off + off; +} + +/* Add a string s to the BTF string section. + * Returns: + * - > 0 offset into string section, on success; + * - < 0, on error. + */ +int btf__add_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + btf->hdr->str_len = strset__data_size(btf->strs_set); + + return btf->start_str_off + off; +} + +static void *btf_add_type_mem(struct btf *btf, size_t add_sz) +{ + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); +} + +static void btf_type_inc_vlen(struct btf_type *t) +{ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); +} + +static int btf_commit_type(struct btf *btf, int data_sz) +{ + int err; + + err = btf_add_type_idx_entry(btf, btf->hdr->type_len); + if (err) + return libbpf_err(err); + + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types++; + return btf->start_id + btf->nr_types - 1; +} + +struct btf_pipe { + const struct btf *src; + struct btf *dst; + struct hashmap *str_off_map; /* map string offsets from src to dst */ +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + long mapped_off; + int off, err; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + if (p->str_off_map && + hashmap__find(p->str_off_map, *str_off, &mapped_off)) { + *str_off = mapped_off; + return 0; + } + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + /* Remember string mapping from src to dst. It avoids + * performing expensive string comparisons. + */ + if (p->str_off_map) { + err = hashmap__append(p->str_off_map, *str_off, off); + if (err) + return err; + } + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return libbpf_err(sz); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return libbpf_err(err); + + return btf_commit_type(btf, sz); +} + +static int btf_rewrite_type_ids(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (!*type_id) /* nothing to do for VOID references */ + return 0; + + /* we haven't updated btf's type count yet, so + * btf->start_id + btf->nr_types - 1 is the type ID offset we should + * add to all newly added BTF types + */ + *type_id += btf->start_id + btf->nr_types - 1; + return 0; +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx); +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx); + +int btf__add_btf(struct btf *btf, const struct btf *src_btf) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + int data_sz, sz, cnt, i, err, old_strs_len; + __u32 *off; + void *t; + + /* appending split BTF isn't supported yet */ + if (src_btf->base_btf) + return libbpf_err(-ENOTSUP); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + /* remember original strings section size if we have to roll back + * partial strings section changes + */ + old_strs_len = btf->hdr->str_len; + + data_sz = src_btf->hdr->type_len; + cnt = btf__type_cnt(src_btf) - 1; + + /* pre-allocate enough memory for new types */ + t = btf_add_type_mem(btf, data_sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* pre-allocate enough memory for type offset index for new types */ + off = btf_add_type_offs_mem(btf, cnt); + if (!off) + return libbpf_err(-ENOMEM); + + /* Map the string offsets from src_btf to the offsets from btf to improve performance */ + p.str_off_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(p.str_off_map)) + return libbpf_err(-ENOMEM); + + /* bulk copy types data for all types from src_btf */ + memcpy(t, src_btf->types_data, data_sz); + + for (i = 0; i < cnt; i++) { + sz = btf_type_size(t); + if (sz < 0) { + /* unlikely, has to be corrupted src_btf */ + err = sz; + goto err_out; + } + + /* fill out type ID to type offset mapping for lookups by type ID */ + *off = t - btf->types_data; + + /* add, dedup, and remap strings referenced by this BTF type */ + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + goto err_out; + + /* remap all type IDs referenced from this BTF type */ + err = btf_type_visit_type_ids(t, btf_rewrite_type_ids, btf); + if (err) + goto err_out; + + /* go to next type data and type offset index entry */ + t += sz; + off++; + } + + /* Up until now any of the copied type data was effectively invisible, + * so if we exited early before this point due to error, BTF would be + * effectively unmodified. There would be extra internal memory + * pre-allocated, but it would not be available for querying. But now + * that we've copied and rewritten all the data successfully, we can + * update type count and various internal offsets and sizes to + * "commit" the changes and made them visible to the outside world. + */ + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types += cnt; + + hashmap__free(p.str_off_map); + + /* return type ID of the first added BTF type */ + return btf->start_id + btf->nr_types - cnt; +err_out: + /* zero out preallocated memory as if it was just allocated with + * libbpf_add_mem() + */ + memset(btf->types_data + btf->hdr->type_len, 0, data_sz); + memset(btf->strs_data + old_strs_len, 0, btf->hdr->str_len - old_strs_len); + + /* and now restore original strings section size; types data size + * wasn't modified, so doesn't need restoring, see big comment above + */ + btf->hdr->str_len = old_strs_len; + + hashmap__free(p.str_off_map); + + return libbpf_err(err); +} + +/* + * Append new BTF_KIND_INT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - power-of-2 (1, 2, 4, ..) size of the type, in bytes; + * - encoding is a combination of BTF_INT_SIGNED, BTF_INT_CHAR, BTF_INT_BOOL. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) + return libbpf_err(-EINVAL); + if (encoding & ~(BTF_INT_SIGNED | BTF_INT_CHAR | BTF_INT_BOOL)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(int); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* if something goes wrong later, we might end up with an extra string, + * but that shouldn't be a problem, because BTF can't be constructed + * completely anyway and will most probably be just discarded + */ + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_INT, 0, 0); + t->size = byte_sz; + /* set INT info, we don't allow setting legacy bit offset/size */ + *(__u32 *)(t + 1) = (encoding << 24) | (byte_sz * 8); + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* it's completely legal to append BTF types with type IDs pointing forward to + * types that haven't been appended yet, so we only make sure that id looks + * sane, we can't guarantee that ID will always be valid + */ +static int validate_type_id(int id) +{ + if (id < 0 || id > BTF_MAX_NR_TYPES) + return -EINVAL; + return 0; +} + +/* generic append function for PTR, TYPEDEF, CONST/VOLATILE/RESTRICT */ +static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref_type_id) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->type = ref_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_PTR type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_ptr(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_PTR, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_ARRAY type with: + * - *index_type_id* - type ID of the type describing array index; + * - *elem_type_id* - type ID of the type describing array element; + * - *nr_elems* - the size of the array; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems) +{ + struct btf_type *t; + struct btf_array *a; + int sz; + + if (validate_type_id(index_type_id) || validate_type_id(elem_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_array); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_ARRAY, 0, 0); + t->size = 0; + + a = btf_array(t); + a->type = elem_type_id; + a->index_type = index_type_id; + a->nelems = nr_elems; + + return btf_commit_type(btf, sz); +} + +/* generic STRUCT/UNION append function */ +static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 bytes_sz) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0 and no kflag; this will be adjusted when + * adding each member + */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->size = bytes_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_STRUCT type with: + * - *name* - name of the struct, can be NULL or empty for anonymous structs; + * - *byte_sz* - size of the struct, in bytes; + * + * Struct initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_struct() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_struct(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_STRUCT, name, byte_sz); +} + +/* + * Append new BTF_KIND_UNION type with: + * - *name* - name of the union, can be NULL or empty for anonymous union; + * - *byte_sz* - size of the union, in bytes; + * + * Union initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_union() succeeds. All fields + * should have *bit_offset* of 0. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_union(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_UNION, name, byte_sz); +} + +static struct btf_type *btf_last_type(struct btf *btf) +{ + return btf_type_by_id(btf, btf__type_cnt(btf) - 1); +} + +/* + * Append new field for the current STRUCT/UNION type with: + * - *name* - name of the field, can be NULL or empty for anonymous field; + * - *type_id* - type ID for the type describing field type; + * - *bit_offset* - bit offset of the start of the field within struct/union; + * - *bit_size* - bit size of a bitfield, 0 for non-bitfield fields; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_field(struct btf *btf, const char *name, int type_id, + __u32 bit_offset, __u32 bit_size) +{ + struct btf_type *t; + struct btf_member *m; + bool is_bitfield; + int sz, name_off = 0; + + /* last type should be union/struct */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_composite(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + /* best-effort bit field offset/size enforcement */ + is_bitfield = bit_size || (bit_offset % 8 != 0); + if (is_bitfield && (bit_size == 0 || bit_size > 255 || bit_offset > 0xffffff)) + return libbpf_err(-EINVAL); + + /* only offset 0 is allowed for unions */ + if (btf_is_union(t) && bit_offset) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_member); + m = btf_add_type_mem(btf, sz); + if (!m) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + m->name_off = name_off; + m->type = type_id; + m->offset = bit_offset | (bit_size << 24); + + /* btf_add_type_mem can invalidate t pointer */ + t = btf_last_type(btf); + /* update parent type's vlen and kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed, __u8 kind) +{ + struct btf_type *t; + int sz, name_off = 0; + + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0; it will be adjusted when adding enum values */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, is_signed); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_ENUM type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum_value() + * immediately after btf__add_enum() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +{ + /* + * set the signedness to be unsigned, it will change to signed + * if any later enumerator is negative. + */ + return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); +} + +/* + * Append new enum value for the current ENUM type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) +{ + struct btf_type *t; + struct btf_enum *v; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (value < INT_MIN || value > UINT_MAX) + return libbpf_err(-E2BIG); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val = value; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + /* if negative value, set signedness to signed */ + if (value < 0) + t->info = btf_type_info(btf_kind(t), btf_vlen(t), true); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_ENUM64 type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * - *is_signed* - whether the enum values are signed or not; + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum64_value() + * immediately after btf__add_enum64() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum64(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed) +{ + return btf_add_enum_common(btf, name, byte_sz, is_signed, + BTF_KIND_ENUM64); +} + +/* + * Append new enum value for the current ENUM64 type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) +{ + struct btf_enum64 *v; + struct btf_type *t; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM64 */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum64(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum64); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val_lo32 = (__u32)value; + v->val_hi32 = value >> 32; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_FWD type with: + * - *name*, non-empty/non-NULL name; + * - *fwd_kind*, kind of forward declaration, one of BTF_FWD_STRUCT, + * BTF_FWD_UNION, or BTF_FWD_ENUM; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + switch (fwd_kind) { + case BTF_FWD_STRUCT: + case BTF_FWD_UNION: { + struct btf_type *t; + int id; + + id = btf_add_ref_kind(btf, BTF_KIND_FWD, name, 0); + if (id <= 0) + return id; + t = btf_type_by_id(btf, id); + t->info = btf_type_info(BTF_KIND_FWD, 0, fwd_kind == BTF_FWD_UNION); + return id; + } + case BTF_FWD_ENUM: + /* enum forward in BTF currently is just an enum with no enum + * values; we also assume a standard 4-byte size for it + */ + return btf__add_enum(btf, name, sizeof(int)); + default: + return libbpf_err(-EINVAL); + } +} + +/* + * Append new BTF_KING_TYPEDEF type with: + * - *name*, non-empty/non-NULL name; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id); +} + +/* + * Append new BTF_KIND_VOLATILE type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_volatile(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_VOLATILE, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_CONST type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_const(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_CONST, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_RESTRICT type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_restrict(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_RESTRICT, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_TYPE_TAG type with: + * - *value*, non-empty/non-NULL tag value; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) +{ + if (!value || !value[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id); +} + +/* + * Append new BTF_KIND_FUNC type with: + * - *name*, non-empty/non-NULL name; + * - *proto_type_id* - FUNC_PROTO's type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id) +{ + int id; + + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && + linkage != BTF_FUNC_EXTERN) + return libbpf_err(-EINVAL); + + id = btf_add_ref_kind(btf, BTF_KIND_FUNC, name, proto_type_id); + if (id > 0) { + struct btf_type *t = btf_type_by_id(btf, id); + + t->info = btf_type_info(BTF_KIND_FUNC, linkage, 0); + } + return libbpf_err(id); +} + +/* + * Append new BTF_KIND_FUNC_PROTO with: + * - *ret_type_id* - type ID for return result of a function. + * + * Function prototype initially has no arguments, but they can be added by + * btf__add_func_param() one by one, immediately after + * btf__add_func_proto() succeeded. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func_proto(struct btf *btf, int ret_type_id) +{ + struct btf_type *t; + int sz; + + if (validate_type_id(ret_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* start out with vlen=0; this will be adjusted when adding enum + * values, if necessary + */ + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_FUNC_PROTO, 0, 0); + t->type = ret_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new function parameter for current FUNC_PROTO type with: + * - *name* - parameter name, can be NULL or empty; + * - *type_id* - type ID describing the type of the parameter. + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_func_param(struct btf *btf, const char *name, int type_id) +{ + struct btf_type *t; + struct btf_param *p; + int sz, name_off = 0; + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* last type should be BTF_KIND_FUNC_PROTO */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_func_proto(t)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_param); + p = btf_add_type_mem(btf, sz); + if (!p) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + p->name_off = name_off; + p->type = type_id; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_VAR type with: + * - *name* - non-empty/non-NULL name; + * - *linkage* - variable linkage, one of BTF_VAR_STATIC, + * BTF_VAR_GLOBAL_ALLOCATED, or BTF_VAR_GLOBAL_EXTERN; + * - *type_id* - type ID of the type describing the type of the variable. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) +{ + struct btf_type *t; + struct btf_var *v; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && + linkage != BTF_VAR_GLOBAL_EXTERN) + return libbpf_err(-EINVAL); + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_var); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_VAR, 0, 0); + t->type = type_id; + + v = btf_var(t); + v->linkage = linkage; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_DATASEC type with: + * - *name* - non-empty/non-NULL name; + * - *byte_sz* - data section size, in bytes. + * + * Data section is initially empty. Variables info can be added with + * btf__add_datasec_var_info() calls, after btf__add_datasec() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + /* start with vlen=0, which will be update as var_secinfos are added */ + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_DATASEC, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new data section variable information entry for current DATASEC type: + * - *var_type_id* - type ID, describing type of the variable; + * - *offset* - variable offset within data section, in bytes; + * - *byte_sz* - variable size, in bytes. + * + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __u32 byte_sz) +{ + struct btf_type *t; + struct btf_var_secinfo *v; + int sz; + + /* last type should be BTF_KIND_DATASEC */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_datasec(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(var_type_id)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_var_secinfo); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + v->type = var_type_id; + v->offset = offset; + v->size = byte_sz; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_DECL_TAG type with: + * - *value* - non-empty/non-NULL string; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * - *component_idx* - -1 for tagging reference type, otherwise struct/union + * member or function argument index; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx) +{ + struct btf_type *t; + int sz, value_off; + + if (!value || !value[0] || component_idx < -1) + return libbpf_err(-EINVAL); + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_decl_tag); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + value_off = btf__add_str(btf, value); + if (value_off < 0) + return value_off; + + t->name_off = value_off; + t->info = btf_type_info(BTF_KIND_DECL_TAG, 0, false); + t->type = ref_type_id; + btf_decl_tag(t)->component_idx = component_idx; + + return btf_commit_type(btf, sz); +} + +struct btf_ext_sec_setup_param { + __u32 off; + __u32 len; + __u32 min_rec_size; + struct btf_ext_info *ext_info; + const char *desc; +}; + +static int btf_ext_setup_info(struct btf_ext *btf_ext, + struct btf_ext_sec_setup_param *ext_sec) +{ + const struct btf_ext_info_sec *sinfo; + struct btf_ext_info *ext_info; + __u32 info_left, record_size; + size_t sec_cnt = 0; + /* The start of the info sec (including the __u32 record_size). */ + void *info; + + if (ext_sec->len == 0) + return 0; + + if (ext_sec->off & 0x03) { + pr_debug(".BTF.ext %s section is not aligned to 4 bytes\n", + ext_sec->desc); + return -EINVAL; + } + + info = btf_ext->data + btf_ext->hdr->hdr_len + ext_sec->off; + info_left = ext_sec->len; + + if (btf_ext->data + btf_ext->data_size < info + ext_sec->len) { + pr_debug("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", + ext_sec->desc, ext_sec->off, ext_sec->len); + return -EINVAL; + } + + /* At least a record size */ + if (info_left < sizeof(__u32)) { + pr_debug(".BTF.ext %s record size not found\n", ext_sec->desc); + return -EINVAL; + } + + /* The record size needs to meet the minimum standard */ + record_size = *(__u32 *)info; + if (record_size < ext_sec->min_rec_size || + record_size & 0x03) { + pr_debug("%s section in .BTF.ext has invalid record size %u\n", + ext_sec->desc, record_size); + return -EINVAL; + } + + sinfo = info + sizeof(__u32); + info_left -= sizeof(__u32); + + /* If no records, return failure now so .BTF.ext won't be used. */ + if (!info_left) { + pr_debug("%s section in .BTF.ext has no records", ext_sec->desc); + return -EINVAL; + } + + while (info_left) { + unsigned int sec_hdrlen = sizeof(struct btf_ext_info_sec); + __u64 total_record_size; + __u32 num_records; + + if (info_left < sec_hdrlen) { + pr_debug("%s section header is not found in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + num_records = sinfo->num_info; + if (num_records == 0) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + total_record_size = sec_hdrlen + (__u64)num_records * record_size; + if (info_left < total_record_size) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + info_left -= total_record_size; + sinfo = (void *)sinfo + total_record_size; + sec_cnt++; + } + + ext_info = ext_sec->ext_info; + ext_info->len = ext_sec->len - sizeof(__u32); + ext_info->rec_size = record_size; + ext_info->info = info + sizeof(__u32); + ext_info->sec_cnt = sec_cnt; + + return 0; +} + +static int btf_ext_setup_func_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->func_info_off, + .len = btf_ext->hdr->func_info_len, + .min_rec_size = sizeof(struct bpf_func_info_min), + .ext_info = &btf_ext->func_info, + .desc = "func_info" + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_line_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->line_info_off, + .len = btf_ext->hdr->line_info_len, + .min_rec_size = sizeof(struct bpf_line_info_min), + .ext_info = &btf_ext->line_info, + .desc = "line_info", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_core_relos(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->core_relo_off, + .len = btf_ext->hdr->core_relo_len, + .min_rec_size = sizeof(struct bpf_core_relo), + .ext_info = &btf_ext->core_relo_info, + .desc = "core_relo", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) +{ + const struct btf_ext_header *hdr = (struct btf_ext_header *)data; + + if (data_size < offsetofend(struct btf_ext_header, hdr_len) || + data_size < hdr->hdr_len) { + pr_debug("BTF.ext header not found"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { + pr_warn("BTF.ext in non-native endianness is not supported\n"); + return -ENOTSUP; + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF.ext magic:%x\n", hdr->magic); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + pr_debug("Unsupported BTF.ext version:%u\n", hdr->version); + return -ENOTSUP; + } + + if (hdr->flags) { + pr_debug("Unsupported BTF.ext flags:%x\n", hdr->flags); + return -ENOTSUP; + } + + if (data_size == hdr->hdr_len) { + pr_debug("BTF.ext has no data\n"); + return -EINVAL; + } + + return 0; +} + +void btf_ext__free(struct btf_ext *btf_ext) +{ + if (IS_ERR_OR_NULL(btf_ext)) + return; + free(btf_ext->func_info.sec_idxs); + free(btf_ext->line_info.sec_idxs); + free(btf_ext->core_relo_info.sec_idxs); + free(btf_ext->data); + free(btf_ext); +} + +struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) +{ + struct btf_ext *btf_ext; + int err; + + btf_ext = calloc(1, sizeof(struct btf_ext)); + if (!btf_ext) + return libbpf_err_ptr(-ENOMEM); + + btf_ext->data_size = size; + btf_ext->data = malloc(size); + if (!btf_ext->data) { + err = -ENOMEM; + goto done; + } + memcpy(btf_ext->data, data, size); + + err = btf_ext_parse_hdr(btf_ext->data, size); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) { + err = -EINVAL; + goto done; + } + + err = btf_ext_setup_func_info(btf_ext); + if (err) + goto done; + + err = btf_ext_setup_line_info(btf_ext); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) + goto done; /* skip core relos parsing */ + + err = btf_ext_setup_core_relos(btf_ext); + if (err) + goto done; + +done: + if (err) { + btf_ext__free(btf_ext); + return libbpf_err_ptr(err); + } + + return btf_ext; +} + +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +{ + *size = btf_ext->data_size; + return btf_ext->data; +} + +struct btf_dedup; + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); +static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_prep(struct btf_dedup *d); +static int btf_dedup_strings(struct btf_dedup *d); +static int btf_dedup_prim_types(struct btf_dedup *d); +static int btf_dedup_struct_types(struct btf_dedup *d); +static int btf_dedup_ref_types(struct btf_dedup *d); +static int btf_dedup_resolve_fwds(struct btf_dedup *d); +static int btf_dedup_compact_types(struct btf_dedup *d); +static int btf_dedup_remap_types(struct btf_dedup *d); + +/* + * Deduplicate BTF types and strings. + * + * BTF dedup algorithm takes as an input `struct btf` representing `.BTF` ELF + * section with all BTF type descriptors and string data. It overwrites that + * memory in-place with deduplicated types and strings without any loss of + * information. If optional `struct btf_ext` representing '.BTF.ext' ELF section + * is provided, all the strings referenced from .BTF.ext section are honored + * and updated to point to the right offsets after deduplication. + * + * If function returns with error, type/string data might be garbled and should + * be discarded. + * + * More verbose and detailed description of both problem btf_dedup is solving, + * as well as solution could be found at: + * https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html + * + * Problem description and justification + * ===================================== + * + * BTF type information is typically emitted either as a result of conversion + * from DWARF to BTF or directly by compiler. In both cases, each compilation + * unit contains information about a subset of all the types that are used + * in an application. These subsets are frequently overlapping and contain a lot + * of duplicated information when later concatenated together into a single + * binary. This algorithm ensures that each unique type is represented by single + * BTF type descriptor, greatly reducing resulting size of BTF data. + * + * Compilation unit isolation and subsequent duplication of data is not the only + * problem. The same type hierarchy (e.g., struct and all the type that struct + * references) in different compilation units can be represented in BTF to + * various degrees of completeness (or, rather, incompleteness) due to + * struct/union forward declarations. + * + * Let's take a look at an example, that we'll use to better understand the + * problem (and solution). Suppose we have two compilation units, each using + * same `struct S`, but each of them having incomplete type information about + * struct's fields: + * + * // CU #1: + * struct S; + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * // CU #2: + * struct S; + * struct A; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * In case of CU #1, BTF data will know only that `struct B` exist (but no + * more), but will know the complete type information about `struct A`. While + * for CU #2, it will know full type information about `struct B`, but will + * only know about forward declaration of `struct A` (in BTF terms, it will + * have `BTF_KIND_FWD` type descriptor with name `B`). + * + * This compilation unit isolation means that it's possible that there is no + * single CU with complete type information describing structs `S`, `A`, and + * `B`. Also, we might get tons of duplicated and redundant type information. + * + * Additional complication we need to keep in mind comes from the fact that + * types, in general, can form graphs containing cycles, not just DAGs. + * + * While algorithm does deduplication, it also merges and resolves type + * information (unless disabled throught `struct btf_opts`), whenever possible. + * E.g., in the example above with two compilation units having partial type + * information for structs `A` and `B`, the output of algorithm will emit + * a single copy of each BTF type that describes structs `A`, `B`, and `S` + * (as well as type information for `int` and pointers), as if they were defined + * in a single compilation unit as: + * + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * Algorithm summary + * ================= + * + * Algorithm completes its work in 7 separate passes: + * + * 1. Strings deduplication. + * 2. Primitive types deduplication (int, enum, fwd). + * 3. Struct/union types deduplication. + * 4. Resolve unambiguous forward declarations. + * 5. Reference types deduplication (pointers, typedefs, arrays, funcs, func + * protos, and const/volatile/restrict modifiers). + * 6. Types compaction. + * 7. Types remapping. + * + * Algorithm determines canonical type descriptor, which is a single + * representative type for each truly unique type. This canonical type is the + * one that will go into final deduplicated BTF type information. For + * struct/unions, it is also the type that algorithm will merge additional type + * information into (while resolving FWDs), as it discovers it from data in + * other CUs. Each input BTF type eventually gets either mapped to itself, if + * that type is canonical, or to some other type, if that type is equivalent + * and was chosen as canonical representative. This mapping is stored in + * `btf_dedup->map` array. This map is also used to record STRUCT/UNION that + * FWD type got resolved to. + * + * To facilitate fast discovery of canonical types, we also maintain canonical + * index (`btf_dedup->dedup_table`), which maps type descriptor's signature hash + * (i.e., hashed kind, name, size, fields, etc) into a list of canonical types + * that match that signature. With sufficiently good choice of type signature + * hashing function, we can limit number of canonical types for each unique type + * signature to a very small number, allowing to find canonical type for any + * duplicated type very quickly. + * + * Struct/union deduplication is the most critical part and algorithm for + * deduplicating structs/unions is described in greater details in comments for + * `btf_dedup_is_equiv` function. + */ +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d; + int err; + + if (!OPTS_VALID(opts, btf_dedup_opts)) + return libbpf_err(-EINVAL); + + d = btf_dedup_new(btf, opts); + if (IS_ERR(d)) { + pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d)); + return libbpf_err(-EINVAL); + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + err = btf_dedup_prep(d); + if (err) { + pr_debug("btf_dedup_prep failed:%d\n", err); + goto done; + } + err = btf_dedup_strings(d); + if (err < 0) { + pr_debug("btf_dedup_strings failed:%d\n", err); + goto done; + } + err = btf_dedup_prim_types(d); + if (err < 0) { + pr_debug("btf_dedup_prim_types failed:%d\n", err); + goto done; + } + err = btf_dedup_struct_types(d); + if (err < 0) { + pr_debug("btf_dedup_struct_types failed:%d\n", err); + goto done; + } + err = btf_dedup_resolve_fwds(d); + if (err < 0) { + pr_debug("btf_dedup_resolve_fwds failed:%d\n", err); + goto done; + } + err = btf_dedup_ref_types(d); + if (err < 0) { + pr_debug("btf_dedup_ref_types failed:%d\n", err); + goto done; + } + err = btf_dedup_compact_types(d); + if (err < 0) { + pr_debug("btf_dedup_compact_types failed:%d\n", err); + goto done; + } + err = btf_dedup_remap_types(d); + if (err < 0) { + pr_debug("btf_dedup_remap_types failed:%d\n", err); + goto done; + } + +done: + btf_dedup_free(d); + return libbpf_err(err); +} + +#define BTF_UNPROCESSED_ID ((__u32)-1) +#define BTF_IN_PROGRESS_ID ((__u32)-2) + +struct btf_dedup { + /* .BTF section to be deduped in-place */ + struct btf *btf; + /* + * Optional .BTF.ext section. When provided, any strings referenced + * from it will be taken into account when deduping strings + */ + struct btf_ext *btf_ext; + /* + * This is a map from any type's signature hash to a list of possible + * canonical representative type candidates. Hash collisions are + * ignored, so even types of various kinds can share same list of + * candidates, which is fine because we rely on subsequent + * btf_xxx_equal() checks to authoritatively verify type equality. + */ + struct hashmap *dedup_table; + /* Canonical types map */ + __u32 *map; + /* Hypothetical mapping, used during type graph equivalence checks */ + __u32 *hypot_map; + __u32 *hypot_list; + size_t hypot_cnt; + size_t hypot_cap; + /* Whether hypothetical mapping, if successful, would need to adjust + * already canonicalized types (due to a new forward declaration to + * concrete type resolution). In such case, during split BTF dedup + * candidate type would still be considered as different, because base + * BTF is considered to be immutable. + */ + bool hypot_adjust_canon; + /* Various option modifying behavior of algorithm */ + struct btf_dedup_opts opts; + /* temporary strings deduplication state */ + struct strset *strs_set; +}; + +static long hash_combine(long h, long value) +{ + return h * 31 + value; +} + +#define for_each_dedup_cand(d, node, hash) \ + hashmap__for_each_key_entry(d->dedup_table, node, hash) + +static int btf_dedup_table_add(struct btf_dedup *d, long hash, __u32 type_id) +{ + return hashmap__append(d->dedup_table, hash, type_id); +} + +static int btf_dedup_hypot_map_add(struct btf_dedup *d, + __u32 from_id, __u32 to_id) +{ + if (d->hypot_cnt == d->hypot_cap) { + __u32 *new_list; + + d->hypot_cap += max((size_t)16, d->hypot_cap / 2); + new_list = libbpf_reallocarray(d->hypot_list, d->hypot_cap, sizeof(__u32)); + if (!new_list) + return -ENOMEM; + d->hypot_list = new_list; + } + d->hypot_list[d->hypot_cnt++] = from_id; + d->hypot_map[from_id] = to_id; + return 0; +} + +static void btf_dedup_clear_hypot_map(struct btf_dedup *d) +{ + int i; + + for (i = 0; i < d->hypot_cnt; i++) + d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; + d->hypot_cnt = 0; + d->hypot_adjust_canon = false; +} + +static void btf_dedup_free(struct btf_dedup *d) +{ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + + free(d->map); + d->map = NULL; + + free(d->hypot_map); + d->hypot_map = NULL; + + free(d->hypot_list); + d->hypot_list = NULL; + + free(d); +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx) +{ + return key; +} + +static size_t btf_dedup_collision_hash_fn(long key, void *ctx) +{ + return 0; +} + +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx) +{ + return k1 == k2; +} + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); + hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn; + int i, err = 0, type_cnt; + + if (!d) + return ERR_PTR(-ENOMEM); + + if (OPTS_GET(opts, force_collisions, false)) + hash_fn = btf_dedup_collision_hash_fn; + + d->btf = btf; + d->btf_ext = OPTS_GET(opts, btf_ext, NULL); + + d->dedup_table = hashmap__new(hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(d->dedup_table)) { + err = PTR_ERR(d->dedup_table); + d->dedup_table = NULL; + goto done; + } + + type_cnt = btf__type_cnt(btf); + d->map = malloc(sizeof(__u32) * type_cnt); + if (!d->map) { + err = -ENOMEM; + goto done; + } + /* special BTF "void" type is made canonical immediately */ + d->map[0] = 0; + for (i = 1; i < type_cnt; i++) { + struct btf_type *t = btf_type_by_id(d->btf, i); + + /* VAR and DATASEC are never deduped and are self-canonical */ + if (btf_is_var(t) || btf_is_datasec(t)) + d->map[i] = i; + else + d->map[i] = BTF_UNPROCESSED_ID; + } + + d->hypot_map = malloc(sizeof(__u32) * type_cnt); + if (!d->hypot_map) { + err = -ENOMEM; + goto done; + } + for (i = 0; i < type_cnt; i++) + d->hypot_map[i] = BTF_UNPROCESSED_ID; + +done: + if (err) { + btf_dedup_free(d); + return ERR_PTR(err); + } + + return d; +} + +/* + * Iterate over all possible places in .BTF and .BTF.ext that can reference + * string and pass pointer to it to a provided callback `fn`. + */ +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; + + return 0; +} + +static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 str_off = *str_off_ptr; + const char *s; + int off, err; + + /* don't touch empty string or string in main BTF */ + if (str_off == 0 || str_off < d->btf->start_str_off) + return 0; + + s = btf__str_by_offset(d->btf, str_off); + if (d->btf->base_btf) { + err = btf__find_str(d->btf->base_btf, s); + if (err >= 0) { + *str_off_ptr = err; + return 0; + } + if (err != -ENOENT) + return err; + } + + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; + + *str_off_ptr = d->btf->start_str_off + off; + return 0; +} + +/* + * Dedup string and filter out those that are not referenced from either .BTF + * or .BTF.ext (if provided) sections. + * + * This is done by building index of all strings in BTF's string section, + * then iterating over all entities that can reference strings (e.g., type + * names, struct field names, .BTF.ext line info, etc) and marking corresponding + * strings as used. After that all used strings are deduped and compacted into + * sequential blob of memory and new offsets are calculated. Then all the string + * references are iterated again and rewritten using new offsets. + */ +static int btf_dedup_strings(struct btf_dedup *d) +{ + int err; + + if (d->btf->strs_deduped) + return 0; + + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); + goto err_out; + } + + if (!d->btf->base_btf) { + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups + */ + err = strset__add_str(d->strs_set, ""); + if (err < 0) + goto err_out; + } + + /* remap string offsets */ + err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); + if (err) + goto err_out; + + /* replace BTF string data and hash with deduped ones */ + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; + d->btf->strs_deduped = true; + return 0; + +err_out: + strset__free(d->strs_set); + d->strs_set = NULL; + + return err; +} + +static long btf_hash_common(struct btf_type *t) +{ + long h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + h = hash_combine(h, t->size); + return h; +} + +static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info && + t1->size == t2->size; +} + +/* Calculate type signature hash of INT or TAG. */ +static long btf_hash_int_decl_tag(struct btf_type *t) +{ + __u32 info = *(__u32 *)(t + 1); + long h; + + h = btf_hash_common(t); + h = hash_combine(h, info); + return h; +} + +/* Check structural equality of two INTs or TAGs. */ +static bool btf_equal_int_tag(struct btf_type *t1, struct btf_type *t2) +{ + __u32 info1, info2; + + if (!btf_equal_common(t1, t2)) + return false; + info1 = *(__u32 *)(t1 + 1); + info2 = *(__u32 *)(t2 + 1); + return info1 == info2; +} + +/* Calculate type signature hash of ENUM/ENUM64. */ +static long btf_hash_enum(struct btf_type *t) +{ + long h; + + /* don't hash vlen, enum members and size to support enum fwd resolving */ + h = hash_combine(0, t->name_off); + return h; +} + +static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum(t1); + m2 = btf_enum(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val != m2->val) + return false; + m1++; + m2++; + } + return true; +} + +static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum64 *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum64(t1); + m2 = btf_enum64(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val_lo32 != m2->val_lo32 || + m1->val_hi32 != m2->val_hi32) + return false; + m1++; + m2++; + } + return true; +} + +/* Check structural equality of two ENUMs or ENUM64s. */ +static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + /* t1 & t2 kinds are identical because of btf_equal_common */ + if (btf_kind(t1) == BTF_KIND_ENUM) + return btf_equal_enum_members(t1, t2); + else + return btf_equal_enum64_members(t1, t2); +} + +static inline bool btf_is_enum_fwd(struct btf_type *t) +{ + return btf_is_any_enum(t) && btf_vlen(t) == 0; +} + +static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_is_enum_fwd(t1) && !btf_is_enum_fwd(t2)) + return btf_equal_enum(t1, t2); + /* At this point either t1 or t2 or both are forward declarations, thus: + * - skip comparing vlen because it is zero for forward declarations; + * - skip comparing size to allow enum forward declarations + * to be compatible with enum64 full declarations; + * - skip comparing kind for the same reason. + */ + return t1->name_off == t2->name_off && + btf_is_any_enum(t1) && btf_is_any_enum(t2); +} + +/* + * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. + */ +static long btf_hash_struct(struct btf_type *t) +{ + const struct btf_member *member = btf_members(t); + __u32 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->offset); + /* no hashing of referenced type ID, it can be unresolved yet */ + member++; + } + return h; +} + +/* + * Check structural compatibility of two STRUCTs/UNIONs, ignoring referenced + * type IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_member *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->offset != m2->offset) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Calculate type signature hash of ARRAY, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static long btf_hash_array(struct btf_type *t) +{ + const struct btf_array *info = btf_array(t); + long h = btf_hash_common(t); + + h = hash_combine(h, info->type); + h = hash_combine(h, info->index_type); + h = hash_combine(h, info->nelems); + return h; +} + +/* + * Check exact equality of two ARRAYs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * ARRAY to potential canonical representative. + */ +static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_array *info1, *info2; + + if (!btf_equal_common(t1, t2)) + return false; + + info1 = btf_array(t1); + info2 = btf_array(t2); + return info1->type == info2->type && + info1->index_type == info2->index_type && + info1->nelems == info2->nelems; +} + +/* + * Check structural compatibility of two ARRAYs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + return btf_array(t1)->nelems == btf_array(t2)->nelems; +} + +/* + * Calculate type signature hash of FUNC_PROTO, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static long btf_hash_fnproto(struct btf_type *t) +{ + const struct btf_param *member = btf_params(t); + __u16 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->type); + member++; + } + return h; +} + +/* + * Check exact equality of two FUNC_PROTOs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * FUNC_PROTO to potential canonical representative. + */ +static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->type != m2->type) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + /* skip return type ID */ + if (t1->name_off != t2->name_off || t1->info != t2->info) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off) + return false; + m1++; + m2++; + } + return true; +} + +/* Prepare split BTF for deduplication by calculating hashes of base BTF's + * types and initializing the rest of the state (canonical type mapping) for + * the fixed base BTF part. + */ +static int btf_dedup_prep(struct btf_dedup *d) +{ + struct btf_type *t; + int type_id; + long h; + + if (!d->btf->base_btf) + return 0; + + for (type_id = 1; type_id < d->btf->start_id; type_id++) { + t = btf_type_by_id(d->btf, type_id); + + /* all base BTF types are self-canonical by definition */ + d->map[type_id] = type_id; + + switch (btf_kind(t)) { + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + /* VAR and DATASEC are never hash/deduplicated */ + continue; + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_FWD: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + h = btf_hash_common(t); + break; + case BTF_KIND_INT: + case BTF_KIND_DECL_TAG: + h = btf_hash_int_decl_tag(t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + h = btf_hash_struct(t); + break; + case BTF_KIND_ARRAY: + h = btf_hash_array(t); + break; + case BTF_KIND_FUNC_PROTO: + h = btf_hash_fnproto(t); + break; + default: + pr_debug("unknown kind %d for type [%d]\n", btf_kind(t), type_id); + return -EINVAL; + } + if (btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + } + + return 0; +} + +/* + * Deduplicate primitive types, that can't reference other types, by calculating + * their type signature hash and comparing them with any possible canonical + * candidate. If no canonical candidate matches, type itself is marked as + * canonical and is added into `btf_dedup->dedup_table` as another candidate. + */ +static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + struct hashmap_entry *hash_entry; + struct btf_type *cand; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u32 cand_id; + long h; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_ARRAY: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return 0; + + case BTF_KIND_INT: + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_enum(t, cand)) { + new_id = cand_id; + break; + } + if (btf_compat_enum(t, cand)) { + if (btf_is_enum_fwd(t)) { + /* resolve fwd to full enum */ + new_id = cand_id; + break; + } + /* resolve canonical enum fwd to full enum */ + d->map[cand_id] = type_id; + } + } + break; + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_prim_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_prim_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Check whether type is already mapped into canonical one (could be to itself). + */ +static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id) +{ + return d->map[type_id] <= BTF_MAX_NR_TYPES; +} + +/* + * Resolve type ID into its canonical type ID, if any; otherwise return original + * type ID. If type is FWD and is resolved into STRUCT/UNION already, follow + * STRUCT/UNION link and resolve it into canonical type ID as well. + */ +static inline __u32 resolve_type_id(struct btf_dedup *d, __u32 type_id) +{ + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + return type_id; +} + +/* + * Resolve FWD to underlying STRUCT/UNION, if any; otherwise return original + * type ID. + */ +static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id) +{ + __u32 orig_type_id = type_id; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + return orig_type_id; +} + + +static inline __u16 btf_fwd_kind(struct btf_type *t) +{ + return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT; +} + +/* Check if given two types are identical ARRAY definitions */ +static bool btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + struct btf_type *t1, *t2; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + if (!btf_is_array(t1) || !btf_is_array(t2)) + return false; + + return btf_equal_array(t1, t2); +} + +/* Check if given two types are identical STRUCT/UNION definitions */ +static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + const struct btf_member *m1, *m2; + struct btf_type *t1, *t2; + int n, i; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + + if (!btf_is_composite(t1) || btf_kind(t1) != btf_kind(t2)) + return false; + + if (!btf_shallow_equal_struct(t1, t2)) + return false; + + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { + if (m1->type != m2->type && + !btf_dedup_identical_arrays(d, m1->type, m2->type) && + !btf_dedup_identical_structs(d, m1->type, m2->type)) + return false; + } + return true; +} + +/* + * Check equivalence of BTF type graph formed by candidate struct/union (we'll + * call it "candidate graph" in this description for brevity) to a type graph + * formed by (potential) canonical struct/union ("canonical graph" for brevity + * here, though keep in mind that not all types in canonical graph are + * necessarily canonical representatives themselves, some of them might be + * duplicates or its uniqueness might not have been established yet). + * Returns: + * - >0, if type graphs are equivalent; + * - 0, if not equivalent; + * - <0, on error. + * + * Algorithm performs side-by-side DFS traversal of both type graphs and checks + * equivalence of BTF types at each step. If at any point BTF types in candidate + * and canonical graphs are not compatible structurally, whole graphs are + * incompatible. If types are structurally equivalent (i.e., all information + * except referenced type IDs is exactly the same), a mapping from `canon_id` to + * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). + * If a type references other types, then those referenced types are checked + * for equivalence recursively. + * + * During DFS traversal, if we find that for current `canon_id` type we + * already have some mapping in hypothetical map, we check for two possible + * situations: + * - `canon_id` is mapped to exactly the same type as `cand_id`. This will + * happen when type graphs have cycles. In this case we assume those two + * types are equivalent. + * - `canon_id` is mapped to different type. This is contradiction in our + * hypothetical mapping, because same graph in canonical graph corresponds + * to two different types in candidate graph, which for equivalent type + * graphs shouldn't happen. This condition terminates equivalence check + * with negative result. + * + * If type graphs traversal exhausts types to check and find no contradiction, + * then type graphs are equivalent. + * + * When checking types for equivalence, there is one special case: FWD types. + * If FWD type resolution is allowed and one of the types (either from canonical + * or candidate graph) is FWD and other is STRUCT/UNION (depending on FWD's kind + * flag) and their names match, hypothetical mapping is updated to point from + * FWD to STRUCT/UNION. If graphs will be determined as equivalent successfully, + * this mapping will be used to record FWD -> STRUCT/UNION mapping permanently. + * + * Technically, this could lead to incorrect FWD to STRUCT/UNION resolution, + * if there are two exactly named (or anonymous) structs/unions that are + * compatible structurally, one of which has FWD field, while other is concrete + * STRUCT/UNION, but according to C sources they are different structs/unions + * that are referencing different types with the same name. This is extremely + * unlikely to happen, but btf_dedup API allows to disable FWD resolution if + * this logic is causing problems. + * + * Doing FWD resolution means that both candidate and/or canonical graphs can + * consists of portions of the graph that come from multiple compilation units. + * This is due to the fact that types within single compilation unit are always + * deduplicated and FWDs are already resolved, if referenced struct/union + * definiton is available. So, if we had unresolved FWD and found corresponding + * STRUCT/UNION, they will be from different compilation units. This + * consequently means that when we "link" FWD to corresponding STRUCT/UNION, + * type graph will likely have at least two different BTF types that describe + * same type (e.g., most probably there will be two different BTF types for the + * same 'int' primitive type) and could even have "overlapping" parts of type + * graph that describe same subset of types. + * + * This in turn means that our assumption that each type in canonical graph + * must correspond to exactly one type in candidate graph might not hold + * anymore and will make it harder to detect contradictions using hypothetical + * map. To handle this problem, we allow to follow FWD -> STRUCT/UNION + * resolution only in canonical graph. FWDs in candidate graphs are never + * resolved. To see why it's OK, let's check all possible situations w.r.t. FWDs + * that can occur: + * - Both types in canonical and candidate graphs are FWDs. If they are + * structurally equivalent, then they can either be both resolved to the + * same STRUCT/UNION or not resolved at all. In both cases they are + * equivalent and there is no need to resolve FWD on candidate side. + * - Both types in canonical and candidate graphs are concrete STRUCT/UNION, + * so nothing to resolve as well, algorithm will check equivalence anyway. + * - Type in canonical graph is FWD, while type in candidate is concrete + * STRUCT/UNION. In this case candidate graph comes from single compilation + * unit, so there is exactly one BTF type for each unique C type. After + * resolving FWD into STRUCT/UNION, there might be more than one BTF type + * in canonical graph mapping to single BTF type in candidate graph, but + * because hypothetical mapping maps from canonical to candidate types, it's + * alright, and we still maintain the property of having single `canon_id` + * mapping to single `cand_id` (there could be two different `canon_id` + * mapped to the same `cand_id`, but it's not contradictory). + * - Type in canonical graph is concrete STRUCT/UNION, while type in candidate + * graph is FWD. In this case we are just going to check compatibility of + * STRUCT/UNION and corresponding FWD, and if they are compatible, we'll + * assume that whatever STRUCT/UNION FWD resolves to must be equivalent to + * a concrete STRUCT/UNION from canonical graph. If the rest of type graphs + * turn out equivalent, we'll re-resolve FWD to concrete STRUCT/UNION from + * canonical graph. + */ +static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, + __u32 canon_id) +{ + struct btf_type *cand_type; + struct btf_type *canon_type; + __u32 hypot_type_id; + __u16 cand_kind; + __u16 canon_kind; + int i, eq; + + /* if both resolve to the same canonical, they must be equivalent */ + if (resolve_type_id(d, cand_id) == resolve_type_id(d, canon_id)) + return 1; + + canon_id = resolve_fwd_id(d, canon_id); + + hypot_type_id = d->hypot_map[canon_id]; + if (hypot_type_id <= BTF_MAX_NR_TYPES) { + if (hypot_type_id == cand_id) + return 1; + /* In some cases compiler will generate different DWARF types + * for *identical* array type definitions and use them for + * different fields within the *same* struct. This breaks type + * equivalence check, which makes an assumption that candidate + * types sub-graph has a consistent and deduped-by-compiler + * types within a single CU. So work around that by explicitly + * allowing identical array types here. + */ + if (btf_dedup_identical_arrays(d, hypot_type_id, cand_id)) + return 1; + /* It turns out that similar situation can happen with + * struct/union sometimes, sigh... Handle the case where + * structs/unions are exactly the same, down to the referenced + * type IDs. Anything more complicated (e.g., if referenced + * types are different, but equivalent) is *way more* + * complicated and requires a many-to-many equivalence mapping. + */ + if (btf_dedup_identical_structs(d, hypot_type_id, cand_id)) + return 1; + return 0; + } + + if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) + return -ENOMEM; + + cand_type = btf_type_by_id(d->btf, cand_id); + canon_type = btf_type_by_id(d->btf, canon_id); + cand_kind = btf_kind(cand_type); + canon_kind = btf_kind(canon_type); + + if (cand_type->name_off != canon_type->name_off) + return 0; + + /* FWD <--> STRUCT/UNION equivalence check, if enabled */ + if ((cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD) + && cand_kind != canon_kind) { + __u16 real_kind; + __u16 fwd_kind; + + if (cand_kind == BTF_KIND_FWD) { + real_kind = canon_kind; + fwd_kind = btf_fwd_kind(cand_type); + } else { + real_kind = cand_kind; + fwd_kind = btf_fwd_kind(canon_type); + /* we'd need to resolve base FWD to STRUCT/UNION */ + if (fwd_kind == real_kind && canon_id < d->btf->start_id) + d->hypot_adjust_canon = true; + } + return fwd_kind == real_kind; + } + + if (cand_kind != canon_kind) + return 0; + + switch (cand_kind) { + case BTF_KIND_INT: + return btf_equal_int_tag(cand_type, canon_type); + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return btf_compat_enum(cand_type, canon_type); + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + return btf_equal_common(cand_type, canon_type); + + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + if (cand_type->info != canon_type->info) + return 0; + return btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + + case BTF_KIND_ARRAY: { + const struct btf_array *cand_arr, *canon_arr; + + if (!btf_compat_array(cand_type, canon_type)) + return 0; + cand_arr = btf_array(cand_type); + canon_arr = btf_array(canon_type); + eq = btf_dedup_is_equiv(d, cand_arr->index_type, canon_arr->index_type); + if (eq <= 0) + return eq; + return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *cand_m, *canon_m; + __u16 vlen; + + if (!btf_shallow_equal_struct(cand_type, canon_type)) + return 0; + vlen = btf_vlen(cand_type); + cand_m = btf_members(cand_type); + canon_m = btf_members(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); + if (eq <= 0) + return eq; + cand_m++; + canon_m++; + } + + return 1; + } + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *cand_p, *canon_p; + __u16 vlen; + + if (!btf_compat_fnproto(cand_type, canon_type)) + return 0; + eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + if (eq <= 0) + return eq; + vlen = btf_vlen(cand_type); + cand_p = btf_params(cand_type); + canon_p = btf_params(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type); + if (eq <= 0) + return eq; + cand_p++; + canon_p++; + } + return 1; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Use hypothetical mapping, produced by successful type graph equivalence + * check, to augment existing struct/union canonical mapping, where possible. + * + * If BTF_KIND_FWD resolution is allowed, this mapping is also used to record + * FWD -> STRUCT/UNION correspondence as well. FWD resolution is bidirectional: + * it doesn't matter if FWD type was part of canonical graph or candidate one, + * we are recording the mapping anyway. As opposed to carefulness required + * for struct/union correspondence mapping (described below), for FWD resolution + * it's not important, as by the time that FWD type (reference type) will be + * deduplicated all structs/unions will be deduped already anyway. + * + * Recording STRUCT/UNION mapping is purely a performance optimization and is + * not required for correctness. It needs to be done carefully to ensure that + * struct/union from candidate's type graph is not mapped into corresponding + * struct/union from canonical type graph that itself hasn't been resolved into + * canonical representative. The only guarantee we have is that canonical + * struct/union was determined as canonical and that won't change. But any + * types referenced through that struct/union fields could have been not yet + * resolved, so in case like that it's too early to establish any kind of + * correspondence between structs/unions. + * + * No canonical correspondence is derived for primitive types (they are already + * deduplicated completely already anyway) or reference types (they rely on + * stability of struct/union canonical relationship for equivalence checks). + */ +static void btf_dedup_merge_hypot_map(struct btf_dedup *d) +{ + __u32 canon_type_id, targ_type_id; + __u16 t_kind, c_kind; + __u32 t_id, c_id; + int i; + + for (i = 0; i < d->hypot_cnt; i++) { + canon_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[canon_type_id]; + t_id = resolve_type_id(d, targ_type_id); + c_id = resolve_type_id(d, canon_type_id); + t_kind = btf_kind(btf__type_by_id(d->btf, t_id)); + c_kind = btf_kind(btf__type_by_id(d->btf, c_id)); + /* + * Resolve FWD into STRUCT/UNION. + * It's ok to resolve FWD into STRUCT/UNION that's not yet + * mapped to canonical representative (as opposed to + * STRUCT/UNION <--> STRUCT/UNION mapping logic below), because + * eventually that struct is going to be mapped and all resolved + * FWDs will automatically resolve to correct canonical + * representative. This will happen before ref type deduping, + * which critically depends on stability of these mapping. This + * stability is not a requirement for STRUCT/UNION equivalence + * checks, though. + */ + + /* if it's the split BTF case, we still need to point base FWD + * to STRUCT/UNION in a split BTF, because FWDs from split BTF + * will be resolved against base FWD. If we don't point base + * canonical FWD to the resolved STRUCT/UNION, then all the + * FWDs in split BTF won't be correctly resolved to a proper + * STRUCT/UNION. + */ + if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) + d->map[c_id] = t_id; + + /* if graph equivalence determined that we'd need to adjust + * base canonical types, then we need to only point base FWDs + * to STRUCTs/UNIONs and do no more modifications. For all + * other purposes the type graphs were not equivalent. + */ + if (d->hypot_adjust_canon) + continue; + + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + d->map[t_id] = c_id; + + if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && + c_kind != BTF_KIND_FWD && + is_type_mapped(d, c_id) && + !is_type_mapped(d, t_id)) { + /* + * as a perf optimization, we can map struct/union + * that's part of type graph we just verified for + * equivalence. We can do that for struct/union that has + * canonical representative only, though. + */ + d->map[t_id] = c_id; + } + } +} + +/* + * Deduplicate struct/union types. + * + * For each struct/union type its type signature hash is calculated, taking + * into account type's name, size, number, order and names of fields, but + * ignoring type ID's referenced from fields, because they might not be deduped + * completely until after reference types deduplication phase. This type hash + * is used to iterate over all potential canonical types, sharing same hash. + * For each canonical candidate we check whether type graphs that they form + * (through referenced types in fields and so on) are equivalent using algorithm + * implemented in `btf_dedup_is_equiv`. If such equivalence is found and + * BTF_KIND_FWD resolution is allowed, then hypothetical mapping + * (btf_dedup->hypot_map) produced by aforementioned type graph equivalence + * algorithm is used to record FWD -> STRUCT/UNION mapping. It's also used to + * potentially map other structs/unions to their canonical representatives, + * if such relationship hasn't yet been established. This speeds up algorithm + * by eliminating some of the duplicate work. + * + * If no matching canonical representative was found, struct/union is marked + * as canonical for itself and is added into btf_dedup->dedup_table hash map + * for further look ups. + */ +static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *cand_type, *t; + struct hashmap_entry *hash_entry; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u16 kind; + long h; + + /* already deduped or is in process of deduping (loop detected) */ + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return 0; + + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + return 0; + + h = btf_hash_struct(t); + for_each_dedup_cand(d, hash_entry, h) { + __u32 cand_id = hash_entry->value; + int eq; + + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = btf_type_by_id(d->btf, cand_id); + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + + btf_dedup_clear_hypot_map(d); + eq = btf_dedup_is_equiv(d, type_id, cand_id); + if (eq < 0) + return eq; + if (!eq) + continue; + btf_dedup_merge_hypot_map(d); + if (d->hypot_adjust_canon) /* not really equivalent */ + continue; + new_id = cand_id; + break; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_struct_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Deduplicate reference type. + * + * Once all primitive and struct/union types got deduplicated, we can easily + * deduplicate all other (reference) BTF types. This is done in two steps: + * + * 1. Resolve all referenced type IDs into their canonical type IDs. This + * resolution can be done either immediately for primitive or struct/union types + * (because they were deduped in previous two phases) or recursively for + * reference types. Recursion will always terminate at either primitive or + * struct/union type, at which point we can "unwind" chain of reference types + * one by one. There is no danger of encountering cycles because in C type + * system the only way to form type cycle is through struct/union, so any chain + * of reference types, even those taking part in a type cycle, will inevitably + * reach struct/union at some point. + * + * 2. Once all referenced type IDs are resolved into canonical ones, BTF type + * becomes "stable", in the sense that no further deduplication will cause + * any changes to it. With that, it's now possible to calculate type's signature + * hash (this time taking into account referenced type IDs) and loop over all + * potential canonical representatives. If no match was found, current type + * will become canonical representative of itself and will be added into + * btf_dedup->dedup_table as another possible canonical representative. + */ +static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) +{ + struct hashmap_entry *hash_entry; + __u32 new_id = type_id, cand_id; + struct btf_type *t, *cand; + /* if we don't find equivalent type, then we are representative type */ + int ref_type_id; + long h; + + if (d->map[type_id] == BTF_IN_PROGRESS_ID) + return -ELOOP; + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return resolve_type_id(d, type_id); + + t = btf_type_by_id(d->btf, type_id); + d->map[type_id] = BTF_IN_PROGRESS_ID; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_DECL_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ARRAY: { + struct btf_array *info = btf_array(t); + + ref_type_id = btf_dedup_ref_type(d, info->type); + if (ref_type_id < 0) + return ref_type_id; + info->type = ref_type_id; + + ref_type_id = btf_dedup_ref_type(d, info->index_type); + if (ref_type_id < 0) + return ref_type_id; + info->index_type = ref_type_id; + + h = btf_hash_array(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_array(t, cand)) { + new_id = cand_id; + break; + } + } + break; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *param; + __u16 vlen; + int i; + + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + vlen = btf_vlen(t); + param = btf_params(t); + for (i = 0; i < vlen; i++) { + ref_type_id = btf_dedup_ref_type(d, param->type); + if (ref_type_id < 0) + return ref_type_id; + param->type = ref_type_id; + param++; + } + + h = btf_hash_fnproto(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_fnproto(t, cand)) { + new_id = cand_id; + break; + } + } + break; + } + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return new_id; +} + +static int btf_dedup_ref_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, d->btf->start_id + i); + if (err < 0) + return err; + } + /* we won't need d->dedup_table anymore */ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + return 0; +} + +/* + * Collect a map from type names to type ids for all canonical structs + * and unions. If the same name is shared by several canonical types + * use a special value 0 to indicate this fact. + */ +static int btf_dedup_fill_unique_names_map(struct btf_dedup *d, struct hashmap *names_map) +{ + __u32 nr_types = btf__type_cnt(d->btf); + struct btf_type *t; + __u32 type_id; + __u16 kind; + int err; + + /* + * Iterate over base and split module ids in order to get all + * available structs in the map. + */ + for (type_id = 1; type_id < nr_types; ++type_id) { + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + continue; + + /* Skip non-canonical types */ + if (type_id != d->map[type_id]) + continue; + + err = hashmap__add(names_map, t->name_off, type_id); + if (err == -EEXIST) + err = hashmap__set(names_map, t->name_off, 0, NULL, NULL); + + if (err) + return err; + } + + return 0; +} + +static int btf_dedup_resolve_fwd(struct btf_dedup *d, struct hashmap *names_map, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + enum btf_fwd_kind fwd_kind = btf_kflag(t); + __u16 cand_kind, kind = btf_kind(t); + struct btf_type *cand_t; + uintptr_t cand_id; + + if (kind != BTF_KIND_FWD) + return 0; + + /* Skip if this FWD already has a mapping */ + if (type_id != d->map[type_id]) + return 0; + + if (!hashmap__find(names_map, t->name_off, &cand_id)) + return 0; + + /* Zero is a special value indicating that name is not unique */ + if (!cand_id) + return 0; + + cand_t = btf_type_by_id(d->btf, cand_id); + cand_kind = btf_kind(cand_t); + if ((cand_kind == BTF_KIND_STRUCT && fwd_kind != BTF_FWD_STRUCT) || + (cand_kind == BTF_KIND_UNION && fwd_kind != BTF_FWD_UNION)) + return 0; + + d->map[type_id] = cand_id; + + return 0; +} + +/* + * Resolve unambiguous forward declarations. + * + * The lion's share of all FWD declarations is resolved during + * `btf_dedup_struct_types` phase when different type graphs are + * compared against each other. However, if in some compilation unit a + * FWD declaration is not a part of a type graph compared against + * another type graph that declaration's canonical type would not be + * changed. Example: + * + * CU #1: + * + * struct foo; + * struct foo *some_global; + * + * CU #2: + * + * struct foo { int u; }; + * struct foo *another_global; + * + * After `btf_dedup_struct_types` the BTF looks as follows: + * + * [1] STRUCT 'foo' size=4 vlen=1 ... + * [2] INT 'int' size=4 ... + * [3] PTR '(anon)' type_id=1 + * [4] FWD 'foo' fwd_kind=struct + * [5] PTR '(anon)' type_id=4 + * + * This pass assumes that such FWD declarations should be mapped to + * structs or unions with identical name in case if the name is not + * ambiguous. + */ +static int btf_dedup_resolve_fwds(struct btf_dedup *d) +{ + int i, err; + struct hashmap *names_map; + + names_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(names_map)) + return PTR_ERR(names_map); + + err = btf_dedup_fill_unique_names_map(d, names_map); + if (err < 0) + goto exit; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_resolve_fwd(d, names_map, d->btf->start_id + i); + if (err < 0) + break; + } + +exit: + hashmap__free(names_map); + return err; +} + +/* + * Compact types. + * + * After we established for each type its corresponding canonical representative + * type, we now can eliminate types that are not canonical and leave only + * canonical ones layed out sequentially in memory by copying them over + * duplicates. During compaction btf_dedup->hypot_map array is reused to store + * a map from original type ID to a new compacted type ID, which will be used + * during next phase to "fix up" type IDs, referenced from struct/union and + * reference types. + */ +static int btf_dedup_compact_types(struct btf_dedup *d) +{ + __u32 *new_offs; + __u32 next_type_id = d->btf->start_id; + const struct btf_type *t; + void *p; + int i, id, len; + + /* we are going to reuse hypot_map to store compaction remapping */ + d->hypot_map[0] = 0; + /* base BTF types are not renumbered */ + for (id = 1; id < d->btf->start_id; id++) + d->hypot_map[id] = id; + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) + d->hypot_map[id] = BTF_UNPROCESSED_ID; + + p = d->btf->types_data; + + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) { + if (d->map[id] != id) + continue; + + t = btf__type_by_id(d->btf, id); + len = btf_type_size(t); + if (len < 0) + return len; + + memmove(p, t, len); + d->hypot_map[id] = next_type_id; + d->btf->type_offs[next_type_id - d->btf->start_id] = p - d->btf->types_data; + p += len; + next_type_id++; + } + + /* shrink struct btf's internal types index and update btf_header */ + d->btf->nr_types = next_type_id - d->btf->start_id; + d->btf->type_offs_cap = d->btf->nr_types; + d->btf->hdr->type_len = p - d->btf->types_data; + new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, + sizeof(*new_offs)); + if (d->btf->type_offs_cap && !new_offs) + return -ENOMEM; + d->btf->type_offs = new_offs; + d->btf->hdr->str_off = d->btf->hdr->type_len; + d->btf->raw_size = d->btf->hdr->hdr_len + d->btf->hdr->type_len + d->btf->hdr->str_len; + return 0; +} + +/* + * Figure out final (deduplicated and compacted) type ID for provided original + * `type_id` by first resolving it into corresponding canonical type ID and + * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, + * which is populated during compaction phase. + */ +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 resolved_type_id, new_type_id; + + resolved_type_id = resolve_type_id(d, *type_id); + new_type_id = d->hypot_map[resolved_type_id]; + if (new_type_id > BTF_MAX_NR_TYPES) + return -EINVAL; + + *type_id = new_type_id; + return 0; +} + +/* + * Remap referenced type IDs into deduped type IDs. + * + * After BTF types are deduplicated and compacted, their final type IDs may + * differ from original ones. The map from original to a corresponding + * deduped type ID is stored in btf_dedup->hypot_map and is populated during + * compaction phase. During remapping phase we are rewriting all type IDs + * referenced from any BTF type (e.g., struct fields, func proto args, etc) to + * their final deduped type IDs. + */ +static int btf_dedup_remap_types(struct btf_dedup *d) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; + + return 0; +} + +/* + * Probe few well-known locations for vmlinux kernel image and try to load BTF + * data out of it to use for target BTF. + */ +struct btf *btf__load_vmlinux_btf(void) +{ + const char *locations[] = { + /* try canonical vmlinux BTF through sysfs first */ + "/sys/kernel/btf/vmlinux", + /* fall back to trying to find vmlinux on disk otherwise */ + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", + }; + char path[PATH_MAX + 1]; + struct utsname buf; + struct btf *btf; + int i, err; + + uname(&buf); + + for (i = 0; i < ARRAY_SIZE(locations); i++) { + snprintf(path, PATH_MAX, locations[i], buf.release); + + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) + continue; + + btf = btf__parse(path, NULL); + err = libbpf_get_error(btf); + pr_debug("loading kernel BTF '%s': %d\n", path, err); + if (err) + continue; + + return btf; + } + + pr_warn("failed to find valid kernel BTF\n"); + return libbpf_err_ptr(-ESRCH); +} + +struct btf *libbpf_find_kernel_btf(void) __attribute__((alias("btf__load_vmlinux_btf"))); + +struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf) +{ + char path[80]; + + snprintf(path, sizeof(path), "/sys/kernel/btf/%s", module_name); + return btf__parse_split(path, vmlinux_btf); +} + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM64: { + struct btf_enum64 *m = btf_enum64(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/src/btf.h b/src/btf.h new file mode 100644 index 0000000..8e6880d --- /dev/null +++ b/src/btf.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2018 Facebook */ +/*! \file */ + +#ifndef __LIBBPF_BTF_H +#define __LIBBPF_BTF_H + +#include <stdarg.h> +#include <stdbool.h> +#include <linux/btf.h> +#include <linux/types.h> + +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BTF_ELF_SEC ".BTF" +#define BTF_EXT_ELF_SEC ".BTF.ext" +#define MAPS_ELF_SEC ".maps" + +struct btf; +struct btf_ext; +struct btf_type; + +struct bpf_object; + +enum btf_endianness { + BTF_LITTLE_ENDIAN = 0, + BTF_BIG_ENDIAN = 1, +}; + +/** + * @brief **btf__free()** frees all data of a BTF object + * @param btf BTF object to free + */ +LIBBPF_API void btf__free(struct btf *btf); + +/** + * @brief **btf__new()** creates a new instance of a BTF object from the raw + * bytes of an ELF's BTF section + * @param data raw bytes + * @param size number of bytes passed in `data` + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new(const void *data, __u32 size); + +/** + * @brief **btf__new_split()** create a new instance of a BTF object from the + * provided raw data bytes. It takes another BTF instance, **base_btf**, which + * serves as a base BTF, which is extended by types in a newly created BTF + * instance + * @param data raw bytes + * @param size length of raw bytes + * @param base_btf the base BTF object + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_split()` is equivalent to `btf__new()` and + * creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf); + +/** + * @brief **btf__new_empty()** creates an empty BTF object. Use + * `btf__add_*()` to populate such BTF object. + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty(void); + +/** + * @brief **btf__new_empty_split()** creates an unpopulated BTF object from an + * ELF BTF section except with a base BTF on top of which split BTF should be + * based + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_empty_split()` is equivalent to + * `btf__new_empty()` and creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); + +LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_raw(const char *path); +LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); + +LIBBPF_API struct btf *btf__load_vmlinux_btf(void); +LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); + +LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); + +LIBBPF_API int btf__load_into_kernel(struct btf *btf); +LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, + const char *type_name); +LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, + const char *type_name, __u32 kind); +LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); +LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); +LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, + __u32 id); +LIBBPF_API size_t btf__pointer_size(const struct btf *btf); +LIBBPF_API int btf__set_pointer_size(struct btf *btf, size_t ptr_sz); +LIBBPF_API enum btf_endianness btf__endianness(const struct btf *btf); +LIBBPF_API int btf__set_endianness(struct btf *btf, enum btf_endianness endian); +LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); +LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API void btf__set_fd(struct btf *btf, int fd); +LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); +LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); + +LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); +LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); +LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); + +LIBBPF_API int btf__find_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); +/** + * @brief **btf__add_btf()** appends all the BTF types from *src_btf* into *btf* + * @param btf BTF object which all the BTF types and strings are added to + * @param src_btf BTF object which all BTF types and referenced strings are copied from + * @return BTF type ID of the first appended BTF type, or negative error code + * + * **btf__add_btf()** can be used to simply and efficiently append the entire + * contents of one BTF object to another one. All the BTF type data is copied + * over, all referenced type IDs are adjusted by adding a necessary ID offset. + * Only strings referenced from BTF types are copied over and deduplicated, so + * if there were some unused strings in *src_btf*, those won't be copied over, + * which is consistent with the general string deduplication semantics of BTF + * writing APIs. + * + * If any error is encountered during this process, the contents of *btf* is + * left intact, which means that **btf__add_btf()** follows the transactional + * semantics and the operation as a whole is all-or-nothing. + * + * *src_btf* has to be non-split BTF, as of now copying types from split BTF + * is not supported and will result in -ENOTSUP error code returned. + */ +LIBBPF_API int btf__add_btf(struct btf *btf, const struct btf *src_btf); + +LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); +LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_array(struct btf *btf, + int index_type_id, int elem_type_id, __u32 nr_elems); +/* struct/union construction APIs */ +LIBBPF_API int btf__add_struct(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_union(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_field(struct btf *btf, const char *name, int field_type_id, + __u32 bit_offset, __u32 bit_size); + +/* enum construction APIs */ +LIBBPF_API int btf__add_enum(struct btf *btf, const char *name, __u32 bytes_sz); +LIBBPF_API int btf__add_enum_value(struct btf *btf, const char *name, __s64 value); +LIBBPF_API int btf__add_enum64(struct btf *btf, const char *name, __u32 bytes_sz, bool is_signed); +LIBBPF_API int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value); + +enum btf_fwd_kind { + BTF_FWD_STRUCT = 0, + BTF_FWD_UNION = 1, + BTF_FWD_ENUM = 2, +}; + +LIBBPF_API int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind); +LIBBPF_API int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id); +LIBBPF_API int btf__add_volatile(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_const(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_restrict(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id); + +/* func and func_proto construction APIs */ +LIBBPF_API int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id); +LIBBPF_API int btf__add_func_proto(struct btf *btf, int ret_type_id); +LIBBPF_API int btf__add_func_param(struct btf *btf, const char *name, int type_id); + +/* var & datasec construction APIs */ +LIBBPF_API int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id); +LIBBPF_API int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz); +LIBBPF_API int btf__add_datasec_var_info(struct btf *btf, int var_type_id, + __u32 offset, __u32 byte_sz); + +/* tag construction API */ +LIBBPF_API int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx); + +struct btf_dedup_opts { + size_t sz; + /* optional .BTF.ext info to dedup along the main BTF info */ + struct btf_ext *btf_ext; + /* force hash collisions (used for testing) */ + bool force_collisions; + size_t :0; +}; +#define btf_dedup_opts__last_field force_collisions + +LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); + +struct btf_dump; + +struct btf_dump_opts { + size_t sz; +}; +#define btf_dump_opts__last_field sz + +typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); + +LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts); + +LIBBPF_API void btf_dump__free(struct btf_dump *d); + +LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); + +struct btf_dump_emit_type_decl_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* optional field name for type declaration, e.g.: + * - struct my_struct <FNAME> + * - void (*<FNAME>)(int) + * - char (*<FNAME>)[123] + */ + const char *field_name; + /* extra indentation level (in number of tabs) to emit for multi-line + * type declarations (e.g., anonymous struct); applies for lines + * starting from the second one (first line is assumed to have + * necessary indentation already + */ + int indent_level; + /* strip all the const/volatile/restrict mods */ + bool strip_mods; + size_t :0; +}; +#define btf_dump_emit_type_decl_opts__last_field strip_mods + +LIBBPF_API int +btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts); + + +struct btf_dump_type_data_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const char *indent_str; + int indent_level; + /* below match "show" flags for bpf_show_snprintf() */ + bool compact; /* no newlines/indentation */ + bool skip_names; /* skip member/type names */ + bool emit_zeroes; /* show 0-valued fields */ + size_t :0; +}; +#define btf_dump_type_data_opts__last_field emit_zeroes + +LIBBPF_API int +btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts); + +/* + * A set of helpers for easier BTF types handling. + * + * The inline functions below rely on constants from the kernel headers which + * may not be available for applications including this header file. To avoid + * compilation errors, we define all the constants here that were added after + * the initial introduction of the BTF_KIND* constants. + */ +#ifndef BTF_KIND_FUNC +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#endif +#ifndef BTF_KIND_VAR +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#endif +#ifndef BTF_KIND_FLOAT +#define BTF_KIND_FLOAT 16 /* Floating point */ +#endif +/* The kernel header switched to enums, so the following were never #defined */ +#define BTF_KIND_DECL_TAG 17 /* Decl Tag */ +#define BTF_KIND_TYPE_TAG 18 /* Type Tag */ +#define BTF_KIND_ENUM64 19 /* Enum for up-to 64bit values */ + +static inline __u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline __u16 btf_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline bool btf_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static inline bool btf_is_void(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNKN; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_struct(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_STRUCT; +} + +static inline bool btf_is_union(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNION; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + +static inline bool btf_is_fwd(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FWD; +} + +static inline bool btf_is_typedef(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_is_volatile(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VOLATILE; +} + +static inline bool btf_is_const(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_CONST; +} + +static inline bool btf_is_restrict(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_RESTRICT; +} + +static inline bool btf_is_mod(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_VOLATILE || + kind == BTF_KIND_CONST || + kind == BTF_KIND_RESTRICT || + kind == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_func(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC; +} + +static inline bool btf_is_func_proto(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC_PROTO; +} + +static inline bool btf_is_var(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VAR; +} + +static inline bool btf_is_datasec(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DATASEC; +} + +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + +static inline bool btf_is_decl_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DECL_TAG; +} + +static inline bool btf_is_type_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return btf_is_enum(t) || btf_is_enum64(t); +} + +static inline bool btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return btf_kind(t1) == btf_kind(t2) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + +static inline __u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_bits(const struct btf_type *t) +{ + return BTF_INT_BITS(*(__u32 *)(t + 1)); +} + +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + +struct btf_enum64; + +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + +static inline __u64 btf_enum64_value(const struct btf_enum64 *e) +{ + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct). So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; +} + +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +/* Get bit offset of a member with specified index. */ +static inline __u32 btf_member_bit_offset(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BIT_OFFSET(m->offset) : m->offset; +} +/* + * Get bitfield size of a member, assuming t is BTF_KIND_STRUCT or + * BTF_KIND_UNION. If member is not a bitfield, zero is returned. + */ +static inline __u32 btf_member_bitfield_size(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0; +} + +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + +static inline struct btf_var *btf_var(const struct btf_type *t) +{ + return (struct btf_var *)(t + 1); +} + +static inline struct btf_var_secinfo * +btf_var_secinfos(const struct btf_type *t) +{ + return (struct btf_var_secinfo *)(t + 1); +} + +struct btf_decl_tag; +static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) +{ + return (struct btf_decl_tag *)(t + 1); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BTF_H */ diff --git a/src/btf_dump.c b/src/btf_dump.c new file mode 100644 index 0000000..580985e --- /dev/null +++ b/src/btf_dump.c @@ -0,0 +1,2526 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * BTF-to-C type converter. + * + * Copyright (c) 2019 Facebook + */ + +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <endian.h> +#include <errno.h> +#include <limits.h> +#include <linux/err.h> +#include <linux/btf.h> +#include <linux/kernel.h> +#include "btf.h" +#include "hashmap.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; +static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; + +static const char *pfx(int lvl) +{ + return lvl >= PREFIX_CNT ? PREFIXES : &PREFIXES[PREFIX_CNT - lvl]; +} + +enum btf_dump_type_order_state { + NOT_ORDERED, + ORDERING, + ORDERED, +}; + +enum btf_dump_type_emit_state { + NOT_EMITTED, + EMITTING, + EMITTED, +}; + +/* per-type auxiliary state */ +struct btf_dump_type_aux_state { + /* topological sorting state */ + enum btf_dump_type_order_state order_state: 2; + /* emitting state used to determine the need for forward declaration */ + enum btf_dump_type_emit_state emit_state: 2; + /* whether forward declaration was already emitted */ + __u8 fwd_emitted: 1; + /* whether unique non-duplicate name was already assigned */ + __u8 name_resolved: 1; + /* whether type is referenced from any other type */ + __u8 referenced: 1; +}; + +/* indent string length; one indent string is added for each indent level */ +#define BTF_DATA_INDENT_STR_LEN 32 + +/* + * Common internal data for BTF type data dump operations. + */ +struct btf_dump_data { + const void *data_end; /* end of valid data to show */ + bool compact; + bool skip_names; + bool emit_zeroes; + __u8 indent_lvl; /* base indent level */ + char indent_str[BTF_DATA_INDENT_STR_LEN]; + /* below are used during iteration */ + int depth; + bool is_array_member; + bool is_array_terminated; + bool is_array_char; +}; + +struct btf_dump { + const struct btf *btf; + btf_dump_printf_fn_t printf_fn; + void *cb_ctx; + int ptr_sz; + bool strip_mods; + bool skip_anon_defs; + int last_id; + + /* per-type auxiliary state */ + struct btf_dump_type_aux_state *type_states; + size_t type_states_cap; + /* per-type optional cached unique name, must be freed, if present */ + const char **cached_names; + size_t cached_names_cap; + + /* topo-sorted list of dependent type definitions */ + __u32 *emit_queue; + int emit_queue_cap; + int emit_queue_cnt; + + /* + * stack of type declarations (e.g., chain of modifiers, arrays, + * funcs, etc) + */ + __u32 *decl_stack; + int decl_stack_cap; + int decl_stack_cnt; + + /* maps struct/union/enum name to a number of name occurrences */ + struct hashmap *type_names; + /* + * maps typedef identifiers and enum value names to a number of such + * name occurrences + */ + struct hashmap *ident_names; + /* + * data for typed display; allocated if needed. + */ + struct btf_dump_data *typed_dump; +}; + +static size_t str_hash_fn(long key, void *ctx) +{ + return str_hash((void *)key); +} + +static bool str_equal_fn(long a, long b, void *ctx) +{ + return strcmp((void *)a, (void *)b) == 0; +} + +static const char *btf_name_of(const struct btf_dump *d, __u32 name_off) +{ + return btf__name_by_offset(d->btf, name_off); +} + +static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + d->printf_fn(d->cb_ctx, fmt, args); + va_end(args); +} + +static int btf_dump_mark_referenced(struct btf_dump *d); +static int btf_dump_resize(struct btf_dump *d); + +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) +{ + struct btf_dump *d; + int err; + + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + + if (!printf_fn) + return libbpf_err_ptr(-EINVAL); + + d = calloc(1, sizeof(struct btf_dump)); + if (!d) + return libbpf_err_ptr(-ENOMEM); + + d->btf = btf; + d->printf_fn = printf_fn; + d->cb_ctx = ctx; + d->ptr_sz = btf__pointer_size(btf) ? : sizeof(void *); + + d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->type_names)) { + err = PTR_ERR(d->type_names); + d->type_names = NULL; + goto err; + } + d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->ident_names)) { + err = PTR_ERR(d->ident_names); + d->ident_names = NULL; + goto err; + } + + err = btf_dump_resize(d); + if (err) + goto err; + + return d; +err: + btf_dump__free(d); + return libbpf_err_ptr(err); +} + +static int btf_dump_resize(struct btf_dump *d) +{ + int err, last_id = btf__type_cnt(d->btf) - 1; + + if (last_id <= d->last_id) + return 0; + + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) + return -ENOMEM; + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) + return -ENOMEM; + + if (d->last_id == 0) { + /* VOID is special */ + d->type_states[0].order_state = ORDERED; + d->type_states[0].emit_state = EMITTED; + } + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + return err; + + d->last_id = last_id; + return 0; +} + +static void btf_dump_free_names(struct hashmap *map) +{ + size_t bkt; + struct hashmap_entry *cur; + + hashmap__for_each_entry(map, cur, bkt) + free((void *)cur->pkey); + + hashmap__free(map); +} + +void btf_dump__free(struct btf_dump *d) +{ + int i; + + if (IS_ERR_OR_NULL(d)) + return; + + free(d->type_states); + if (d->cached_names) { + /* any set cached name is owned by us and should be freed */ + for (i = 0; i <= d->last_id; i++) { + if (d->cached_names[i]) + free((void *)d->cached_names[i]); + } + } + free(d->cached_names); + free(d->emit_queue); + free(d->decl_stack); + btf_dump_free_names(d->type_names); + btf_dump_free_names(d->ident_names); + + free(d); +} + +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); + +/* + * Dump BTF type in a compilable C syntax, including all the necessary + * dependent types, necessary for compilation. If some of the dependent types + * were already emitted as part of previous btf_dump__dump_type() invocation + * for another type, they won't be emitted again. This API allows callers to + * filter out BTF types according to user-defined criterias and emitted only + * minimal subset of types, necessary to compile everything. Full struct/union + * definitions will still be emitted, even if the only usage is through + * pointer and could be satisfied with just a forward declaration. + * + * Dumping is done in two high-level passes: + * 1. Topologically sort type definitions to satisfy C rules of compilation. + * 2. Emit type definitions in C syntax. + * + * Returns 0 on success; <0, otherwise. + */ +int btf_dump__dump_type(struct btf_dump *d, __u32 id) +{ + int err, i; + + if (id >= btf__type_cnt(d->btf)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + d->emit_queue_cnt = 0; + err = btf_dump_order_type(d, id, false); + if (err < 0) + return libbpf_err(err); + + for (i = 0; i < d->emit_queue_cnt; i++) + btf_dump_emit_type(d, d->emit_queue[i], 0 /*top-level*/); + + return 0; +} + +/* + * Mark all types that are referenced from any other type. This is used to + * determine top-level anonymous enums that need to be emitted as an + * independent type declarations. + * Anonymous enums come in two flavors: either embedded in a struct's field + * definition, in which case they have to be declared inline as part of field + * type declaration; or as a top-level anonymous enum, typically used for + * declaring global constants. It's impossible to distinguish between two + * without knowning whether given enum type was referenced from other type: + * top-level anonymous enum won't be referenced by anything, while embedded + * one will. + */ +static int btf_dump_mark_referenced(struct btf_dump *d) +{ + int i, j, n = btf__type_cnt(d->btf); + const struct btf_type *t; + __u16 vlen; + + for (i = d->last_id + 1; i < n; i++) { + t = btf__type_by_id(d->btf, i); + vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + break; + + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + d->type_states[t->type].referenced = 1; + break; + + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + d->type_states[a->index_type].referenced = 1; + d->type_states[a->type].referenced = 1; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + for (j = 0; j < vlen; j++, m++) + d->type_states[m->type].referenced = 1; + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + for (j = 0; j < vlen; j++, p++) + d->type_states[p->type].referenced = 1; + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + for (j = 0; j < vlen; j++, v++) + d->type_states[v->type].referenced = 1; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + +static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_queue; + size_t new_cap; + + if (d->emit_queue_cnt >= d->emit_queue_cap) { + new_cap = max(16, d->emit_queue_cap * 3 / 2); + new_queue = libbpf_reallocarray(d->emit_queue, new_cap, sizeof(new_queue[0])); + if (!new_queue) + return -ENOMEM; + d->emit_queue = new_queue; + d->emit_queue_cap = new_cap; + } + + d->emit_queue[d->emit_queue_cnt++] = id; + return 0; +} + +/* + * Determine order of emitting dependent types and specified type to satisfy + * C compilation rules. This is done through topological sorting with an + * additional complication which comes from C rules. The main idea for C is + * that if some type is "embedded" into a struct/union, it's size needs to be + * known at the time of definition of containing type. E.g., for: + * + * struct A {}; + * struct B { struct A x; } + * + * struct A *HAS* to be defined before struct B, because it's "embedded", + * i.e., it is part of struct B layout. But in the following case: + * + * struct A; + * struct B { struct A *x; } + * struct A {}; + * + * it's enough to just have a forward declaration of struct A at the time of + * struct B definition, as struct B has a pointer to struct A, so the size of + * field x is known without knowing struct A size: it's sizeof(void *). + * + * Unfortunately, there are some trickier cases we need to handle, e.g.: + * + * struct A {}; // if this was forward-declaration: compilation error + * struct B { + * struct { // anonymous struct + * struct A y; + * } *x; + * }; + * + * In this case, struct B's field x is a pointer, so it's size is known + * regardless of the size of (anonymous) struct it points to. But because this + * struct is anonymous and thus defined inline inside struct B, *and* it + * embeds struct A, compiler requires full definition of struct A to be known + * before struct B can be defined. This creates a transitive dependency + * between struct A and struct B. If struct A was forward-declared before + * struct B definition and fully defined after struct B definition, that would + * trigger compilation error. + * + * All this means that while we are doing topological sorting on BTF type + * graph, we need to determine relationships between different types (graph + * nodes): + * - weak link (relationship) between X and Y, if Y *CAN* be + * forward-declared at the point of X definition; + * - strong link, if Y *HAS* to be fully-defined before X can be defined. + * + * The rule is as follows. Given a chain of BTF types from X to Y, if there is + * BTF_KIND_PTR type in the chain and at least one non-anonymous type + * Z (excluding X, including Y), then link is weak. Otherwise, it's strong. + * Weak/strong relationship is determined recursively during DFS traversal and + * is returned as a result from btf_dump_order_type(). + * + * btf_dump_order_type() is trying to avoid unnecessary forward declarations, + * but it is not guaranteeing that no extraneous forward declarations will be + * emitted. + * + * To avoid extra work, algorithm marks some of BTF types as ORDERED, when + * it's done with them, but not for all (e.g., VOLATILE, CONST, RESTRICT, + * ARRAY, FUNC_PROTO), as weak/strong semantics for those depends on the + * entire graph path, so depending where from one came to that BTF type, it + * might cause weak or strong ordering. For types like STRUCT/UNION/INT/ENUM, + * once they are processed, there is no need to do it again, so they are + * marked as ORDERED. We can mark PTR as ORDERED as well, as it semi-forces + * weak link, unless subsequent referenced STRUCT/UNION/ENUM is anonymous. But + * in any case, once those are processed, no need to do it again, as the + * result won't change. + * + * Returns: + * - 1, if type is part of strong link (so there is strong topological + * ordering requirements); + * - 0, if type is part of weak link (so can be satisfied through forward + * declaration); + * - <0, on error (e.g., unsatisfiable type loop detected). + */ +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) +{ + /* + * Order state is used to detect strong link cycles, but only for BTF + * kinds that are or could be an independent definition (i.e., + * stand-alone fwd decl, enum, typedef, struct, union). Ptrs, arrays, + * func_protos, modifiers are just means to get to these definitions. + * Int/void don't need definitions, they are assumed to be always + * properly defined. We also ignore datasec, var, and funcs for now. + * So for all non-defining kinds, we never even set ordering state, + * for defining kinds we set ORDERING and subsequently ORDERED if it + * forms a strong link. + */ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + const struct btf_type *t; + __u16 vlen; + int err, i; + + /* return true, letting typedefs know that it's ok to be emitted */ + if (tstate->order_state == ORDERED) + return 1; + + t = btf__type_by_id(d->btf, id); + + if (tstate->order_state == ORDERING) { + /* type loop, but resolvable through fwd declaration */ + if (btf_is_composite(t) && through_ptr && t->name_off != 0) + return 0; + pr_warn("unsatisfiable type cycle, id:[%u]\n", id); + return -ELOOP; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + tstate->order_state = ORDERED; + return 0; + + case BTF_KIND_PTR: + err = btf_dump_order_type(d, t->type, true); + tstate->order_state = ORDERED; + return err; + + case BTF_KIND_ARRAY: + return btf_dump_order_type(d, btf_array(t)->type, false); + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + /* + * struct/union is part of strong link, only if it's embedded + * (so no ptr in a path) or it's anonymous (so has to be + * defined inline, even if declared through ptr) + */ + if (through_ptr && t->name_off != 0) + return 0; + + tstate->order_state = ORDERING; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, m++) { + err = btf_dump_order_type(d, m->type, false); + if (err < 0) + return err; + } + + if (t->name_off != 0) { + err = btf_dump_add_emit_queue_id(d, id); + if (err < 0) + return err; + } + + tstate->order_state = ORDERED; + return 1; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + /* + * non-anonymous or non-referenced enums are top-level + * declarations and should be emitted. Same logic can be + * applied to FWDs, it won't hurt anyways. + */ + if (t->name_off != 0 || !tstate->referenced) { + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + } + tstate->order_state = ORDERED; + return 1; + + case BTF_KIND_TYPEDEF: { + int is_strong; + + is_strong = btf_dump_order_type(d, t->type, through_ptr); + if (is_strong < 0) + return is_strong; + + /* typedef is similar to struct/union w.r.t. fwd-decls */ + if (through_ptr && !is_strong) + return 0; + + /* typedef is always a named definition */ + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + + d->type_states[id].order_state = ORDERED; + return 1; + } + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf_dump_order_type(d, t->type, through_ptr); + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + bool is_strong; + + err = btf_dump_order_type(d, t->type, through_ptr); + if (err < 0) + return err; + is_strong = err > 0; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, p++) { + err = btf_dump_order_type(d, p->type, through_ptr); + if (err < 0) + return err; + if (err > 0) + is_strong = true; + } + return is_strong; + } + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + d->type_states[id].order_state = ORDERED; + return 0; + + default: + return -EINVAL; + } +} + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_struct_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +/* a local view into a shared stack */ +struct id_stack { + const __u32 *ids; + int cnt; +}; + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl); +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decl_stack, + const char *fname, int lvl); + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id); +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id); +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name); + +static bool btf_dump_is_blacklisted(struct btf_dump *d, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(d->btf, id); + + /* __builtin_va_list is a compiler built-in, which causes compilation + * errors, when compiling w/ different compiler, then used to compile + * original code (e.g., GCC to compile kernel, Clang to use generated + * C header from BTF). As it is built-in, it should be already defined + * properly internally in compiler. + */ + if (t->name_off == 0) + return false; + return strcmp(btf_name_of(d, t->name_off), "__builtin_va_list") == 0; +} + +/* + * Emit C-syntax definitions of types from chains of BTF types. + * + * High-level handling of determining necessary forward declarations are handled + * by btf_dump_emit_type() itself, but all nitty-gritty details of emitting type + * declarations/definitions in C syntax are handled by a combo of + * btf_dump_emit_type_decl()/btf_dump_emit_type_chain() w/ delegation to + * corresponding btf_dump_emit_*_{def,fwd}() functions. + * + * We also keep track of "containing struct/union type ID" to determine when + * we reference it from inside and thus can avoid emitting unnecessary forward + * declaration. + * + * This algorithm is designed in such a way, that even if some error occurs + * (either technical, e.g., out of memory, or logical, i.e., malformed BTF + * that doesn't comply to C rules completely), algorithm will try to proceed + * and produce as much meaningful output as possible. + */ +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) +{ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + bool top_level_def = cont_id == 0; + const struct btf_type *t; + __u16 kind; + + if (tstate->emit_state == EMITTED) + return; + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + if (tstate->emit_state == EMITTING) { + if (tstate->fwd_emitted) + return; + + switch (kind) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* + * if we are referencing a struct/union that we are + * part of - then no need for fwd declaration + */ + if (id == cont_id) + return; + if (t->name_off == 0) { + pr_warn("anonymous struct/union loop, id:[%u]\n", + id); + return; + } + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + break; + case BTF_KIND_TYPEDEF: + /* + * for typedef fwd_emitted means typedef definition + * was emitted, but it can be used only for "weak" + * references through pointer only, not for embedding + */ + if (!btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->fwd_emitted = 1; + break; + default: + break; + } + + return; + } + + switch (kind) { + case BTF_KIND_INT: + /* Emit type alias definitions if necessary */ + btf_dump_emit_missing_aliases(d, id, t); + + tstate->emit_state = EMITTED; + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (top_level_def) { + btf_dump_emit_enum_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + btf_dump_emit_type(d, t->type, cont_id); + break; + case BTF_KIND_ARRAY: + btf_dump_emit_type(d, btf_array(t)->type, cont_id); + break; + case BTF_KIND_FWD: + btf_dump_emit_fwd_def(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + break; + case BTF_KIND_TYPEDEF: + tstate->emit_state = EMITTING; + btf_dump_emit_type(d, t->type, id); + /* + * typedef can server as both definition and forward + * declaration; at this stage someone depends on + * typedef as a forward declaration (refers to it + * through pointer), so unless we already did it, + * emit typedef as a forward declaration + */ + if (!tstate->fwd_emitted && !btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + tstate->emit_state = EMITTING; + /* if it's a top-level struct/union definition or struct/union + * is anonymous, then in C we'll be emitting all fields and + * their types (as opposed to just `struct X`), so we need to + * make sure that all types, referenced from struct/union + * members have necessary forward-declarations, where + * applicable + */ + if (top_level_def || t->name_off == 0) { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, new_cont_id; + + new_cont_id = t->name_off == 0 ? cont_id : id; + for (i = 0; i < vlen; i++, m++) + btf_dump_emit_type(d, m->type, new_cont_id); + } else if (!tstate->fwd_emitted && id != cont_id) { + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + } + + if (top_level_def) { + btf_dump_emit_struct_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + } else { + tstate->emit_state = NOT_EMITTED; + } + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 n = btf_vlen(t); + int i; + + btf_dump_emit_type(d, t->type, cont_id); + for (i = 0; i < n; i++, p++) + btf_dump_emit_type(d, p->type, cont_id); + + break; + } + default: + break; + } +} + +static bool btf_is_struct_packed(const struct btf *btf, __u32 id, + const struct btf_type *t) +{ + const struct btf_member *m; + int max_align = 1, align, i, bit_sz; + __u16 vlen; + + m = btf_members(t); + vlen = btf_vlen(t); + /* all non-bitfield fields have to be naturally aligned */ + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + bit_sz = btf_member_bitfield_size(t, i); + if (align && bit_sz == 0 && m->offset % (8 * align) != 0) + return true; + max_align = max(align, max_align); + } + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; + /* + * if original struct was marked as packed, but its layout is + * naturally aligned, we'll detect that it's not packed + */ + return false; +} + +static void btf_dump_emit_bit_padding(const struct btf_dump *d, + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) +{ + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; + + if (cur_off >= next_off) + return; /* no gap */ + + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). Note + * that even if struct itself has, let's say 4-byte alignment + * (i.e., it only uses up to int-aligned types), using `long: + * X;` explicit padding doesn't actually change struct's + * overall alignment requirements, but compiler does take into + * account that type's (long, in this example) natural + * alignment requirements when adding implicit padding. We use + * this fact heavily and don't worry about ruining correct + * struct alignment requirement. + */ + for (i = 0; i < ARRAY_SIZE(pads); i++) { + pad_bits = pads[i].bits; + pad_type = pads[i].name; + + new_off = roundup(cur_off, pad_bits); + if (new_off <= next_off) + break; + } + + if (new_off > cur_off && new_off <= next_off) { + /* We need explicit `<type>: 0` aligning mark if next + * field is right on alignment offset and its + * alignment requirement is less strict than <type>'s + * alignment (so compiler won't naturally align to the + * offset we expect), or if subsequent `<type>: X`, + * will actually completely fit in the remaining hole, + * making compiler basically ignore `<type>: X` + * completely. + */ + if (in_bitfield || + (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) || + (new_off != next_off && next_off - new_off <= new_off - cur_off)) + /* but for bitfields we'll emit explicit bit count */ + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, + in_bitfield ? new_off - cur_off : 0); + cur_off = new_off; + } + + /* Now we know we start at naturally aligned offset for a chosen + * padding type (long, int, short, or char), and so the rest is just + * a straightforward filling of remaining padding gap with full + * `<type>: sizeof(<type>);` markers, except for the last one, which + * might need smaller than sizeof(<type>) padding. + */ + while (cur_off != next_off) { + bits = min(next_off - cur_off, pad_bits); + if (bits == pad_bits) { + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); + cur_off += bits; + continue; + } + /* For the remainder padding that doesn't cover entire + * pad_type bit length, we pick the smallest necessary type. + * This is pure aesthetics, we could have just used `long`, + * but having smallest necessary one communicates better the + * scale of the padding gap. + */ + for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) { + pad_type = pads[i].name; + pad_bits = pads[i].bits; + if (pad_bits < bits) + continue; + + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits); + cur_off += bits; + break; + } + } +} + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "%s%s%s", + btf_is_struct(t) ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_struct_def(struct btf_dump *d, + __u32 id, + const struct btf_type *t, + int lvl) +{ + const struct btf_member *m = btf_members(t); + bool is_struct = btf_is_struct(t); + bool packed, prev_bitfield = false; + int align, i, off = 0; + __u16 vlen = btf_vlen(t); + + align = btf__align_of(d->btf, id); + packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; + + btf_dump_printf(d, "%s%s%s {", + is_struct ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); + + for (i = 0; i < vlen; i++, m++) { + const char *fname; + int m_off, m_sz, m_align; + bool in_bitfield; + + fname = btf_name_of(d, m->name_off); + m_sz = btf_member_bitfield_size(t, i); + m_off = btf_member_bit_offset(t, i); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); + + in_bitfield = prev_bitfield && m_sz != 0; + + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); + btf_dump_printf(d, "\n%s", pfx(lvl + 1)); + btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); + + if (m_sz) { + btf_dump_printf(d, ": %d", m_sz); + off = m_off + m_sz; + prev_bitfield = true; + } else { + m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); + off = m_off + m_sz * 8; + prev_bitfield = false; + } + + btf_dump_printf(d, ";"); + } + + /* pad at the end, if necessary */ + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); + + /* + * Keep `struct empty {}` on a single line, + * only print newline when there are regular or padding fields. + */ + if (vlen || t->size) { + btf_dump_printf(d, "\n"); + btf_dump_printf(d, "%s}", pfx(lvl)); + } else { + btf_dump_printf(d, "}"); + } + if (packed) + btf_dump_printf(d, " __attribute__((packed))"); +} + +static const char *missing_base_types[][2] = { + /* + * GCC emits typedefs to its internal __PolyX_t types when compiling Arm + * SIMD intrinsics. Alias them to standard base types. + */ + { "__Poly8_t", "unsigned char" }, + { "__Poly16_t", "unsigned short" }, + { "__Poly64_t", "unsigned long long" }, + { "__Poly128_t", "unsigned __int128" }, +}; + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + int i; + + for (i = 0; i < ARRAY_SIZE(missing_base_types); i++) { + if (strcmp(name, missing_base_types[i][0]) == 0) { + btf_dump_printf(d, "typedef %s %s;\n\n", + missing_base_types[i][1], name); + break; + } + } +} + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_enum32_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum *v = btf_enum(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + /* enumerators share namespace with typedef idents */ + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %d," : "\n%s%s___%zd = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, dup_cnt, v->val); + } else { + fmt_str = is_signed ? "\n%s%s = %d," : "\n%s%s = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, v->val); + } + } +} + +static void btf_dump_emit_enum64_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum64 *v = btf_enum64(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + __u64 val; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + val = btf_enum64_value(v); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %lldLL," + : "\n%s%s___%zd = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, dup_cnt, + (unsigned long long)val); + } else { + fmt_str = is_signed ? "\n%s%s = %lldLL," + : "\n%s%s = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, + (unsigned long long)val); + } + } +} +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, + int lvl) +{ + __u16 vlen = btf_vlen(t); + + btf_dump_printf(d, "enum%s%s", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); + + if (!vlen) + return; + + btf_dump_printf(d, " {"); + if (btf_is_enum(t)) + btf_dump_emit_enum32_val(d, t, lvl, vlen); + else + btf_dump_emit_enum64_val(d, t, lvl, vlen); + btf_dump_printf(d, "\n%s}", pfx(lvl)); + + /* special case enums with special sizes */ + if (t->size == 1) { + /* one-byte enums can be forced with mode(byte) attribute */ + btf_dump_printf(d, " __attribute__((mode(byte)))"); + } else if (t->size == 8 && d->ptr_sz == 8) { + /* enum can be 8-byte sized if one of the enumerator values + * doesn't fit in 32-bit integer, or by adding mode(word) + * attribute (but probably only on 64-bit architectures); do + * our best here to try to satisfy the contract without adding + * unnecessary attributes + */ + bool needs_word_mode; + + if (btf_is_enum(t)) { + /* enum can't represent 64-bit values, so we need word mode */ + needs_word_mode = true; + } else { + /* enum64 needs mode(word) if none of its values has + * non-zero upper 32-bits (which means that all values + * fit in 32-bit integers and won't cause compiler to + * bump enum to be 64-bit naturally + */ + int i; + + needs_word_mode = true; + for (i = 0; i < vlen; i++) { + if (btf_enum64(t)[i].val_hi32 != 0) { + needs_word_mode = false; + break; + } + } + } + if (needs_word_mode) + btf_dump_printf(d, " __attribute__((mode(word)))"); + } + +} + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + + if (btf_kflag(t)) + btf_dump_printf(d, "union %s", name); + else + btf_dump_printf(d, "struct %s", name); +} + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl) +{ + const char *name = btf_dump_ident_name(d, id); + + /* + * Old GCC versions are emitting invalid typedef for __gnuc_va_list + * pointing to VOID. This generates warnings from btf_dump() and + * results in uncompilable header file, so we are fixing it up here + * with valid typedef into __builtin_va_list. + */ + if (t->type == 0 && strcmp(name, "__gnuc_va_list") == 0) { + btf_dump_printf(d, "typedef __builtin_va_list __gnuc_va_list"); + return; + } + + btf_dump_printf(d, "typedef "); + btf_dump_emit_type_decl(d, t->type, name, lvl); +} + +static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_stack; + size_t new_cap; + + if (d->decl_stack_cnt >= d->decl_stack_cap) { + new_cap = max(16, d->decl_stack_cap * 3 / 2); + new_stack = libbpf_reallocarray(d->decl_stack, new_cap, sizeof(new_stack[0])); + if (!new_stack) + return -ENOMEM; + d->decl_stack = new_stack; + d->decl_stack_cap = new_cap; + } + + d->decl_stack[d->decl_stack_cnt++] = id; + + return 0; +} + +/* + * Emit type declaration (e.g., field type declaration in a struct or argument + * declaration in function prototype) in correct C syntax. + * + * For most types it's trivial, but there are few quirky type declaration + * cases worth mentioning: + * - function prototypes (especially nesting of function prototypes); + * - arrays; + * - const/volatile/restrict for pointers vs other types. + * + * For a good discussion of *PARSING* C syntax (as a human), see + * Peter van der Linden's "Expert C Programming: Deep C Secrets", + * Ch.3 "Unscrambling Declarations in C". + * + * It won't help with BTF to C conversion much, though, as it's an opposite + * problem. So we came up with this algorithm in reverse to van der Linden's + * parsing algorithm. It goes from structured BTF representation of type + * declaration to a valid compilable C syntax. + * + * For instance, consider this C typedef: + * typedef const int * const * arr[10] arr_t; + * It will be represented in BTF with this chain of BTF types: + * [typedef] -> [array] -> [ptr] -> [const] -> [ptr] -> [const] -> [int] + * + * Notice how [const] modifier always goes before type it modifies in BTF type + * graph, but in C syntax, const/volatile/restrict modifiers are written to + * the right of pointers, but to the left of other types. There are also other + * quirks, like function pointers, arrays of them, functions returning other + * functions, etc. + * + * We handle that by pushing all the types to a stack, until we hit "terminal" + * type (int/enum/struct/union/fwd). Then depending on the kind of a type on + * top of a stack, modifiers are handled differently. Array/function pointers + * have also wildly different syntax and how nesting of them are done. See + * code for authoritative definition. + * + * To avoid allocating new stack for each independent chain of BTF types, we + * share one bigger stack, with each chain working only on its own local view + * of a stack frame. Some care is required to "pop" stack frames after + * processing type declaration chain. + */ +int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts) +{ + const char *fname; + int lvl, err; + + if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + fname = OPTS_GET(opts, field_name, ""); + lvl = OPTS_GET(opts, indent_level, 0); + d->strip_mods = OPTS_GET(opts, strip_mods, false); + btf_dump_emit_type_decl(d, id, fname, lvl); + d->strip_mods = false; + return 0; +} + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl) +{ + struct id_stack decl_stack; + const struct btf_type *t; + int err, stack_start; + + stack_start = d->decl_stack_cnt; + for (;;) { + t = btf__type_by_id(d->btf, id); + if (d->strip_mods && btf_is_mod(t)) + goto skip_mod; + + err = btf_dump_push_decl_stack_id(d, id); + if (err < 0) { + /* + * if we don't have enough memory for entire type decl + * chain, restore stack, emit warning, and try to + * proceed nevertheless + */ + pr_warn("not enough memory for decl stack:%d", err); + d->decl_stack_cnt = stack_start; + return; + } +skip_mod: + /* VOID */ + if (id == 0) + break; + + switch (btf_kind(t)) { + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_TYPE_TAG: + id = t->type; + break; + case BTF_KIND_ARRAY: + id = btf_array(t)->type; + break; + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: + goto done; + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + btf_kind(t), id); + goto done; + } + } +done: + /* + * We might be inside a chain of declarations (e.g., array of function + * pointers returning anonymous (so inlined) structs, having another + * array field). Each of those needs its own "stack frame" to handle + * emitting of declarations. Those stack frames are non-overlapping + * portions of shared btf_dump->decl_stack. To make it a bit nicer to + * handle this set of nested stacks, we create a view corresponding to + * our own "stack frame" and work with it as an independent stack. + * We'll need to clean up after emit_type_chain() returns, though. + */ + decl_stack.ids = d->decl_stack + stack_start; + decl_stack.cnt = d->decl_stack_cnt - stack_start; + btf_dump_emit_type_chain(d, &decl_stack, fname, lvl); + /* + * emit_type_chain() guarantees that it will pop its entire decl_stack + * frame before returning. But it works with a read-only view into + * decl_stack, so it doesn't actually pop anything from the + * perspective of shared btf_dump->decl_stack, per se. We need to + * reset decl_stack state to how it was before us to avoid it growing + * all the time. + */ + d->decl_stack_cnt = stack_start; +} + +static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + + switch (btf_kind(t)) { + case BTF_KIND_VOLATILE: + btf_dump_printf(d, "volatile "); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, "const "); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, "restrict "); + break; + default: + return; + } + decl_stack->cnt--; + } +} + +static void btf_dump_drop_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + if (!btf_is_mod(t)) + return; + decl_stack->cnt--; + } +} + +static void btf_dump_emit_name(const struct btf_dump *d, + const char *name, bool last_was_ptr) +{ + bool separate = name[0] && !last_was_ptr; + + btf_dump_printf(d, "%s%s", separate ? " " : "", name); +} + +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decls, + const char *fname, int lvl) +{ + /* + * last_was_ptr is used to determine if we need to separate pointer + * asterisk (*) from previous part of type signature with space, so + * that we get `int ***`, instead of `int * * *`. We default to true + * for cases where we have single pointer in a chain. E.g., in ptr -> + * func_proto case. func_proto will start a new emit_type_chain call + * with just ptr, which should be emitted as (*) or (*<fname>), so we + * don't want to prepend space for that last pointer. + */ + bool last_was_ptr = true; + const struct btf_type *t; + const char *name; + __u16 kind; + __u32 id; + + while (decls->cnt) { + id = decls->ids[--decls->cnt]; + if (id == 0) { + /* VOID is a special snowflake */ + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "void"); + last_was_ptr = false; + continue; + } + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, "%s", name); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + btf_dump_emit_mods(d, decls); + /* inline anonymous struct/union */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_struct_def(d, id, t, lvl); + else + btf_dump_emit_struct_fwd(d, id, t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + btf_dump_emit_mods(d, decls); + /* inline anonymous enum */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_enum_def(d, id, t, lvl); + else + btf_dump_emit_enum_fwd(d, id, t); + break; + case BTF_KIND_FWD: + btf_dump_emit_mods(d, decls); + btf_dump_emit_fwd_def(d, id, t); + break; + case BTF_KIND_TYPEDEF: + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "%s", btf_dump_ident_name(d, id)); + break; + case BTF_KIND_PTR: + btf_dump_printf(d, "%s", last_was_ptr ? "*" : " *"); + break; + case BTF_KIND_VOLATILE: + btf_dump_printf(d, " volatile"); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, " const"); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, " restrict"); + break; + case BTF_KIND_TYPE_TAG: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + const struct btf_type *next_t; + __u32 next_id; + bool multidim; + /* + * GCC has a bug + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=8354) + * which causes it to emit extra const/volatile + * modifiers for an array, if array's element type has + * const/volatile modifiers. Clang doesn't do that. + * In general, it doesn't seem very meaningful to have + * a const/volatile modifier for array, so we are + * going to silently skip them here. + */ + btf_dump_drop_mods(d, decls); + + if (decls->cnt == 0) { + btf_dump_emit_name(d, fname, last_was_ptr); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + + next_id = decls->ids[decls->cnt - 1]; + next_t = btf__type_by_id(d->btf, next_id); + multidim = btf_is_array(next_t); + /* we need space if we have named non-pointer */ + if (fname[0] && !last_was_ptr) + btf_dump_printf(d, " "); + /* no parentheses for multi-dimensional array */ + if (!multidim) + btf_dump_printf(d, "("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + if (!multidim) + btf_dump_printf(d, ")"); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 vlen = btf_vlen(t); + int i; + + /* + * GCC emits extra volatile qualifier for + * __attribute__((noreturn)) function pointers. Clang + * doesn't do it. It's a GCC quirk for backwards + * compatibility with code written for GCC <2.5. So, + * similarly to extra qualifiers for array, just drop + * them, instead of handling them. + */ + btf_dump_drop_mods(d, decls); + if (decls->cnt) { + btf_dump_printf(d, " ("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + btf_dump_printf(d, ")"); + } else { + btf_dump_emit_name(d, fname, last_was_ptr); + } + btf_dump_printf(d, "("); + /* + * Clang for BPF target generates func_proto with no + * args as a func_proto with a single void arg (e.g., + * `int (*f)(void)` vs just `int (*f)()`). We are + * going to pretend there are no args for such case. + */ + if (vlen == 1 && p->type == 0) { + btf_dump_printf(d, ")"); + return; + } + + for (i = 0; i < vlen; i++, p++) { + if (i > 0) + btf_dump_printf(d, ", "); + + /* last arg of type void is vararg */ + if (i == vlen - 1 && p->type == 0) { + btf_dump_printf(d, "..."); + break; + } + + name = btf_name_of(d, p->name_off); + btf_dump_emit_type_decl(d, p->type, name, lvl); + } + + btf_dump_printf(d, ")"); + return; + } + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + kind, id); + return; + } + + last_was_ptr = kind == BTF_KIND_PTR; + } + + btf_dump_emit_name(d, fname, last_was_ptr); +} + +/* show type name as (type_name) */ +static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, + bool top_level) +{ + const struct btf_type *t; + + /* for array members, we don't bother emitting type name for each + * member to avoid the redundancy of + * .name = (char[4])[(char)'f',(char)'o',(char)'o',] + */ + if (d->typed_dump->is_array_member) + return; + + /* avoid type name specification for variable/section; it will be done + * for the associated variable value(s). + */ + t = btf__type_by_id(d->btf, id); + if (btf_is_var(t) || btf_is_datasec(t)) + return; + + if (top_level) + btf_dump_printf(d, "("); + + d->skip_anon_defs = true; + d->strip_mods = true; + btf_dump_emit_type_decl(d, id, "", 0); + d->strip_mods = false; + d->skip_anon_defs = false; + + if (top_level) + btf_dump_printf(d, ")"); +} + +/* return number of duplicates (occurrences) of a given name */ +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name) +{ + char *old_name, *new_name; + size_t dup_cnt = 0; + int err; + + new_name = strdup(orig_name); + if (!new_name) + return 1; + + (void)hashmap__find(name_map, orig_name, &dup_cnt); + dup_cnt++; + + err = hashmap__set(name_map, new_name, dup_cnt, &old_name, NULL); + if (err) + free(new_name); + + free(old_name); + + return dup_cnt; +} + +static const char *btf_dump_resolve_name(struct btf_dump *d, __u32 id, + struct hashmap *name_map) +{ + struct btf_dump_type_aux_state *s = &d->type_states[id]; + const struct btf_type *t = btf__type_by_id(d->btf, id); + const char *orig_name = btf_name_of(d, t->name_off); + const char **cached_name = &d->cached_names[id]; + size_t dup_cnt; + + if (t->name_off == 0) + return ""; + + if (s->name_resolved) + return *cached_name ? *cached_name : orig_name; + + if (btf_is_fwd(t) || (btf_is_enum(t) && btf_vlen(t) == 0)) { + s->name_resolved = 1; + return orig_name; + } + + dup_cnt = btf_dump_name_dups(d, name_map, orig_name); + if (dup_cnt > 1) { + const size_t max_len = 256; + char new_name[max_len]; + + snprintf(new_name, max_len, "%s___%zu", orig_name, dup_cnt); + *cached_name = strdup(new_name); + } + + s->name_resolved = 1; + return *cached_name ? *cached_name : orig_name; +} + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->type_names); +} + +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->ident_names); +} + +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz); + +static const char *btf_dump_data_newline(struct btf_dump *d) +{ + return d->typed_dump->compact || d->typed_dump->depth == 0 ? "" : "\n"; +} + +static const char *btf_dump_data_delim(struct btf_dump *d) +{ + return d->typed_dump->depth == 0 ? "" : ","; +} + +static void btf_dump_data_pfx(struct btf_dump *d) +{ + int i, lvl = d->typed_dump->indent_lvl + d->typed_dump->depth; + + if (d->typed_dump->compact) + return; + + for (i = 0; i < lvl; i++) + btf_dump_printf(d, "%s", d->typed_dump->indent_str); +} + +/* A macro is used here as btf_type_value[s]() appends format specifiers + * to the format specifier passed in; these do the work of appending + * delimiters etc while the caller simply has to specify the type values + * in the format specifier + value(s). + */ +#define btf_dump_type_values(d, fmt, ...) \ + btf_dump_printf(d, fmt "%s%s", \ + ##__VA_ARGS__, \ + btf_dump_data_delim(d), \ + btf_dump_data_newline(d)) + +static int btf_dump_unsupported_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id) +{ + btf_dump_printf(d, "<unsupported kind:%u>", btf_kind(t)); + return -ENOTSUP; +} + +static int btf_dump_get_bitfield_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz, + __u64 *value) +{ + __u16 left_shift_bits, right_shift_bits; + const __u8 *bytes = data; + __u8 nr_copy_bits; + __u64 num = 0; + int i; + + /* Maximum supported bitfield size is 64 bits */ + if (t->size > 8) { + pr_warn("unexpected bitfield size %d\n", t->size); + return -EINVAL; + } + + /* Bitfield value retrieval is done in two steps; first relevant bytes are + * stored in num, then we left/right shift num to eliminate irrelevant bits. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + for (i = t->size - 1; i >= 0; i--) + num = num * 256 + bytes[i]; + nr_copy_bits = bit_sz + bits_offset; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (i = 0; i < t->size; i++) + num = num * 256 + bytes[i]; + nr_copy_bits = t->size * 8 - bits_offset; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + left_shift_bits = 64 - nr_copy_bits; + right_shift_bits = 64 - bit_sz; + + *value = (num << left_shift_bits) >> right_shift_bits; + + return 0; +} + +static int btf_dump_bitfield_check_zero(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 check_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &check_num); + if (err) + return err; + if (check_num == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_bitfield_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 print_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &print_num); + if (err) + return err; + + btf_dump_type_values(d, "0x%llx", (unsigned long long)print_num); + + return 0; +} + +/* ints, floats and ptrs */ +static int btf_dump_base_type_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + static __u8 bytecmp[16] = {}; + int nr_bytes; + + /* For pointer types, pointer size is not defined on a per-type basis. + * On dump creation however, we store the pointer size. + */ + if (btf_kind(t) == BTF_KIND_PTR) + nr_bytes = d->ptr_sz; + else + nr_bytes = t->size; + + if (nr_bytes < 1 || nr_bytes > 16) { + pr_warn("unexpected size %d for id [%u]\n", nr_bytes, id); + return -EINVAL; + } + + if (memcmp(data, bytecmp, nr_bytes) == 0) + return -ENODATA; + return 0; +} + +static bool ptr_is_aligned(const struct btf *btf, __u32 type_id, + const void *data) +{ + int alignment = btf__align_of(btf, type_id); + + if (alignment == 0) + return false; + + return ((uintptr_t)data) % alignment == 0; +} + +static int btf_dump_int_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data, + __u8 bits_offset) +{ + __u8 encoding = btf_int_encoding(t); + bool sign = encoding & BTF_INT_SIGNED; + char buf[16] __attribute__((aligned(16))); + int sz = t->size; + + if (sz == 0 || sz > sizeof(buf)) { + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + + /* handle packed int data - accesses of integers not aligned on + * int boundaries can cause problems on some platforms. + */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(buf, data, sz); + data = buf; + } + + switch (sz) { + case 16: { + const __u64 *ints = data; + __u64 lsi, msi; + + /* avoid use of __int128 as some 32-bit platforms do not + * support it. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + lsi = ints[0]; + msi = ints[1]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + lsi = ints[1]; + msi = ints[0]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (msi == 0) + btf_dump_type_values(d, "0x%llx", (unsigned long long)lsi); + else + btf_dump_type_values(d, "0x%llx%016llx", (unsigned long long)msi, + (unsigned long long)lsi); + break; + } + case 8: + if (sign) + btf_dump_type_values(d, "%lld", *(long long *)data); + else + btf_dump_type_values(d, "%llu", *(unsigned long long *)data); + break; + case 4: + if (sign) + btf_dump_type_values(d, "%d", *(__s32 *)data); + else + btf_dump_type_values(d, "%u", *(__u32 *)data); + break; + case 2: + if (sign) + btf_dump_type_values(d, "%d", *(__s16 *)data); + else + btf_dump_type_values(d, "%u", *(__u16 *)data); + break; + case 1: + if (d->typed_dump->is_array_char) { + /* check for null terminator */ + if (d->typed_dump->is_array_terminated) + break; + if (*(char *)data == '\0') { + d->typed_dump->is_array_terminated = true; + break; + } + if (isprint(*(char *)data)) { + btf_dump_type_values(d, "'%c'", *(char *)data); + break; + } + } + if (sign) + btf_dump_type_values(d, "%d", *(__s8 *)data); + else + btf_dump_type_values(d, "%u", *(__u8 *)data); + break; + default: + pr_warn("unexpected sz %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +union float_data { + long double ld; + double d; + float f; +}; + +static int btf_dump_float_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data) +{ + const union float_data *flp = data; + union float_data fl; + int sz = t->size; + + /* handle unaligned data; copy to local union */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(&fl, data, sz); + flp = &fl; + } + + switch (sz) { + case 16: + btf_dump_type_values(d, "%Lf", flp->ld); + break; + case 8: + btf_dump_type_values(d, "%lf", flp->d); + break; + case 4: + btf_dump_type_values(d, "%f", flp->f); + break; + default: + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +static int btf_dump_var_data(struct btf_dump *d, + const struct btf_type *v, + __u32 id, + const void *data) +{ + enum btf_func_linkage linkage = btf_var(v)->linkage; + const struct btf_type *t; + const char *l; + __u32 type_id; + + switch (linkage) { + case BTF_FUNC_STATIC: + l = "static "; + break; + case BTF_FUNC_EXTERN: + l = "extern "; + break; + case BTF_FUNC_GLOBAL: + default: + l = ""; + break; + } + + /* format of output here is [linkage] [type] [varname] = (type)value, + * for example "static int cpu_profile_flip = (int)1" + */ + btf_dump_printf(d, "%s", l); + type_id = v->type; + t = btf__type_by_id(d->btf, type_id); + btf_dump_emit_type_cast(d, type_id, false); + btf_dump_printf(d, " %s = ", btf_name_of(d, v->name_off)); + return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); +} + +static int btf_dump_array_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 i, elem_type_id; + __s64 elem_size; + bool is_array_member; + + elem_type_id = array->type; + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + elem_size = btf__resolve_size(d->btf, elem_type_id); + if (elem_size <= 0) { + pr_warn("unexpected elem size %zd for array type [%u]\n", + (ssize_t)elem_size, id); + return -EINVAL; + } + + if (btf_is_int(elem_type)) { + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + d->typed_dump->is_array_char = true; + } + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "[%s", btf_dump_data_newline(d)); + + /* may be a multidimensional array, so store current "is array member" + * status so we can restore it correctly later. + */ + is_array_member = d->typed_dump->is_array_member; + d->typed_dump->is_array_member = true; + for (i = 0; i < array->nelems; i++, data += elem_size) { + if (d->typed_dump->is_array_terminated) + break; + btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); + } + d->typed_dump->is_array_member = is_array_member; + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "]"); + + return 0; +} + +static int btf_dump_struct_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + int i, err = 0; + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "{%s", btf_dump_data_newline(d)); + + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + const char *mname; + __u32 moffset; + __u8 bit_sz; + + mtype = btf__type_by_id(d->btf, m->type); + mname = btf_name_of(d, m->name_off); + moffset = btf_member_bit_offset(t, i); + + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_dump_type_data(d, mname, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err < 0) + return err; + } + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "}"); + return err; +} + +union ptr_data { + unsigned int p; + unsigned long long lp; +}; + +static int btf_dump_ptr_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + if (ptr_is_aligned(d->btf, id, data) && d->ptr_sz == sizeof(void *)) { + btf_dump_type_values(d, "%p", *(void **)data); + } else { + union ptr_data pt; + + memcpy(&pt, data, d->ptr_sz); + if (d->ptr_sz == 4) + btf_dump_type_values(d, "0x%x", pt.p); + else + btf_dump_type_values(d, "0x%llx", pt.lp); + } + return 0; +} + +static int btf_dump_get_enum_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u32 id, + __s64 *value) +{ + bool is_signed = btf_kflag(t); + + if (!ptr_is_aligned(d->btf, id, data)) { + __u64 val; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, 0, 0, &val); + if (err) + return err; + *value = (__s64)val; + return 0; + } + + switch (t->size) { + case 8: + *value = *(__s64 *)data; + return 0; + case 4: + *value = is_signed ? (__s64)*(__s32 *)data : *(__u32 *)data; + return 0; + case 2: + *value = is_signed ? *(__s16 *)data : *(__u16 *)data; + return 0; + case 1: + *value = is_signed ? *(__s8 *)data : *(__u8 *)data; + return 0; + default: + pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); + return -EINVAL; + } +} + +static int btf_dump_enum_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + bool is_signed; + __s64 value; + int i, err; + + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + + is_signed = btf_kflag(t); + if (btf_is_enum(t)) { + const struct btf_enum *e; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%d" : "%u", value); + } else { + const struct btf_enum64 *e; + + for (i = 0, e = btf_enum64(t); i < btf_vlen(t); i++, e++) { + if (value != btf_enum64_value(e)) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%lldLL" : "%lluULL", + (unsigned long long)value); + } + return 0; +} + +static int btf_dump_datasec_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + __u32 i; + int err; + + btf_dump_type_values(d, "SEC(\"%s\") ", btf_name_of(d, t->name_off)); + + for (i = 0, vsi = btf_var_secinfos(t); i < btf_vlen(t); i++, vsi++) { + var = btf__type_by_id(d->btf, vsi->type); + err = btf_dump_dump_type_data(d, NULL, var, vsi->type, data + vsi->offset, 0, 0); + if (err < 0) + return err; + btf_dump_printf(d, ";"); + } + return 0; +} + +/* return size of type, or if base type overflows, return -E2BIG. */ +static int btf_dump_type_data_check_overflow(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset) +{ + __s64 size = btf__resolve_size(d->btf, id); + + if (size < 0 || size >= INT_MAX) { + pr_warn("unexpected size [%zu] for id [%u]\n", + (size_t)size, id); + return -EINVAL; + } + + /* Only do overflow checking for base types; we do not want to + * avoid showing part of a struct, union or array, even if we + * do not have enough data to show the full object. By + * restricting overflow checking to base types we can ensure + * that partial display succeeds, while avoiding overflowing + * and using bogus data for display. + */ + t = skip_mods_and_typedefs(d->btf, id, NULL); + if (!t) { + pr_warn("unexpected error skipping mods/typedefs for id [%u]\n", + id); + return -EINVAL; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (data + bits_offset / 8 + size > d->typed_dump->data_end) + return -E2BIG; + break; + default: + break; + } + return (int)size; +} + +static int btf_dump_type_data_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 value; + int i, err; + + /* toplevel exceptions; we show zero values if + * - we ask for them (emit_zeros) + * - if we are at top-level so we see "struct empty { }" + * - or if we are an array member and the array is non-empty and + * not a char array; we don't want to be in a situation where we + * have an integer array 0, 1, 0, 1 and only show non-zero values. + * If the array contains zeroes only, or is a char array starting + * with a '\0', the array-level check_zero() will prevent showing it; + * we are concerned with determining zero value at the array member + * level here. + */ + if (d->typed_dump->emit_zeroes || d->typed_dump->depth == 0 || + (d->typed_dump->is_array_member && + !d->typed_dump->is_array_char)) + return 0; + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + if (bit_sz) + return btf_dump_bitfield_check_zero(d, t, data, bits_offset, bit_sz); + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_ARRAY: { + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 elem_type_id, elem_size; + bool ischar; + + elem_type_id = array->type; + elem_size = btf__resolve_size(d->btf, elem_type_id); + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + + ischar = btf_is_int(elem_type) && elem_size == 1; + + /* check all elements; if _any_ element is nonzero, all + * of array is displayed. We make an exception however + * for char arrays where the first element is 0; these + * are considered zeroed also, even if later elements are + * non-zero because the string is terminated. + */ + for (i = 0; i < array->nelems; i++) { + if (i == 0 && ischar && *(char *)data == 0) + return -ENODATA; + err = btf_dump_type_data_check_zero(d, elem_type, + elem_type_id, + data + + (i * elem_size), + bits_offset, 0); + if (err != -ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + + /* if any struct/union member is non-zero, the struct/union + * is considered non-zero and dumped. + */ + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + __u32 moffset; + + mtype = btf__type_by_id(d->btf, m->type); + moffset = btf_member_bit_offset(t, i); + + /* btf_int_bits() does not store member bitfield size; + * bitfield size needs to be stored here so int display + * of member can retrieve it. + */ + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_type_data_check_zero(d, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err != ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + if (value == 0) + return -ENODATA; + return 0; + default: + return 0; + } +} + +/* returns size of data dumped, or error. */ +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + int size, err = 0; + + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset); + if (size < 0) + return size; + err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); + if (err) { + /* zeroed data is expected and not an error, so simply skip + * dumping such data. Record other errors however. + */ + if (err == -ENODATA) + return size; + return err; + } + btf_dump_data_pfx(d); + + if (!d->typed_dump->skip_names) { + if (fname && strlen(fname) > 0) + btf_dump_printf(d, ".%s = ", fname); + btf_dump_emit_type_cast(d, id, true); + } + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_DECL_TAG: + err = btf_dump_unsupported_data(d, t, id); + break; + case BTF_KIND_INT: + if (bit_sz) + err = btf_dump_bitfield_data(d, t, data, bits_offset, bit_sz); + else + err = btf_dump_int_data(d, t, id, data, bits_offset); + break; + case BTF_KIND_FLOAT: + err = btf_dump_float_data(d, t, id, data); + break; + case BTF_KIND_PTR: + err = btf_dump_ptr_data(d, t, id, data); + break; + case BTF_KIND_ARRAY: + err = btf_dump_array_data(d, t, id, data); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + err = btf_dump_struct_data(d, t, id, data); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* handle bitfield and int enum values */ + if (bit_sz) { + __u64 print_num; + __s64 enum_val; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, + &print_num); + if (err) + break; + enum_val = (__s64)print_num; + err = btf_dump_enum_data(d, t, id, &enum_val); + } else + err = btf_dump_enum_data(d, t, id, data); + break; + case BTF_KIND_VAR: + err = btf_dump_var_data(d, t, id, data); + break; + case BTF_KIND_DATASEC: + err = btf_dump_datasec_data(d, t, id, data); + break; + default: + pr_warn("unexpected kind [%u] for id [%u]\n", + BTF_INFO_KIND(t->info), id); + return -EINVAL; + } + if (err < 0) + return err; + return size; +} + +int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts) +{ + struct btf_dump_data typed_dump = {}; + const struct btf_type *t; + int ret; + + if (!OPTS_VALID(opts, btf_dump_type_data_opts)) + return libbpf_err(-EINVAL); + + t = btf__type_by_id(d->btf, id); + if (!t) + return libbpf_err(-ENOENT); + + d->typed_dump = &typed_dump; + d->typed_dump->data_end = data + data_sz; + d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); + + /* default indent string is a tab */ + if (!OPTS_GET(opts, indent_str, NULL)) + d->typed_dump->indent_str[0] = '\t'; + else + libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str, + sizeof(d->typed_dump->indent_str)); + + d->typed_dump->compact = OPTS_GET(opts, compact, false); + d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false); + d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false); + + ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); + + d->typed_dump = NULL; + + return libbpf_err(ret); +} diff --git a/src/gen_loader.c b/src/gen_loader.c new file mode 100644 index 0000000..23f5c46 --- /dev/null +++ b/src/gen_loader.c @@ -0,0 +1,1121 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <linux/filter.h> +#include <sys/param.h> +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" +#include "skel_internal.h" +#include <asm/byteorder.h> + +#define MAX_USED_MAPS 64 +#define MAX_USED_PROGS 32 +#define MAX_KFUNC_DESCS 256 +#define MAX_FD_ARRAY_SZ (MAX_USED_MAPS + MAX_KFUNC_DESCS) + +/* The following structure describes the stack layout of the loader program. + * In addition R6 contains the pointer to context. + * R7 contains the result of the last sys_bpf command (typically error or FD). + * R9 contains the result of the last sys_close command. + * + * Naming convention: + * ctx - bpf program context + * stack - bpf program stack + * blob - bpf_attr-s, strings, insns, map data. + * All the bytes that loader prog will use for read/write. + */ +struct loader_stack { + __u32 btf_fd; + __u32 inner_map_fd; + __u32 prog_fd[MAX_USED_PROGS]; +}; + +#define stack_off(field) \ + (__s16)(-sizeof(struct loader_stack) + offsetof(struct loader_stack, field)) + +#define attr_field(attr, field) (attr + offsetof(union bpf_attr, field)) + +static int blob_fd_array_off(struct bpf_gen *gen, int index) +{ + return gen->fd_array + index * sizeof(int); +} + +static int realloc_insn_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->insn_cur - gen->insn_start; + void *insn_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + insn_start = realloc(gen->insn_start, off + size); + if (!insn_start) { + gen->error = -ENOMEM; + free(gen->insn_start); + gen->insn_start = NULL; + return -ENOMEM; + } + gen->insn_start = insn_start; + gen->insn_cur = insn_start + off; + return 0; +} + +static int realloc_data_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->data_cur - gen->data_start; + void *data_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + data_start = realloc(gen->data_start, off + size); + if (!data_start) { + gen->error = -ENOMEM; + free(gen->data_start); + gen->data_start = NULL; + return -ENOMEM; + } + gen->data_start = data_start; + gen->data_cur = data_start + off; + return 0; +} + +static void emit(struct bpf_gen *gen, struct bpf_insn insn) +{ + if (realloc_insn_buf(gen, sizeof(insn))) + return; + memcpy(gen->insn_cur, &insn, sizeof(insn)); + gen->insn_cur += sizeof(insn); +} + +static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn insn2) +{ + emit(gen, insn1); + emit(gen, insn2); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size); +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) +{ + size_t stack_sz = sizeof(struct loader_stack), nr_progs_sz; + int i; + + gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); + gen->log_level = log_level; + /* save ctx pointer into R6 */ + emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + + /* bzero stack */ + emit(gen, BPF_MOV64_REG(BPF_REG_1, BPF_REG_10)); + emit(gen, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + /* amount of stack actually used, only used to calculate iterations, not stack offset */ + nr_progs_sz = offsetof(struct loader_stack, prog_fd[nr_progs]); + /* jump over cleanup code */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, + /* size of cleanup code below (including map fd cleanup) */ + (nr_progs_sz / 4) * 3 + 2 + + /* 6 insns for emit_sys_close_blob, + * 6 insns for debug_regs in emit_sys_close_blob + */ + nr_maps * (6 + (gen->log_level ? 6 : 0)))); + + /* remember the label where all error branches will jump to */ + gen->cleanup_label = gen->insn_cur - gen->insn_start; + /* emit cleanup code: close all temp FDs */ + for (i = 0; i < nr_progs_sz; i += 4) { + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -stack_sz + i)); + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + } + for (i = 0; i < nr_maps; i++) + emit_sys_close_blob(gen, blob_fd_array_off(gen, i)); + /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */ + emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); + emit(gen, BPF_EXIT_INSN()); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size) +{ + __u32 size8 = roundup(size, 8); + __u64 zero = 0; + void *prev; + + if (realloc_data_buf(gen, size8)) + return 0; + prev = gen->data_cur; + if (data) { + memcpy(gen->data_cur, data, size); + memcpy(gen->data_cur + size, &zero, size8 - size); + } else { + memset(gen->data_cur, 0, size8); + } + gen->data_cur += size8; + return prev - gen->data_start; +} + +/* Get index for map_fd/btf_fd slot in reserved fd_array, or in data relative + * to start of fd_array. Caller can decide if it is usable or not. + */ +static int add_map_fd(struct bpf_gen *gen) +{ + if (gen->nr_maps == MAX_USED_MAPS) { + pr_warn("Total maps exceeds %d\n", MAX_USED_MAPS); + gen->error = -E2BIG; + return 0; + } + return gen->nr_maps++; +} + +static int add_kfunc_btf_fd(struct bpf_gen *gen) +{ + int cur; + + if (gen->nr_fd_array == MAX_KFUNC_DESCS) { + cur = add_data(gen, NULL, sizeof(int)); + return (cur - gen->fd_array) / sizeof(int); + } + return MAX_USED_MAPS + gen->nr_fd_array++; +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* *(u64 *)(blob + off) = (u64)(void *)(blob + data) */ +static void emit_rel_store(struct bpf_gen *gen, int off, int data) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, data)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2blob(struct bpf_gen *gen, int off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_2, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2ctx(struct bpf_gen *gen, int ctx_off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_1, 0)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void move_ctx2blob(struct bpf_gen *gen, int off, int size, int ctx_off, + bool check_non_zero) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_6, ctx_off)); + if (check_non_zero) + /* If value in ctx is zero don't update the blob. + * For example: when ctx->map.max_entries == 0, keep default max_entries from bpf.c + */ + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2blob(struct bpf_gen *gen, int off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2ctx(struct bpf_gen *gen, int ctx_off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void emit_sys_bpf(struct bpf_gen *gen, int cmd, int attr, int attr_size) +{ + emit(gen, BPF_MOV64_IMM(BPF_REG_1, cmd)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, attr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, attr_size)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_bpf)); + /* remember the result in R7 */ + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); +} + +static bool is_simm16(__s64 value) +{ + return value == (__s64)(__s16)value; +} + +static void emit_check_err(struct bpf_gen *gen) +{ + __s64 off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + + /* R7 contains result of last sys_bpf command. + * if (R7 < 0) goto cleanup; + */ + if (is_simm16(off)) { + emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } +} + +/* reg1 and reg2 should not be R1 - R5. They can be R0, R6 - R10 */ +static void emit_debug(struct bpf_gen *gen, int reg1, int reg2, + const char *fmt, va_list args) +{ + char buf[1024]; + int addr, len, ret; + + if (!gen->log_level) + return; + ret = vsnprintf(buf, sizeof(buf), fmt, args); + if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0) + /* The special case to accommodate common debug_ret(): + * to avoid specifying BPF_REG_7 and adding " r=%%d" to + * prints explicitly. + */ + strcat(buf, " r=%d"); + len = strlen(buf) + 1; + addr = add_data(gen, buf, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, addr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + if (reg1 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1)); + if (reg2 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk)); +} + +static void debug_regs(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, reg1, reg2, fmt, args); + va_end(args); +} + +static void debug_ret(struct bpf_gen *gen, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, BPF_REG_7, -1, fmt, args); + va_end(args); +} + +static void __emit_sys_close(struct bpf_gen *gen) +{ + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, + /* 2 is the number of the following insns + * * 6 is additional insns in debug_regs + */ + 2 + (gen->log_level ? 6 : 0))); + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d"); +} + +static void emit_sys_close_stack(struct bpf_gen *gen, int stack_off) +{ + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off)); + __emit_sys_close(gen); +} + +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0)); + __emit_sys_close(gen); +} + +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) +{ + int i; + + if (nr_progs < gen->nr_progs || nr_maps != gen->nr_maps) { + pr_warn("nr_progs %d/%d nr_maps %d/%d mismatch\n", + nr_progs, gen->nr_progs, nr_maps, gen->nr_maps); + gen->error = -EFAULT; + return gen->error; + } + emit_sys_close_stack(gen, stack_off(btf_fd)); + for (i = 0; i < gen->nr_progs; i++) + move_stack2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * gen->nr_maps + + sizeof(struct bpf_prog_desc) * i + + offsetof(struct bpf_prog_desc, prog_fd), 4, + stack_off(prog_fd[i])); + for (i = 0; i < gen->nr_maps; i++) + move_blob2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * i + + offsetof(struct bpf_map_desc, map_fd), 4, + blob_fd_array_off(gen, i)); + emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); + emit(gen, BPF_EXIT_INSN()); + pr_debug("gen: finish %d\n", gen->error); + if (!gen->error) { + struct gen_loader_opts *opts = gen->opts; + + opts->insns = gen->insn_start; + opts->insns_sz = gen->insn_cur - gen->insn_start; + opts->data = gen->data_start; + opts->data_sz = gen->data_cur - gen->data_start; + } + return gen->error; +} + +void bpf_gen__free(struct bpf_gen *gen) +{ + if (!gen) + return; + free(gen->data_start); + free(gen->insn_start); + free(gen); +} + +void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, + __u32 btf_raw_size) +{ + int attr_size = offsetofend(union bpf_attr, btf_log_level); + int btf_data, btf_load_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: load_btf: size %d\n", btf_raw_size); + btf_data = add_data(gen, btf_raw_data, btf_raw_size); + + attr.btf_size = btf_raw_size; + btf_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with a pointer to the BTF data */ + emit_rel_store(gen, attr_field(btf_load_attr, btf), btf_data); + /* emit BTF_LOAD command */ + emit_sys_bpf(gen, BPF_BTF_LOAD, btf_load_attr, attr_size); + debug_ret(gen, "btf_load size %d", btf_raw_size); + emit_check_err(gen); + /* remember btf_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(btf_fd))); +} + +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_extra); + bool close_inner_map_fd = false; + int map_create_attr, idx; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.map_flags = map_attr->map_flags; + attr.map_extra = map_attr->map_extra; + if (map_name) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.numa_node = map_attr->numa_node; + attr.map_ifindex = map_attr->map_ifindex; + attr.max_entries = max_entries; + attr.btf_key_type_id = map_attr->btf_key_type_id; + attr.btf_value_type_id = map_attr->btf_value_type_id; + + pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n", + attr.map_name, map_idx, map_type, attr.btf_value_type_id); + + map_create_attr = add_data(gen, &attr, attr_size); + if (attr.btf_value_type_id) + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(map_create_attr, btf_fd), 4, + stack_off(btf_fd)); + switch (attr.map_type) { + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + move_stack2blob(gen, attr_field(map_create_attr, inner_map_fd), 4, + stack_off(inner_map_fd)); + close_inner_map_fd = true; + break; + default: + break; + } + /* conditionally update max_entries */ + if (map_idx >= 0) + move_ctx2blob(gen, attr_field(map_create_attr, max_entries), 4, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, max_entries), + true /* check that max_entries != 0 */); + /* emit MAP_CREATE command */ + emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size); + debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d", + attr.map_name, map_idx, map_type, value_size, + attr.btf_value_type_id); + emit_check_err(gen); + /* remember map_fd in the stack, if successful */ + if (map_idx < 0) { + /* This bpf_gen__map_create() function is called with map_idx >= 0 + * for all maps that libbpf loading logic tracks. + * It's called with -1 to create an inner map. + */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(inner_map_fd))); + } else if (map_idx != gen->nr_maps) { + gen->error = -EDOM; /* internal bug */ + return; + } else { + /* add_map_fd does gen->nr_maps++ */ + idx = add_map_fd(gen); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, idx))); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_7, 0)); + } + if (close_inner_map_fd) + emit_sys_close_stack(gen, stack_off(inner_map_fd)); +} + +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, + enum bpf_attach_type type) +{ + const char *prefix; + int kind, ret; + + btf_get_kernel_prefix_kind(type, &prefix, &kind); + gen->attach_kind = kind; + ret = snprintf(gen->attach_target, sizeof(gen->attach_target), "%s%s", + prefix, attach_name); + if (ret >= sizeof(gen->attach_target)) + gen->error = -ENOSPC; +} + +static void emit_find_attach_target(struct bpf_gen *gen) +{ + int name, len = strlen(gen->attach_target) + 1; + + pr_debug("gen: find_attach_tgt %s %d\n", gen->attach_target, gen->attach_kind); + name = add_data(gen, gen->attach_target, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, gen->attach_kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", + gen->attach_target, gen->attach_kind); + emit_check_err(gen); + /* if successful, btf_id is in lower 32-bit of R7 and + * btf_obj_fd is in upper 32-bit + */ +} + +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, int kind, int insn_idx) +{ + struct ksym_relo_desc *relo; + + relo = libbpf_reallocarray(gen->relos, gen->relo_cnt + 1, sizeof(*relo)); + if (!relo) { + gen->error = -ENOMEM; + return; + } + gen->relos = relo; + relo += gen->relo_cnt; + relo->name = name; + relo->is_weak = is_weak; + relo->is_typeless = is_typeless; + relo->kind = kind; + relo->insn_idx = insn_idx; + gen->relo_cnt++; +} + +/* returns existing ksym_desc with ref incremented, or inserts a new one */ +static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + struct ksym_desc *kdesc; + int i; + + for (i = 0; i < gen->nr_ksyms; i++) { + if (!strcmp(gen->ksyms[i].name, relo->name)) { + gen->ksyms[i].ref++; + return &gen->ksyms[i]; + } + } + kdesc = libbpf_reallocarray(gen->ksyms, gen->nr_ksyms + 1, sizeof(*kdesc)); + if (!kdesc) { + gen->error = -ENOMEM; + return NULL; + } + gen->ksyms = kdesc; + kdesc = &gen->ksyms[gen->nr_ksyms++]; + kdesc->name = relo->name; + kdesc->kind = relo->kind; + kdesc->ref = 1; + kdesc->off = 0; + kdesc->insn = 0; + return kdesc; +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + */ +static void emit_bpf_find_by_name_kind(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1; + + name_off = add_data(gen, relo->name, len); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, relo->kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", relo->name, relo->kind); +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + * Returns u64 symbol addr in BPF_REG_9 + */ +static void emit_bpf_kallsyms_lookup_name(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1, res_off; + + name_off = add_data(gen, relo->name, len); + res_off = add_data(gen, NULL, 8); /* res is u64 */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_4, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, res_off)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_4)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_kallsyms_lookup_name)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "kallsyms_lookup_name(%s,%d)", relo->name, relo->kind); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + * + * We need to reuse BTF fd for same symbol otherwise each relocation takes a new + * index, while kernel limits total kfunc BTFs to 256. For duplicate symbols, + * this would mean a new BTF fd index for each entry. By pairing symbol name + * with index, we get the insn->imm, insn->off pairing that kernel uses for + * kfunc_tab, which becomes the effective limit even though all of them may + * share same index in fd_array (such that kfunc_btf_tab has 1 element). + */ +static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + int btf_fd_idx; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing bpf_insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + offsetof(struct bpf_insn, off), 2, + kdesc->insn + offsetof(struct bpf_insn, off)); + goto log; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* get index in fd_array to store BTF FD at */ + btf_fd_idx = add_kfunc_btf_fd(gen); + if (btf_fd_idx > INT16_MAX) { + pr_warn("BTF fd off %d for kfunc %s exceeds INT16_MAX, cannot process relocation\n", + btf_fd_idx, relo->name); + gen->error = -E2BIG; + return; + } + kdesc->off = btf_fd_idx; + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set value for imm, off as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 10)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* obtain fd in BPF_REG_9 */ + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + /* jump to fd_array store if fd denotes module BTF */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); + /* set the default value for off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip BTF fd store for vmlinux BTF */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); + /* load fd_array slot pointer */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); + /* store BTF fd in slot */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + /* store index into insn[insn_idx].off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); +log: + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, + offsetof(struct bpf_insn, off))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " func (%s:count=%d): imm: %%d, off: %%d", + relo->name, kdesc->ref); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, kdesc->off))); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_0, 0)); + debug_regs(gen, BPF_REG_9, -1, " func (%s:count=%d): btf_fd", + relo->name, kdesc->ref); +} + +static void emit_ksym_relo_log(struct bpf_gen *gen, struct ksym_relo_desc *relo, + int ref) +{ + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, sizeof(struct bpf_insn) + + offsetof(struct bpf_insn, imm))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " var t=%d w=%d (%s:count=%d): imm[0]: %%d, imm[1]: %%d", + relo->is_typeless, relo->is_weak, relo->name, ref); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + debug_regs(gen, BPF_REG_9, -1, " var t=%d w=%d (%s:count=%d): insn.reg", + relo->is_typeless, relo->is_weak, relo->name, ref); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_typeless(struct bpf_gen *gen, + struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + goto log; + } + /* remember insn offset, so we can copy ksym addr later */ + kdesc->insn = insn; + /* skip typeless ksym_desc in fd closing loop in cleanup_relos */ + kdesc->typeless = true; + emit_bpf_kallsyms_lookup_name(gen, relo); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_7, -ENOENT, 1)); + emit_check_err(gen); + /* store lower half of addr into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, offsetof(struct bpf_insn, imm))); + /* store upper half of addr into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); +log: + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +static __u32 src_reg_mask(void) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + return 0x0f; /* src_reg,dst_reg,... */ +#elif defined(__BIG_ENDIAN_BITFIELD) + return 0xf0; /* dst_reg,src_reg,... */ +#else +#error "Unsupported bit endianness, cannot proceed" +#endif +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + __u32 reg_mask; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + /* jump over src_reg adjustment if imm is not 0, reuse BPF_REG_0 from move_blob2blob */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); + goto clear_src_reg; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set values for insn[insn_idx].imm, insn[insn_idx + 1].imm as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* store btf_obj_fd into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); + /* skip src_reg adjustment */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); +clear_src_reg: + /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ + reg_mask = src_reg_mask(); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + emit(gen, BPF_ALU32_IMM(BPF_AND, BPF_REG_9, reg_mask)); + emit(gen, BPF_STX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, offsetofend(struct bpf_insn, code))); + + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +void bpf_gen__record_relo_core(struct bpf_gen *gen, + const struct bpf_core_relo *core_relo) +{ + struct bpf_core_relo *relos; + + relos = libbpf_reallocarray(gen->core_relos, gen->core_relo_cnt + 1, sizeof(*relos)); + if (!relos) { + gen->error = -ENOMEM; + return; + } + gen->core_relos = relos; + relos += gen->core_relo_cnt; + memcpy(relos, core_relo, sizeof(*relos)); + gen->core_relo_cnt++; +} + +static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insns) +{ + int insn; + + pr_debug("gen: emit_relo (%d): %s at %d\n", relo->kind, relo->name, relo->insn_idx); + insn = insns + sizeof(struct bpf_insn) * relo->insn_idx; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_8, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn)); + switch (relo->kind) { + case BTF_KIND_VAR: + if (relo->is_typeless) + emit_relo_ksym_typeless(gen, relo, insn); + else + emit_relo_ksym_btf(gen, relo, insn); + break; + case BTF_KIND_FUNC: + emit_relo_kfunc_btf(gen, relo, insn); + break; + default: + pr_warn("Unknown relocation kind '%d'\n", relo->kind); + gen->error = -EDOM; + return; + } +} + +static void emit_relos(struct bpf_gen *gen, int insns) +{ + int i; + + for (i = 0; i < gen->relo_cnt; i++) + emit_relo(gen, gen->relos + i, insns); +} + +static void cleanup_core_relo(struct bpf_gen *gen) +{ + if (!gen->core_relo_cnt) + return; + free(gen->core_relos); + gen->core_relo_cnt = 0; + gen->core_relos = NULL; +} + +static void cleanup_relos(struct bpf_gen *gen, int insns) +{ + int i, insn; + + for (i = 0; i < gen->nr_ksyms; i++) { + /* only close fds for typed ksyms and kfuncs */ + if (gen->ksyms[i].kind == BTF_KIND_VAR && !gen->ksyms[i].typeless) { + /* close fd recorded in insn[insn_idx + 1].imm */ + insn = gen->ksyms[i].insn; + insn += sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm); + emit_sys_close_blob(gen, insn); + } else if (gen->ksyms[i].kind == BTF_KIND_FUNC) { + emit_sys_close_blob(gen, blob_fd_array_off(gen, gen->ksyms[i].off)); + if (gen->ksyms[i].off < MAX_FD_ARRAY_SZ) + gen->nr_fd_array--; + } + } + if (gen->nr_ksyms) { + free(gen->ksyms); + gen->nr_ksyms = 0; + gen->ksyms = NULL; + } + if (gen->relo_cnt) { + free(gen->relos); + gen->relo_cnt = 0; + gen->relos = NULL; + } + cleanup_core_relo(gen); +} + +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx) +{ + int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos; + int attr_size = offsetofend(union bpf_attr, core_relo_rec_size); + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n", + prog_type, insn_cnt, prog_idx); + /* add license string to blob of bytes */ + license_off = add_data(gen, license, strlen(license) + 1); + /* add insns to blob of bytes */ + insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn)); + + attr.prog_type = prog_type; + attr.expected_attach_type = load_attr->expected_attach_type; + attr.attach_btf_id = load_attr->attach_btf_id; + attr.prog_ifindex = load_attr->prog_ifindex; + attr.kern_version = 0; + attr.insn_cnt = (__u32)insn_cnt; + attr.prog_flags = load_attr->prog_flags; + + attr.func_info_rec_size = load_attr->func_info_rec_size; + attr.func_info_cnt = load_attr->func_info_cnt; + func_info = add_data(gen, load_attr->func_info, + attr.func_info_cnt * attr.func_info_rec_size); + + attr.line_info_rec_size = load_attr->line_info_rec_size; + attr.line_info_cnt = load_attr->line_info_cnt; + line_info = add_data(gen, load_attr->line_info, + attr.line_info_cnt * attr.line_info_rec_size); + + attr.core_relo_rec_size = sizeof(struct bpf_core_relo); + attr.core_relo_cnt = gen->core_relo_cnt; + core_relos = add_data(gen, gen->core_relos, + attr.core_relo_cnt * attr.core_relo_rec_size); + + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + prog_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with a pointer to license */ + emit_rel_store(gen, attr_field(prog_load_attr, license), license_off); + + /* populate union bpf_attr with a pointer to instructions */ + emit_rel_store(gen, attr_field(prog_load_attr, insns), insns_off); + + /* populate union bpf_attr with a pointer to func_info */ + emit_rel_store(gen, attr_field(prog_load_attr, func_info), func_info); + + /* populate union bpf_attr with a pointer to line_info */ + emit_rel_store(gen, attr_field(prog_load_attr, line_info), line_info); + + /* populate union bpf_attr with a pointer to core_relos */ + emit_rel_store(gen, attr_field(prog_load_attr, core_relos), core_relos); + + /* populate union bpf_attr fd_array with a pointer to data where map_fds are saved */ + emit_rel_store(gen, attr_field(prog_load_attr, fd_array), gen->fd_array); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(prog_load_attr, log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(prog_load_attr, prog_btf_fd), 4, + stack_off(btf_fd)); + if (gen->attach_kind) { + emit_find_attach_target(gen); + /* populate union bpf_attr with btf_id and btf_obj_fd found by helper */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, prog_load_attr)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_id))); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_obj_fd))); + } + emit_relos(gen, insns_off); + /* emit PROG_LOAD command */ + emit_sys_bpf(gen, BPF_PROG_LOAD, prog_load_attr, attr_size); + debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt); + /* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */ + cleanup_relos(gen, insns_off); + if (gen->attach_kind) { + emit_sys_close_blob(gen, + attr_field(prog_load_attr, attach_btf_obj_fd)); + gen->attach_kind = 0; + } + emit_check_err(gen); + /* remember prog_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(prog_fd[gen->nr_progs]))); + gen->nr_progs++; +} + +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue, + __u32 value_size) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, value, key; + union bpf_attr attr; + int zero = 0; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_update_elem: idx %d\n", map_idx); + + value = add_data(gen, pvalue, value_size); + key = add_data(gen, &zero, sizeof(zero)); + + /* if (map_desc[map_idx].initial_value) { + * if (ctx->flags & BPF_SKEL_KERNEL) + * bpf_probe_read_kernel(value, value_size, initial_value); + * else + * bpf_copy_from_user(value, value_size, initial_value); + * } + */ + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, initial_value))); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, value)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, + offsetof(struct bpf_loader_ctx, flags))); + emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), value); + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "update_elem idx %d value_size %d", map_idx, value_size); + emit_check_err(gen); +} + +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slot, + int inner_map_idx) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, key; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n", + outer_map_idx, slot, inner_map_idx); + + key = add_data(gen, &slot, sizeof(slot)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, outer_map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), + blob_fd_array_off(gen, inner_map_idx)); + + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "populate_outer_map outer %d key %d inner %d", + outer_map_idx, slot, inner_map_idx); + emit_check_err(gen); +} + +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_fd); + int map_freeze_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_freeze: idx %d\n", map_idx); + map_freeze_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_freeze_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + /* emit MAP_FREEZE command */ + emit_sys_bpf(gen, BPF_MAP_FREEZE, map_freeze_attr, attr_size); + debug_ret(gen, "map_freeze"); + emit_check_err(gen); +} diff --git a/src/hashmap.c b/src/hashmap.c new file mode 100644 index 0000000..140ee40 --- /dev/null +++ b/src/hashmap.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Generic non-thread safe hash map implementation. + * + * Copyright (c) 2019 Facebook + */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <linux/err.h> +#include "hashmap.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +/* start with 4 buckets */ +#define HASHMAP_MIN_CAP_BITS 2 + +static void hashmap_add_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + entry->next = *pprev; + *pprev = entry; +} + +static void hashmap_del_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + *pprev = entry->next; + entry->next = NULL; +} + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx) +{ + map->hash_fn = hash_fn; + map->equal_fn = equal_fn; + map->ctx = ctx; + + map->buckets = NULL; + map->cap = 0; + map->cap_bits = 0; + map->sz = 0; +} + +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx) +{ + struct hashmap *map = malloc(sizeof(struct hashmap)); + + if (!map) + return ERR_PTR(-ENOMEM); + hashmap__init(map, hash_fn, equal_fn, ctx); + return map; +} + +void hashmap__clear(struct hashmap *map) +{ + struct hashmap_entry *cur, *tmp; + size_t bkt; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + free(cur); + } + free(map->buckets); + map->buckets = NULL; + map->cap = map->cap_bits = map->sz = 0; +} + +void hashmap__free(struct hashmap *map) +{ + if (IS_ERR_OR_NULL(map)) + return; + + hashmap__clear(map); + free(map); +} + +size_t hashmap__size(const struct hashmap *map) +{ + return map->sz; +} + +size_t hashmap__capacity(const struct hashmap *map) +{ + return map->cap; +} + +static bool hashmap_needs_to_grow(struct hashmap *map) +{ + /* grow if empty or more than 75% filled */ + return (map->cap == 0) || ((map->sz + 1) * 4 / 3 > map->cap); +} + +static int hashmap_grow(struct hashmap *map) +{ + struct hashmap_entry **new_buckets; + struct hashmap_entry *cur, *tmp; + size_t new_cap_bits, new_cap; + size_t h, bkt; + + new_cap_bits = map->cap_bits + 1; + if (new_cap_bits < HASHMAP_MIN_CAP_BITS) + new_cap_bits = HASHMAP_MIN_CAP_BITS; + + new_cap = 1UL << new_cap_bits; + new_buckets = calloc(new_cap, sizeof(new_buckets[0])); + if (!new_buckets) + return -ENOMEM; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + h = hash_bits(map->hash_fn(cur->key, map->ctx), new_cap_bits); + hashmap_add_entry(&new_buckets[h], cur); + } + + map->cap = new_cap; + map->cap_bits = new_cap_bits; + free(map->buckets); + map->buckets = new_buckets; + + return 0; +} + +static bool hashmap_find_entry(const struct hashmap *map, + const long key, size_t hash, + struct hashmap_entry ***pprev, + struct hashmap_entry **entry) +{ + struct hashmap_entry *cur, **prev_ptr; + + if (!map->buckets) + return false; + + for (prev_ptr = &map->buckets[hash], cur = *prev_ptr; + cur; + prev_ptr = &cur->next, cur = cur->next) { + if (map->equal_fn(cur->key, key, map->ctx)) { + if (pprev) + *pprev = prev_ptr; + *entry = cur; + return true; + } + } + + return false; +} + +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value) +{ + struct hashmap_entry *entry; + size_t h; + int err; + + if (old_key) + *old_key = 0; + if (old_value) + *old_value = 0; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (strategy != HASHMAP_APPEND && + hashmap_find_entry(map, key, h, NULL, &entry)) { + if (old_key) + *old_key = entry->key; + if (old_value) + *old_value = entry->value; + + if (strategy == HASHMAP_SET || strategy == HASHMAP_UPDATE) { + entry->key = key; + entry->value = value; + return 0; + } else if (strategy == HASHMAP_ADD) { + return -EEXIST; + } + } + + if (strategy == HASHMAP_UPDATE) + return -ENOENT; + + if (hashmap_needs_to_grow(map)) { + err = hashmap_grow(map); + if (err) + return err; + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + } + + entry = malloc(sizeof(struct hashmap_entry)); + if (!entry) + return -ENOMEM; + + entry->key = key; + entry->value = value; + hashmap_add_entry(&map->buckets[h], entry); + map->sz++; + + return 0; +} + +bool hashmap_find(const struct hashmap *map, long key, long *value) +{ + struct hashmap_entry *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, NULL, &entry)) + return false; + + if (value) + *value = entry->value; + return true; +} + +bool hashmap_delete(struct hashmap *map, long key, + long *old_key, long *old_value) +{ + struct hashmap_entry **pprev, *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, &pprev, &entry)) + return false; + + if (old_key) + *old_key = entry->key; + if (old_value) + *old_value = entry->value; + + hashmap_del_entry(pprev, entry); + free(entry); + map->sz--; + + return true; +} diff --git a/src/hashmap.h b/src/hashmap.h new file mode 100644 index 0000000..0a5bf19 --- /dev/null +++ b/src/hashmap.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Generic non-thread safe hash map implementation. + * + * Copyright (c) 2019 Facebook + */ +#ifndef __LIBBPF_HASHMAP_H +#define __LIBBPF_HASHMAP_H + +#include <stdbool.h> +#include <stddef.h> +#include <limits.h> + +static inline size_t hash_bits(size_t h, int bits) +{ + /* shuffle bits and return requested number of upper bits */ + if (bits == 0) + return 0; + +#if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) + /* LP64 case */ + return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); +#elif (__SIZEOF_SIZE_T__ <= __SIZEOF_LONG__) + return (h * 2654435769lu) >> (__SIZEOF_LONG__ * 8 - bits); +#else +# error "Unsupported size_t size" +#endif +} + +/* generic C-string hashing function */ +static inline size_t str_hash(const char *s) +{ + size_t h = 0; + + while (*s) { + h = h * 31 + *s; + s++; + } + return h; +} + +typedef size_t (*hashmap_hash_fn)(long key, void *ctx); +typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx); + +/* + * Hashmap interface is polymorphic, keys and values could be either + * long-sized integers or pointers, this is achieved as follows: + * - interface functions that operate on keys and values are hidden + * behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert; + * - these auxiliary macros cast the key and value parameters as + * long or long *, so the user does not have to specify the casts explicitly; + * - for pointer parameters (e.g. old_key) the size of the pointed + * type is verified by hashmap_cast_ptr using _Static_assert; + * - when iterating using hashmap__for_each_* forms + * hasmap_entry->key should be used for integer keys and + * hasmap_entry->pkey should be used for pointer keys, + * same goes for values. + */ +struct hashmap_entry { + union { + long key; + const void *pkey; + }; + union { + long value; + void *pvalue; + }; + struct hashmap_entry *next; +}; + +struct hashmap { + hashmap_hash_fn hash_fn; + hashmap_equal_fn equal_fn; + void *ctx; + + struct hashmap_entry **buckets; + size_t cap; + size_t cap_bits; + size_t sz; +}; + +#define HASHMAP_INIT(hash_fn, equal_fn, ctx) { \ + .hash_fn = (hash_fn), \ + .equal_fn = (equal_fn), \ + .ctx = (ctx), \ + .buckets = NULL, \ + .cap = 0, \ + .cap_bits = 0, \ + .sz = 0, \ +} + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx); +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx); +void hashmap__clear(struct hashmap *map); +void hashmap__free(struct hashmap *map); + +size_t hashmap__size(const struct hashmap *map); +size_t hashmap__capacity(const struct hashmap *map); + +/* + * Hashmap insertion strategy: + * - HASHMAP_ADD - only add key/value if key doesn't exist yet; + * - HASHMAP_SET - add key/value pair if key doesn't exist yet; otherwise, + * update value; + * - HASHMAP_UPDATE - update value, if key already exists; otherwise, do + * nothing and return -ENOENT; + * - HASHMAP_APPEND - always add key/value pair, even if key already exists. + * This turns hashmap into a multimap by allowing multiple values to be + * associated with the same key. Most useful read API for such hashmap is + * hashmap__for_each_key_entry() iteration. If hashmap__find() is still + * used, it will return last inserted key/value entry (first in a bucket + * chain). + */ +enum hashmap_insert_strategy { + HASHMAP_ADD, + HASHMAP_SET, + HASHMAP_UPDATE, + HASHMAP_APPEND, +}; + +#define hashmap_cast_ptr(p) ({ \ + _Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) || \ + sizeof(*(p)) == sizeof(long), \ + #p " pointee should be a long-sized integer or a pointer"); \ + (long *)(p); \ +}) + +/* + * hashmap__insert() adds key/value entry w/ various semantics, depending on + * provided strategy value. If a given key/value pair replaced already + * existing key/value pair, both old key and old value will be returned + * through old_key and old_value to allow calling code do proper memory + * management. + */ +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value); + +#define hashmap__insert(map, key, value, strategy, old_key, old_value) \ + hashmap_insert((map), (long)(key), (long)(value), (strategy), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +#define hashmap__add(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL) + +#define hashmap__set(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value)) + +#define hashmap__update(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value)) + +#define hashmap__append(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL) + +bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value); + +#define hashmap__delete(map, key, old_key, old_value) \ + hashmap_delete((map), (long)(key), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +bool hashmap_find(const struct hashmap *map, long key, long *value); + +#define hashmap__find(map, key, value) \ + hashmap_find((map), (long)(key), hashmap_cast_ptr(value)) + +/* + * hashmap__for_each_entry - iterate over all entries in hashmap + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry(map, cur, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; cur; cur = cur->next) + +/* + * hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe + * against removals + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @tmp: struct hashmap_entry * used as a temporary next cursor storage + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry_safe(map, cur, tmp, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; \ + cur && ({tmp = cur->next; true; }); \ + cur = tmp) + +/* + * hashmap__for_each_key_entry - iterate over entries associated with given key + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @key: key to iterate entries for + */ +#define hashmap__for_each_key_entry(map, cur, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur; \ + cur = cur->next) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur && ({ tmp = cur->next; true; }); \ + cur = tmp) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#endif /* __LIBBPF_HASHMAP_H */ diff --git a/src/libbpf.c b/src/libbpf.c new file mode 100644 index 0000000..a5c67a3 --- /dev/null +++ b/src/libbpf.c @@ -0,0 +1,12490 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. + * Copyright (C) 2019 Isovalent, Inc. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <libgen.h> +#include <inttypes.h> +#include <limits.h> +#include <string.h> +#include <unistd.h> +#include <endian.h> +#include <fcntl.h> +#include <errno.h> +#include <ctype.h> +#include <asm/unistd.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/limits.h> +#include <linux/perf_event.h> +#include <linux/ring_buffer.h> +#include <linux/version.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <sys/utsname.h> +#include <sys/resource.h> +#include <libelf.h> +#include <gelf.h> +#include <zlib.h> + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" + +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + +#define BPF_INSN_SZ (sizeof(struct bpf_insn)) + +/* vsprintf() in __base_pr() uses nonliteral format string. It may break + * compilation if user enables corresponding warning. Disable it explicitly. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + +#define __printf(a, b) __attribute__((format(printf, a, b))) + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); + +static const char * const attach_type_name[] = { + [BPF_CGROUP_INET_INGRESS] = "cgroup_inet_ingress", + [BPF_CGROUP_INET_EGRESS] = "cgroup_inet_egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "cgroup_inet_sock_create", + [BPF_CGROUP_INET_SOCK_RELEASE] = "cgroup_inet_sock_release", + [BPF_CGROUP_SOCK_OPS] = "cgroup_sock_ops", + [BPF_CGROUP_DEVICE] = "cgroup_device", + [BPF_CGROUP_INET4_BIND] = "cgroup_inet4_bind", + [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", + [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", + [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", + [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", + [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", + [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", + [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", + [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", + [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "trace_raw_tp", + [BPF_TRACE_FENTRY] = "trace_fentry", + [BPF_TRACE_FEXIT] = "trace_fexit", + [BPF_MODIFY_RETURN] = "modify_return", + [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", + [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", + [BPF_PERF_EVENT] = "perf_event", + [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", +}; + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] = "cgroup", + [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", +}; + +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", + [BPF_MAP_TYPE_QUEUE] = "queue", + [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", + [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", + [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", +}; + +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", + [BPF_PROG_TYPE_SK_MSG] = "sk_msg", + [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", + [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", + [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", + [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", +}; + +static int __base_pr(enum libbpf_print_level level, const char *format, + va_list args) +{ + if (level == LIBBPF_DEBUG) + return 0; + + return vfprintf(stderr, format, args); +} + +static libbpf_print_fn_t __libbpf_pr = __base_pr; + +libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn) +{ + libbpf_print_fn_t old_print_fn = __libbpf_pr; + + __libbpf_pr = fn; + return old_print_fn; +} + +__printf(2, 3) +void libbpf_print(enum libbpf_print_level level, const char *format, ...) +{ + va_list args; + int old_errno; + + if (!__libbpf_pr) + return; + + old_errno = errno; + + va_start(args, format); + __libbpf_pr(level, format, args); + va_end(args); + + errno = old_errno; +} + +static void pr_perm_msg(int err) +{ + struct rlimit limit; + char buf[100]; + + if (err != -EPERM || geteuid() != 0) + return; + + err = getrlimit(RLIMIT_MEMLOCK, &limit); + if (err) + return; + + if (limit.rlim_cur == RLIM_INFINITY) + return; + + if (limit.rlim_cur < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", (size_t)limit.rlim_cur); + else if (limit.rlim_cur < 1024*1024) + snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); + else + snprintf(buf, sizeof(buf), "%.1f MiB", (double)limit.rlim_cur / (1024*1024)); + + pr_warn("permission error while running as root; try raising 'ulimit -l'? current value: %s\n", + buf); +} + +#define STRERR_BUFSIZE 128 + +/* Copied from tools/perf/util/util.h */ +#ifndef zfree +# define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) +#endif + +#ifndef zclose +# define zclose(fd) ({ \ + int ___err = 0; \ + if ((fd) >= 0) \ + ___err = close((fd)); \ + fd = -1; \ + ___err; }) +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int libbpf_set_strict_mode(enum libbpf_strict_mode mode) +{ + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ + return 0; +} + +__u32 libbpf_major_version(void) +{ + return LIBBPF_MAJOR_VERSION; +} + +__u32 libbpf_minor_version(void) +{ + return LIBBPF_MINOR_VERSION; +} + +const char *libbpf_version_string(void) +{ +#define __S(X) #X +#define _S(X) __S(X) + return "v" _S(LIBBPF_MAJOR_VERSION) "." _S(LIBBPF_MINOR_VERSION); +#undef _S +#undef __S +} + +enum reloc_type { + RELO_LD64, + RELO_CALL, + RELO_DATA, + RELO_EXTERN_VAR, + RELO_EXTERN_FUNC, + RELO_SUBPROG_ADDR, + RELO_CORE, +}; + +struct reloc_desc { + enum reloc_type type; + int insn_idx; + union { + const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ + struct { + int map_idx; + int sym_off; + }; + }; +}; + +/* stored as sec_def->cookie for all libbpf-supported SEC()s */ +enum sec_def_flags { + SEC_NONE = 0, + /* expected_attach_type is optional, if kernel doesn't support that */ + SEC_EXP_ATTACH_OPT = 1, + /* legacy, only used by libbpf_get_type_names() and + * libbpf_attach_type_by_name(), not used by libbpf itself at all. + * This used to be associated with cgroup (and few other) BPF programs + * that were attachable through BPF_PROG_ATTACH command. Pretty + * meaningless nowadays, though. + */ + SEC_ATTACHABLE = 2, + SEC_ATTACHABLE_OPT = SEC_ATTACHABLE | SEC_EXP_ATTACH_OPT, + /* attachment target is specified through BTF ID in either kernel or + * other BPF program's BTF object + */ + SEC_ATTACH_BTF = 4, + /* BPF program type allows sleeping/blocking in kernel */ + SEC_SLEEPABLE = 8, + /* BPF program support non-linear XDP buffer */ + SEC_XDP_FRAGS = 16, +}; + +struct bpf_sec_def { + char *sec; + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + long cookie; + int handler_id; + + libbpf_prog_setup_fn_t prog_setup_fn; + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + libbpf_prog_attach_fn_t prog_attach_fn; +}; + +/* + * bpf_prog should be a better name but it has been used in + * linux/filter.h. + */ +struct bpf_program { + char *name; + char *sec_name; + size_t sec_idx; + const struct bpf_sec_def *sec_def; + /* this program's instruction offset (in number of instructions) + * within its containing ELF section + */ + size_t sec_insn_off; + /* number of original instructions in ELF section belonging to this + * program, not taking into account subprogram instructions possible + * appended later during relocation + */ + size_t sec_insn_cnt; + /* Offset (in number of instructions) of the start of instruction + * belonging to this BPF program within its containing main BPF + * program. For the entry-point (main) BPF program, this is always + * zero. For a sub-program, this gets reset before each of main BPF + * programs are processed and relocated and is used to determined + * whether sub-program was already appended to the main program, and + * if yes, at which instruction offset. + */ + size_t sub_insn_off; + + /* instructions that belong to BPF program; insns[0] is located at + * sec_insn_off instruction within its ELF section in ELF file, so + * when mapping ELF file instruction index to the local instruction, + * one needs to subtract sec_insn_off; and vice versa. + */ + struct bpf_insn *insns; + /* actual number of instruction in this BPF program's image; for + * entry-point BPF programs this includes the size of main program + * itself plus all the used sub-programs, appended at the end + */ + size_t insns_cnt; + + struct reloc_desc *reloc_desc; + int nr_reloc; + + /* BPF verifier log settings */ + char *log_buf; + size_t log_size; + __u32 log_level; + + struct bpf_object *obj; + + int fd; + bool autoload; + bool autoattach; + bool mark_btf_static; + enum bpf_prog_type type; + enum bpf_attach_type expected_attach_type; + + int prog_ifindex; + __u32 attach_btf_obj_fd; + __u32 attach_btf_id; + __u32 attach_prog_fd; + + void *func_info; + __u32 func_info_rec_size; + __u32 func_info_cnt; + + void *line_info; + __u32 line_info_rec_size; + __u32 line_info_cnt; + __u32 prog_flags; +}; + +struct bpf_struct_ops { + const char *tname; + const struct btf_type *type; + struct bpf_program **progs; + __u32 *kern_func_off; + /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */ + void *data; + /* e.g. struct bpf_struct_ops_tcp_congestion_ops in + * btf_vmlinux's format. + * struct bpf_struct_ops_tcp_congestion_ops { + * [... some other kernel fields ...] + * struct tcp_congestion_ops data; + * } + * kern_vdata-size == sizeof(struct bpf_struct_ops_tcp_congestion_ops) + * bpf_map__init_kern_struct_ops() will populate the "kern_vdata" + * from "data". + */ + void *kern_vdata; + __u32 type_id; +}; + +#define DATA_SEC ".data" +#define BSS_SEC ".bss" +#define RODATA_SEC ".rodata" +#define KCONFIG_SEC ".kconfig" +#define KSYMS_SEC ".ksyms" +#define STRUCT_OPS_SEC ".struct_ops" + +enum libbpf_map_type { + LIBBPF_MAP_UNSPEC, + LIBBPF_MAP_DATA, + LIBBPF_MAP_BSS, + LIBBPF_MAP_RODATA, + LIBBPF_MAP_KCONFIG, +}; + +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + +struct bpf_map { + struct bpf_object *obj; + char *name; + /* real_name is defined for special internal maps (.rodata*, + * .data*, .bss, .kconfig) and preserves their original ELF section + * name. This is important to be able to find corresponding BTF + * DATASEC information. + */ + char *real_name; + int fd; + int sec_idx; + size_t sec_offset; + int map_ifindex; + int inner_map_fd; + struct bpf_map_def def; + __u32 numa_node; + __u32 btf_var_idx; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + enum libbpf_map_type libbpf_type; + void *mmaped; + struct bpf_struct_ops *st_ops; + struct bpf_map *inner_map; + void **init_slots; + int init_slots_sz; + char *pin_path; + bool pinned; + bool reused; + bool autocreate; + __u64 map_extra; +}; + +enum extern_type { + EXT_UNKNOWN, + EXT_KCFG, + EXT_KSYM, +}; + +enum kcfg_type { + KCFG_UNKNOWN, + KCFG_CHAR, + KCFG_BOOL, + KCFG_INT, + KCFG_TRISTATE, + KCFG_CHAR_ARR, +}; + +struct extern_desc { + enum extern_type type; + int sym_idx; + int btf_id; + int sec_btf_id; + const char *name; + bool is_set; + bool is_weak; + union { + struct { + enum kcfg_type type; + int sz; + int align; + int data_off; + bool is_signed; + } kcfg; + struct { + unsigned long long addr; + + /* target btf_id of the corresponding kernel var. */ + int kernel_btf_obj_fd; + int kernel_btf_id; + + /* local btf_id of the ksym extern's type. */ + __u32 type_id; + /* BTF fd index to be patched in for insn->off, this is + * 0 for vmlinux BTF, index in obj->fd_array for module + * BTF + */ + __s16 btf_fd_idx; + } ksym; + }; +}; + +struct module_btf { + struct btf *btf; + char *name; + __u32 id; + int fd; + int fd_array_idx; +}; + +enum sec_type { + SEC_UNUSED = 0, + SEC_RELO, + SEC_BSS, + SEC_DATA, + SEC_RODATA, +}; + +struct elf_sec_desc { + enum sec_type sec_type; + Elf64_Shdr *shdr; + Elf_Data *data; +}; + +struct elf_state { + int fd; + const void *obj_buf; + size_t obj_buf_sz; + Elf *elf; + Elf64_Ehdr *ehdr; + Elf_Data *symbols; + Elf_Data *st_ops_data; + size_t shstrndx; /* section index for section name strings */ + size_t strtabidx; + struct elf_sec_desc *secs; + size_t sec_cnt; + int btf_maps_shndx; + __u32 btf_maps_sec_btf_id; + int text_shndx; + int symbols_shndx; + int st_ops_shndx; +}; + +struct usdt_manager; + +struct bpf_object { + char name[BPF_OBJ_NAME_LEN]; + char license[64]; + __u32 kern_version; + + struct bpf_program *programs; + size_t nr_programs; + struct bpf_map *maps; + size_t nr_maps; + size_t maps_cap; + + char *kconfig; + struct extern_desc *externs; + int nr_extern; + int kconfig_map_idx; + + bool loaded; + bool has_subcalls; + bool has_rodata; + + struct bpf_gen *gen_loader; + + /* Information when doing ELF related work. Only valid if efile.elf is not NULL */ + struct elf_state efile; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* Parse and load BTF vmlinux if any of the programs in the object need + * it at load time. + */ + struct btf *btf_vmlinux; + /* Path to the custom BTF to be used for BPF CO-RE relocations as an + * override for vmlinux BTF. + */ + char *btf_custom_path; + /* vmlinux BTF override for CO-RE relocations */ + struct btf *btf_vmlinux_override; + /* Lazily initialized kernel module BTFs */ + struct module_btf *btf_modules; + bool btf_modules_loaded; + size_t btf_module_cnt; + size_t btf_module_cap; + + /* optional log settings passed to BPF_BTF_LOAD and BPF_PROG_LOAD commands */ + char *log_buf; + size_t log_size; + __u32 log_level; + + int *fd_array; + size_t fd_array_cap; + size_t fd_array_cnt; + + struct usdt_manager *usdt_man; + + char path[]; +}; + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off); +static const char *elf_sec_str(const struct bpf_object *obj, size_t off); +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx); +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn); +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx); +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); + +void bpf_program__unload(struct bpf_program *prog) +{ + if (!prog) + return; + + zclose(prog->fd); + + zfree(&prog->func_info); + zfree(&prog->line_info); +} + +static void bpf_program__exit(struct bpf_program *prog) +{ + if (!prog) + return; + + bpf_program__unload(prog); + zfree(&prog->name); + zfree(&prog->sec_name); + zfree(&prog->insns); + zfree(&prog->reloc_desc); + + prog->nr_reloc = 0; + prog->insns_cnt = 0; + prog->sec_idx = -1; +} + +static bool insn_is_subprog_call(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == BPF_PSEUDO_CALL && + insn->dst_reg == 0 && + insn->off == 0; +} + +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + +static int +bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, + const char *name, size_t sec_idx, const char *sec_name, + size_t sec_off, void *insn_data, size_t insn_data_sz) +{ + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { + pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", + sec_name, name, sec_off, insn_data_sz); + return -EINVAL; + } + + memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + + prog->sec_idx = sec_idx; + prog->sec_insn_off = sec_off / BPF_INSN_SZ; + prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; + /* insns_cnt can later be increased by appending used subprograms */ + prog->insns_cnt = prog->sec_insn_cnt; + + prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; + + /* libbpf's convention for SEC("?abc...") is that it's just like + * SEC("abc...") but the corresponding bpf_program starts out with + * autoload set to false. + */ + if (sec_name[0] == '?') { + prog->autoload = false; + /* from now on forget there was ? in section name */ + sec_name++; + } else { + prog->autoload = true; + } + + prog->autoattach = true; + + /* inherit object's log_level */ + prog->log_level = obj->log_level; + + prog->sec_name = strdup(sec_name); + if (!prog->sec_name) + goto errout; + + prog->name = strdup(name); + if (!prog->name) + goto errout; + + prog->insns = malloc(insn_data_sz); + if (!prog->insns) + goto errout; + memcpy(prog->insns, insn_data, insn_data_sz); + + return 0; +errout: + pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); + bpf_program__exit(prog); + return -ENOMEM; +} + +static int +bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, + const char *sec_name, int sec_idx) +{ + Elf_Data *symbols = obj->efile.symbols; + struct bpf_program *prog, *progs; + void *data = sec_data->d_buf; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; + const char *name; + Elf64_Sym *sym; + + progs = obj->programs; + nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(Elf64_Sym); + sec_off = 0; + + for (i = 0; i < nr_syms; i++) { + sym = elf_sym_by_idx(obj, i); + + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + continue; + + prog_sz = sym->st_size; + sec_off = sym->st_value; + + name = elf_sym_str(obj, sym->st_name); + if (!name) { + pr_warn("sec '%s': failed to get symbol name for offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_off + prog_sz > sec_sz) { + pr_warn("sec '%s': program at offset %zu crosses section boundary\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_idx != obj->efile.text_shndx && ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { + pr_warn("sec '%s': program '%s' is static and not supported\n", sec_name, name); + return -ENOTSUP; + } + + pr_debug("sec '%s': found program '%s' at insn offset %zu (%zu bytes), code size %zu insns (%zu bytes)\n", + sec_name, name, sec_off / BPF_INSN_SZ, sec_off, prog_sz / BPF_INSN_SZ, prog_sz); + + progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(*progs)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). + */ + pr_warn("sec '%s': failed to alloc memory for new program '%s'\n", + sec_name, name); + return -ENOMEM; + } + obj->programs = progs; + + prog = &progs[nr_progs]; + + err = bpf_object__init_prog(obj, prog, name, sec_idx, sec_name, + sec_off, data + sec_off, prog_sz); + if (err) + return err; + + /* if function is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF FUNC + * as static to enable more permissive BPF verification mode + * with more outside context available to BPF verifier + */ + if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL + && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) + prog->mark_btf_static = true; + + nr_progs++; + obj->nr_programs = nr_progs; + } + + return 0; +} + +__u32 get_kernel_version(void) +{ + /* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, + * but Ubuntu provides /proc/version_signature file, as described at + * https://ubuntu.com/kernel, with an example contents below, which we + * can use to get a proper LINUX_VERSION_CODE. + * + * Ubuntu 5.4.0-12.15-generic 5.4.8 + * + * In the above, 5.4.8 is what kernel is actually expecting, while + * uname() call will return 5.4.0 in info.release. + */ + const char *ubuntu_kver_file = "/proc/version_signature"; + __u32 major, minor, patch; + struct utsname info; + + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) { + FILE *f; + + f = fopen(ubuntu_kver_file, "r"); + if (f) { + if (fscanf(f, "%*s %*s %d.%d.%d\n", &major, &minor, &patch) == 3) { + fclose(f); + return KERNEL_VERSION(major, minor, patch); + } + fclose(f); + } + /* something went wrong, fall back to uname() approach */ + } + + uname(&info); + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + return KERNEL_VERSION(major, minor, patch); +} + +static const struct btf_member * +find_member_by_offset(const struct btf_type *t, __u32 bit_offset) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (btf_member_bit_offset(t, i) == bit_offset) + return m; + } + + return NULL; +} + +static const struct btf_member * +find_member_by_name(const struct btf *btf, const struct btf_type *t, + const char *name) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (!strcmp(btf__name_by_offset(btf, m->name_off), name)) + return m; + } + + return NULL; +} + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind); + +static int +find_struct_ops_kern_types(const struct btf *btf, const char *tname, + const struct btf_type **type, __u32 *type_id, + const struct btf_type **vtype, __u32 *vtype_id, + const struct btf_member **data_member) +{ + const struct btf_type *kern_type, *kern_vtype; + const struct btf_member *kern_data_member; + __s32 kern_vtype_id, kern_type_id; + __u32 i; + + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", + tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + + /* Find the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, + * find "struct bpf_struct_ops_tcp_congestion_ops" from the + * btf_vmlinux. + */ + kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, + tname, BTF_KIND_STRUCT); + if (kern_vtype_id < 0) { + pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", + STRUCT_OPS_VALUE_PREFIX, tname); + return kern_vtype_id; + } + kern_vtype = btf__type_by_id(btf, kern_vtype_id); + + /* Find "struct tcp_congestion_ops" from + * struct bpf_struct_ops_tcp_congestion_ops { + * [ ... ] + * struct tcp_congestion_ops data; + * } + */ + kern_data_member = btf_members(kern_vtype); + for (i = 0; i < btf_vlen(kern_vtype); i++, kern_data_member++) { + if (kern_data_member->type == kern_type_id) + break; + } + if (i == btf_vlen(kern_vtype)) { + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", + tname, STRUCT_OPS_VALUE_PREFIX, tname); + return -EINVAL; + } + + *type = kern_type; + *type_id = kern_type_id; + *vtype = kern_vtype; + *vtype_id = kern_vtype_id; + *data_member = kern_data_member; + + return 0; +} + +static bool bpf_map__is_struct_ops(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_STRUCT_OPS; +} + +/* Init the map's fields that depend on kern_btf */ +static int bpf_map__init_kern_struct_ops(struct bpf_map *map, + const struct btf *btf, + const struct btf *kern_btf) +{ + const struct btf_member *member, *kern_member, *kern_data_member; + const struct btf_type *type, *kern_type, *kern_vtype; + __u32 i, kern_type_id, kern_vtype_id, kern_data_off; + struct bpf_struct_ops *st_ops; + void *data, *kern_data; + const char *tname; + int err; + + st_ops = map->st_ops; + type = st_ops->type; + tname = st_ops->tname; + err = find_struct_ops_kern_types(kern_btf, tname, + &kern_type, &kern_type_id, + &kern_vtype, &kern_vtype_id, + &kern_data_member); + if (err) + return err; + + pr_debug("struct_ops init_kern %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n", + map->name, st_ops->type_id, kern_type_id, kern_vtype_id); + + map->def.value_size = kern_vtype->size; + map->btf_vmlinux_value_type_id = kern_vtype_id; + + st_ops->kern_vdata = calloc(1, kern_vtype->size); + if (!st_ops->kern_vdata) + return -ENOMEM; + + data = st_ops->data; + kern_data_off = kern_data_member->offset / 8; + kern_data = st_ops->kern_vdata + kern_data_off; + + member = btf_members(type); + for (i = 0; i < btf_vlen(type); i++, member++) { + const struct btf_type *mtype, *kern_mtype; + __u32 mtype_id, kern_mtype_id; + void *mdata, *kern_mdata; + __s64 msize, kern_msize; + __u32 moff, kern_moff; + __u32 kern_member_idx; + const char *mname; + + mname = btf__name_by_offset(btf, member->name_off); + kern_member = find_member_by_name(kern_btf, kern_type, mname); + if (!kern_member) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + kern_member_idx = kern_member - btf_members(kern_type); + if (btf_member_bitfield_size(type, i) || + btf_member_bitfield_size(kern_type, kern_member_idx)) { + pr_warn("struct_ops init_kern %s: bitfield %s is not supported\n", + map->name, mname); + return -ENOTSUP; + } + + moff = member->offset / 8; + kern_moff = kern_member->offset / 8; + + mdata = data + moff; + kern_mdata = kern_data + kern_moff; + + mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_member->type, + &kern_mtype_id); + if (BTF_INFO_KIND(mtype->info) != + BTF_INFO_KIND(kern_mtype->info)) { + pr_warn("struct_ops init_kern %s: Unmatched member type %s %u != %u(kernel)\n", + map->name, mname, BTF_INFO_KIND(mtype->info), + BTF_INFO_KIND(kern_mtype->info)); + return -ENOTSUP; + } + + if (btf_is_ptr(mtype)) { + struct bpf_program *prog; + + prog = st_ops->progs[i]; + if (!prog) + continue; + + kern_mtype = skip_mods_and_typedefs(kern_btf, + kern_mtype->type, + &kern_mtype_id); + + /* mtype->type must be a func_proto which was + * guaranteed in bpf_object__collect_st_ops_relos(), + * so only check kern_mtype for func_proto here. + */ + if (!btf_is_func_proto(kern_mtype)) { + pr_warn("struct_ops init_kern %s: kernel member %s is not a func ptr\n", + map->name, mname); + return -ENOTSUP; + } + + prog->attach_btf_id = kern_type_id; + prog->expected_attach_type = kern_member_idx; + + st_ops->kern_func_off[i] = kern_data_off + kern_moff; + + pr_debug("struct_ops init_kern %s: func ptr %s is set to prog %s from data(+%u) to kern_data(+%u)\n", + map->name, mname, prog->name, moff, + kern_moff); + + continue; + } + + msize = btf__resolve_size(btf, mtype_id); + kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); + if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", + map->name, mname, (ssize_t)msize, + (ssize_t)kern_msize); + return -ENOTSUP; + } + + pr_debug("struct_ops init_kern %s: copy %s %u bytes from data(+%u) to kern_data(+%u)\n", + map->name, mname, (unsigned int)msize, + moff, kern_moff); + memcpy(kern_mdata, mdata, msize); + } + + return 0; +} + +static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + size_t i; + int err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!bpf_map__is_struct_ops(map)) + continue; + + err = bpf_map__init_kern_struct_ops(map, obj->btf, + obj->btf_vmlinux); + if (err) + return err; + } + + return 0; +} + +static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) +{ + const struct btf_type *type, *datasec; + const struct btf_var_secinfo *vsi; + struct bpf_struct_ops *st_ops; + const char *tname, *var_name; + __s32 type_id, datasec_id; + const struct btf *btf; + struct bpf_map *map; + __u32 i; + + if (obj->efile.st_ops_shndx == -1) + return 0; + + btf = obj->btf; + datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC, + BTF_KIND_DATASEC); + if (datasec_id < 0) { + pr_warn("struct_ops init: DATASEC %s not found\n", + STRUCT_OPS_SEC); + return -EINVAL; + } + + datasec = btf__type_by_id(btf, datasec_id); + vsi = btf_var_secinfos(datasec); + for (i = 0; i < btf_vlen(datasec); i++, vsi++) { + type = btf__type_by_id(obj->btf, vsi->type); + var_name = btf__name_by_offset(obj->btf, type->name_off); + + type_id = btf__resolve_type(obj->btf, vsi->type); + if (type_id < 0) { + pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", + vsi->type, STRUCT_OPS_SEC); + return -EINVAL; + } + + type = btf__type_by_id(obj->btf, type_id); + tname = btf__name_by_offset(obj->btf, type->name_off); + if (!tname[0]) { + pr_warn("struct_ops init: anonymous type is not supported\n"); + return -ENOTSUP; + } + if (!btf_is_struct(type)) { + pr_warn("struct_ops init: %s is not a struct\n", tname); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->sec_idx = obj->efile.st_ops_shndx; + map->sec_offset = vsi->offset; + map->name = strdup(var_name); + if (!map->name) + return -ENOMEM; + + map->def.type = BPF_MAP_TYPE_STRUCT_OPS; + map->def.key_size = sizeof(int); + map->def.value_size = type->size; + map->def.max_entries = 1; + + map->st_ops = calloc(1, sizeof(*map->st_ops)); + if (!map->st_ops) + return -ENOMEM; + st_ops = map->st_ops; + st_ops->data = malloc(type->size); + st_ops->progs = calloc(btf_vlen(type), sizeof(*st_ops->progs)); + st_ops->kern_func_off = malloc(btf_vlen(type) * + sizeof(*st_ops->kern_func_off)); + if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) + return -ENOMEM; + + if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) { + pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", + var_name, STRUCT_OPS_SEC); + return -EINVAL; + } + + memcpy(st_ops->data, + obj->efile.st_ops_data->d_buf + vsi->offset, + type->size); + st_ops->tname = tname; + st_ops->type = type; + st_ops->type_id = type_id; + + pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", + tname, type_id, var_name, vsi->offset); + } + + return 0; +} + +static struct bpf_object *bpf_object__new(const char *path, + const void *obj_buf, + size_t obj_buf_sz, + const char *obj_name) +{ + struct bpf_object *obj; + char *end; + + obj = calloc(1, sizeof(struct bpf_object) + strlen(path) + 1); + if (!obj) { + pr_warn("alloc memory failed for %s\n", path); + return ERR_PTR(-ENOMEM); + } + + strcpy(obj->path, path); + if (obj_name) { + libbpf_strlcpy(obj->name, obj_name, sizeof(obj->name)); + } else { + /* Using basename() GNU version which doesn't modify arg. */ + libbpf_strlcpy(obj->name, basename((void *)path), sizeof(obj->name)); + end = strchr(obj->name, '.'); + if (end) + *end = 0; + } + + obj->efile.fd = -1; + /* + * Caller of this function should also call + * bpf_object__elf_finish() after data collection to return + * obj_buf to user. If not, we should duplicate the buffer to + * avoid user freeing them before elf finish. + */ + obj->efile.obj_buf = obj_buf; + obj->efile.obj_buf_sz = obj_buf_sz; + obj->efile.btf_maps_shndx = -1; + obj->efile.st_ops_shndx = -1; + obj->kconfig_map_idx = -1; + + obj->kern_version = get_kernel_version(); + obj->loaded = false; + + return obj; +} + +static void bpf_object__elf_finish(struct bpf_object *obj) +{ + if (!obj->efile.elf) + return; + + elf_end(obj->efile.elf); + obj->efile.elf = NULL; + obj->efile.symbols = NULL; + obj->efile.st_ops_data = NULL; + + zfree(&obj->efile.secs); + obj->efile.sec_cnt = 0; + zclose(obj->efile.fd); + obj->efile.obj_buf = NULL; + obj->efile.obj_buf_sz = 0; +} + +static int bpf_object__elf_init(struct bpf_object *obj) +{ + Elf64_Ehdr *ehdr; + int err = 0; + Elf *elf; + + if (obj->efile.elf) { + pr_warn("elf: init internal error\n"); + return -LIBBPF_ERRNO__LIBELF; + } + + if (obj->efile.obj_buf_sz > 0) { + /* obj_buf should have been validated by bpf_object__open_mem(). */ + elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); + } else { + obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); + if (obj->efile.fd < 0) { + char errmsg[STRERR_BUFSIZE], *cp; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("elf: failed to open %s: %s\n", obj->path, cp); + return err; + } + + elf = elf_begin(obj->efile.fd, ELF_C_READ_MMAP, NULL); + } + + if (!elf) { + pr_warn("elf: failed to open %s as ELF file: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__LIBELF; + goto errout; + } + + obj->efile.elf = elf; + + if (elf_kind(elf) != ELF_K_ELF) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a proper ELF object\n", obj->path); + goto errout; + } + + if (gelf_getclass(elf) != ELFCLASS64) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a 64-bit ELF object\n", obj->path); + goto errout; + } + + obj->efile.ehdr = ehdr = elf64_getehdr(elf); + if (!obj->efile.ehdr) { + pr_warn("elf: failed to get ELF header from %s: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + if (elf_getshdrstrndx(elf, &obj->efile.shstrndx)) { + pr_warn("elf: failed to get section names section index for %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* Elf is corrupted/truncated, avoid calling elf_strptr. */ + if (!elf_rawdata(elf_getscn(elf, obj->efile.shstrndx), NULL)) { + pr_warn("elf: failed to get section names strings from %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* Old LLVM set e_machine to EM_NONE */ + if (ehdr->e_type != ET_REL || (ehdr->e_machine && ehdr->e_machine != EM_BPF)) { + pr_warn("elf: %s is not a valid eBPF object file\n", obj->path); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + return 0; +errout: + bpf_object__elf_finish(obj); + return err; +} + +static int bpf_object__check_endianness(struct bpf_object *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + return 0; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + return 0; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + pr_warn("elf: endianness mismatch in %s.\n", obj->path); + return -LIBBPF_ERRNO__ENDIAN; +} + +static int +bpf_object__init_license(struct bpf_object *obj, void *data, size_t size) +{ + if (!data) { + pr_warn("invalid license section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + /* libbpf_strlcpy() only copies first N - 1 bytes, so size + 1 won't + * go over allowed ELF data section buffer + */ + libbpf_strlcpy(obj->license, data, min(size + 1, sizeof(obj->license))); + pr_debug("license of %s is %s\n", obj->path, obj->license); + return 0; +} + +static int +bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) +{ + __u32 kver; + + if (!data || size != sizeof(kver)) { + pr_warn("invalid kver section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + memcpy(&kver, data, sizeof(kver)); + obj->kern_version = kver; + pr_debug("kernel version of %s is %x\n", obj->path, obj->kern_version); + return 0; +} + +static bool bpf_map_type__is_map_in_map(enum bpf_map_type type) +{ + if (type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + type == BPF_MAP_TYPE_HASH_OF_MAPS) + return true; + return false; +} + +static int find_elf_sec_sz(const struct bpf_object *obj, const char *name, __u32 *size) +{ + Elf_Data *data; + Elf_Scn *scn; + + if (!name) + return -EINVAL; + + scn = elf_sec_by_name(obj, name); + data = elf_sec_data(obj, scn); + if (data) { + *size = data->d_size; + return 0; /* found it */ + } + + return -ENOENT; +} + +static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *name) +{ + Elf_Data *symbols = obj->efile.symbols; + const char *sname; + size_t si; + + for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, si); + + if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT) + continue; + + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && + ELF64_ST_BIND(sym->st_info) != STB_WEAK) + continue; + + sname = elf_sym_str(obj, sym->st_name); + if (!sname) { + pr_warn("failed to get sym name string for var %s\n", name); + return ERR_PTR(-EIO); + } + if (strcmp(name, sname) == 0) + return sym; + } + + return ERR_PTR(-ENOENT); +} + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int err; + + err = libbpf_ensure_mem((void **)&obj->maps, &obj->maps_cap, + sizeof(*obj->maps), obj->nr_maps + 1); + if (err) + return ERR_PTR(err); + + map = &obj->maps[obj->nr_maps++]; + map->obj = obj; + map->fd = -1; + map->inner_map_fd = -1; + map->autocreate = true; + + return map; +} + +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = (size_t)roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + +static char *internal_map_name(struct bpf_object *obj, const char *real_name) +{ + char map_name[BPF_OBJ_NAME_LEN], *p; + int pfx_len, sfx_len = max((size_t)7, strlen(real_name)); + + /* This is one of the more confusing parts of libbpf for various + * reasons, some of which are historical. The original idea for naming + * internal names was to include as much of BPF object name prefix as + * possible, so that it can be distinguished from similar internal + * maps of a different BPF object. + * As an example, let's say we have bpf_object named 'my_object_name' + * and internal map corresponding to '.rodata' ELF section. The final + * map name advertised to user and to the kernel will be + * 'my_objec.rodata', taking first 8 characters of object name and + * entire 7 characters of '.rodata'. + * Somewhat confusingly, if internal map ELF section name is shorter + * than 7 characters, e.g., '.bss', we still reserve 7 characters + * for the suffix, even though we only have 4 actual characters, and + * resulting map will be called 'my_objec.bss', not even using all 15 + * characters allowed by the kernel. Oh well, at least the truncated + * object name is somewhat consistent in this case. But if the map + * name is '.kconfig', we'll still have entirety of '.kconfig' added + * (8 chars) and thus will be left with only first 7 characters of the + * object name ('my_obje'). Happy guessing, user, that the final map + * name will be "my_obje.kconfig". + * Now, with libbpf starting to support arbitrarily named .rodata.* + * and .data.* data sections, it's possible that ELF section name is + * longer than allowed 15 chars, so we now need to be careful to take + * only up to 15 first characters of ELF name, taking no BPF object + * name characters at all. So '.rodata.abracadabra' will result in + * '.rodata.abracad' kernel and user-visible name. + * We need to keep this convoluted logic intact for .data, .bss and + * .rodata maps, but for new custom .data.custom and .rodata.custom + * maps we use their ELF names as is, not prepending bpf_object name + * in front. We still need to truncate them to 15 characters for the + * kernel. Full name can be recovered for such maps by using DATASEC + * BTF type associated with such map's value type, though. + */ + if (sfx_len >= BPF_OBJ_NAME_LEN) + sfx_len = BPF_OBJ_NAME_LEN - 1; + + /* if there are two or more dots in map name, it's a custom dot map */ + if (strchr(real_name + 1, '.') != NULL) + pfx_len = 0; + else + pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, strlen(obj->name)); + + snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, + sfx_len, real_name); + + /* sanitise map name to characters allowed by kernel */ + for (p = map_name; *p && p < map_name + sizeof(map_name); p++) + if (!isalnum(*p) && *p != '_' && *p != '.') + *p = '_'; + + return strdup(map_name); +} + +static int +map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map); + +/* Internal BPF map is mmap()'able only if at least one of corresponding + * DATASEC's VARs are to be exposed through BPF skeleton. I.e., it's a GLOBAL + * variable and it's not marked as __hidden (which turns it into, effectively, + * a STATIC variable). + */ +static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map) +{ + const struct btf_type *t, *vt; + struct btf_var_secinfo *vsi; + int i, n; + + if (!map->btf_value_type_id) + return false; + + t = btf__type_by_id(obj->btf, map->btf_value_type_id); + if (!btf_is_datasec(t)) + return false; + + vsi = btf_var_secinfos(t); + for (i = 0, n = btf_vlen(t); i < n; i++, vsi++) { + vt = btf__type_by_id(obj->btf, vsi->type); + if (!btf_is_var(vt)) + continue; + + if (btf_var(vt)->linkage != BTF_VAR_STATIC) + return true; + } + + return false; +} + +static int +bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, + const char *real_name, int sec_idx, void *data, size_t data_sz) +{ + struct bpf_map_def *def; + struct bpf_map *map; + int err; + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->libbpf_type = type; + map->sec_idx = sec_idx; + map->sec_offset = 0; + map->real_name = strdup(real_name); + map->name = internal_map_name(obj, real_name); + if (!map->real_name || !map->name) { + zfree(&map->real_name); + zfree(&map->name); + return -ENOMEM; + } + + def = &map->def; + def->type = BPF_MAP_TYPE_ARRAY; + def->key_size = sizeof(int); + def->value_size = data_sz; + def->max_entries = 1; + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG + ? BPF_F_RDONLY_PROG : 0; + + /* failures are fine because of maps like .rodata.str1.1 */ + (void) map_fill_btf_type_info(obj, map); + + if (map_is_mmapable(obj, map)) + def->map_flags |= BPF_F_MMAPABLE; + + pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", + map->name, map->sec_idx, map->sec_offset, def->map_flags); + + map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("failed to alloc map '%s' content buffer: %d\n", + map->name, err); + zfree(&map->real_name); + zfree(&map->name); + return err; + } + + if (data) + memcpy(map->mmaped, data, data_sz); + + pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); + return 0; +} + +static int bpf_object__init_global_data_maps(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + const char *sec_name; + int err = 0, sec_idx; + + /* + * Populate obj->maps with libbpf internal maps. + */ + for (sec_idx = 1; sec_idx < obj->efile.sec_cnt; sec_idx++) { + sec_desc = &obj->efile.secs[sec_idx]; + + /* Skip recognized sections with size 0. */ + if (!sec_desc->data || sec_desc->data->d_size == 0) + continue; + + switch (sec_desc->sec_type) { + case SEC_DATA: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_RODATA: + obj->has_rodata = true; + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_BSS: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, + sec_name, sec_idx, + NULL, + sec_desc->data->d_size); + break; + default: + /* skip */ + break; + } + if (err) + return err; + } + return 0; +} + + +static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, + const void *name) +{ + int i; + + for (i = 0; i < obj->nr_extern; i++) { + if (strcmp(obj->externs[i].name, name) == 0) + return &obj->externs[i]; + } + return NULL; +} + +static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, + char value) +{ + switch (ext->kcfg.type) { + case KCFG_BOOL: + if (value == 'm') { + pr_warn("extern (kcfg) '%s': value '%c' implies tristate or char type\n", + ext->name, value); + return -EINVAL; + } + *(bool *)ext_val = value == 'y' ? true : false; + break; + case KCFG_TRISTATE: + if (value == 'y') + *(enum libbpf_tristate *)ext_val = TRI_YES; + else if (value == 'm') + *(enum libbpf_tristate *)ext_val = TRI_MODULE; + else /* value == 'n' */ + *(enum libbpf_tristate *)ext_val = TRI_NO; + break; + case KCFG_CHAR: + *(char *)ext_val = value; + break; + case KCFG_UNKNOWN: + case KCFG_INT: + case KCFG_CHAR_ARR: + default: + pr_warn("extern (kcfg) '%s': value '%c' implies bool, tristate, or char type\n", + ext->name, value); + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, + const char *value) +{ + size_t len; + + if (ext->kcfg.type != KCFG_CHAR_ARR) { + pr_warn("extern (kcfg) '%s': value '%s' implies char array type\n", + ext->name, value); + return -EINVAL; + } + + len = strlen(value); + if (value[len - 1] != '"') { + pr_warn("extern (kcfg) '%s': invalid string config '%s'\n", + ext->name, value); + return -EINVAL; + } + + /* strip quotes */ + len -= 2; + if (len >= ext->kcfg.sz) { + pr_warn("extern (kcfg) '%s': long string '%s' of (%zu bytes) truncated to %d bytes\n", + ext->name, value, len, ext->kcfg.sz - 1); + len = ext->kcfg.sz - 1; + } + memcpy(ext_val, value + 1, len); + ext_val[len] = '\0'; + ext->is_set = true; + return 0; +} + +static int parse_u64(const char *value, __u64 *res) +{ + char *value_end; + int err; + + errno = 0; + *res = strtoull(value, &value_end, 0); + if (errno) { + err = -errno; + pr_warn("failed to parse '%s' as integer: %d\n", value, err); + return err; + } + if (*value_end) { + pr_warn("failed to parse '%s' as integer completely\n", value); + return -EINVAL; + } + return 0; +} + +static bool is_kcfg_value_in_range(const struct extern_desc *ext, __u64 v) +{ + int bit_sz = ext->kcfg.sz * 8; + + if (ext->kcfg.sz == 8) + return true; + + /* Validate that value stored in u64 fits in integer of `ext->sz` + * bytes size without any loss of information. If the target integer + * is signed, we rely on the following limits of integer type of + * Y bits and subsequent transformation: + * + * -2^(Y-1) <= X <= 2^(Y-1) - 1 + * 0 <= X + 2^(Y-1) <= 2^Y - 1 + * 0 <= X + 2^(Y-1) < 2^Y + * + * For unsigned target integer, check that all the (64 - Y) bits are + * zero. + */ + if (ext->kcfg.is_signed) + return v + (1ULL << (bit_sz - 1)) < (1ULL << bit_sz); + else + return (v >> bit_sz) == 0; +} + +static int set_kcfg_value_num(struct extern_desc *ext, void *ext_val, + __u64 value) +{ + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR && + ext->kcfg.type != KCFG_BOOL) { + pr_warn("extern (kcfg) '%s': value '%llu' implies integer, char, or boolean type\n", + ext->name, (unsigned long long)value); + return -EINVAL; + } + if (ext->kcfg.type == KCFG_BOOL && value > 1) { + pr_warn("extern (kcfg) '%s': value '%llu' isn't boolean compatible\n", + ext->name, (unsigned long long)value); + return -EINVAL; + + } + if (!is_kcfg_value_in_range(ext, value)) { + pr_warn("extern (kcfg) '%s': value '%llu' doesn't fit in %d bytes\n", + ext->name, (unsigned long long)value, ext->kcfg.sz); + return -ERANGE; + } + switch (ext->kcfg.sz) { + case 1: + *(__u8 *)ext_val = value; + break; + case 2: + *(__u16 *)ext_val = value; + break; + case 4: + *(__u32 *)ext_val = value; + break; + case 8: + *(__u64 *)ext_val = value; + break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int bpf_object__process_kconfig_line(struct bpf_object *obj, + char *buf, void *data) +{ + struct extern_desc *ext; + char *sep, *value; + int len, err = 0; + void *ext_val; + __u64 num; + + if (!str_has_pfx(buf, "CONFIG_")) + return 0; + + sep = strchr(buf, '='); + if (!sep) { + pr_warn("failed to parse '%s': no separator\n", buf); + return -EINVAL; + } + + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) { + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + return -EINVAL; + } + + ext = find_extern_by_name(obj, buf); + if (!ext || ext->is_set) + return 0; + + ext_val = data + ext->kcfg.data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_kcfg_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_kcfg_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern (kcfg) '%s': value '%s' isn't a valid integer\n", ext->name, value); + return err; + } + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { + pr_warn("extern (kcfg) '%s': value '%s' implies integer type\n", ext->name, value); + return -EINVAL; + } + err = set_kcfg_value_num(ext, ext_val, num); + break; + } + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to %s\n", ext->name, value); + return 0; +} + +static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) +{ + char buf[PATH_MAX]; + struct utsname uts; + int len, err = 0; + gzFile file; + + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + + /* gzopen also accepts uncompressed files. */ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); + + if (!file) { + pr_warn("failed to open system Kconfig\n"); + return -ENOENT; + } + + while (gzgets(file, buf, sizeof(buf))) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing system Kconfig line '%s': %d\n", + buf, err); + goto out; + } + } + +out: + gzclose(file); + return err; +} + +static int bpf_object__read_kconfig_mem(struct bpf_object *obj, + const char *config, void *data) +{ + char buf[PATH_MAX]; + int err = 0; + FILE *file; + + file = fmemopen((void *)config, strlen(config), "r"); + if (!file) { + err = -errno; + pr_warn("failed to open in-memory Kconfig: %d\n", err); + return err; + } + + while (fgets(buf, sizeof(buf), file)) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing in-memory Kconfig line '%s': %d\n", + buf, err); + break; + } + } + + fclose(file); + return err; +} + +static int bpf_object__init_kconfig_map(struct bpf_object *obj) +{ + struct extern_desc *last_ext = NULL, *ext; + size_t map_sz; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG) + last_ext = ext; + } + + if (!last_ext) + return 0; + + map_sz = last_ext->kcfg.data_off + last_ext->kcfg.sz; + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_KCONFIG, + ".kconfig", obj->efile.symbols_shndx, + NULL, map_sz); + if (err) + return err; + + obj->kconfig_map_idx = obj->nr_maps - 1; + + return 0; +} + +const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + + if (res_id) + *res_id = id; + + while (btf_is_mod(t) || btf_is_typedef(t)) { + if (res_id) + *res_id = t->type; + t = btf__type_by_id(btf, t->type); + } + + return t; +} + +static const struct btf_type * +resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t; + + t = skip_mods_and_typedefs(btf, id, NULL); + if (!btf_is_ptr(t)) + return NULL; + + t = skip_mods_and_typedefs(btf, t->type, res_id); + + return btf_is_func_proto(t) ? t : NULL; +} + +static const char *__btf_kind_str(__u16 kind) +{ + switch (kind) { + case BTF_KIND_UNKN: return "void"; + case BTF_KIND_INT: return "int"; + case BTF_KIND_PTR: return "ptr"; + case BTF_KIND_ARRAY: return "array"; + case BTF_KIND_STRUCT: return "struct"; + case BTF_KIND_UNION: return "union"; + case BTF_KIND_ENUM: return "enum"; + case BTF_KIND_FWD: return "fwd"; + case BTF_KIND_TYPEDEF: return "typedef"; + case BTF_KIND_VOLATILE: return "volatile"; + case BTF_KIND_CONST: return "const"; + case BTF_KIND_RESTRICT: return "restrict"; + case BTF_KIND_FUNC: return "func"; + case BTF_KIND_FUNC_PROTO: return "func_proto"; + case BTF_KIND_VAR: return "var"; + case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; + case BTF_KIND_DECL_TAG: return "decl_tag"; + case BTF_KIND_TYPE_TAG: return "type_tag"; + case BTF_KIND_ENUM64: return "enum64"; + default: return "unknown"; + } +} + +const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + +/* + * Fetch integer attribute of BTF map definition. Such attributes are + * represented using a pointer to an array, in which dimensionality of array + * encodes specified integer value. E.g., int (*type)[BPF_MAP_TYPE_ARRAY]; + * encodes `type => BPF_MAP_TYPE_ARRAY` key/value pair completely using BTF + * type definition, while using only sizeof(void *) space in ELF data section. + */ +static bool get_map_field_int(const char *map_name, const struct btf *btf, + const struct btf_member *m, __u32 *res) +{ + const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); + const char *name = btf__name_by_offset(btf, m->name_off); + const struct btf_array *arr_info; + const struct btf_type *arr_t; + + if (!btf_is_ptr(t)) { + pr_warn("map '%s': attr '%s': expected PTR, got %s.\n", + map_name, name, btf_kind_str(t)); + return false; + } + + arr_t = btf__type_by_id(btf, t->type); + if (!arr_t) { + pr_warn("map '%s': attr '%s': type [%u] not found.\n", + map_name, name, t->type); + return false; + } + if (!btf_is_array(arr_t)) { + pr_warn("map '%s': attr '%s': expected ARRAY, got %s.\n", + map_name, name, btf_kind_str(arr_t)); + return false; + } + arr_info = btf_array(arr_t); + *res = arr_info->nelems; + return true; +} + +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} + +static int build_map_pin_path(struct bpf_map *map, const char *path) +{ + char buf[PATH_MAX]; + int err; + + if (!path) + path = "/sys/fs/bpf"; + + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; + + return bpf_map__set_pin_path(map, buf); +} + +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) +{ + const struct btf_type *t; + const struct btf_member *m; + bool is_inner = inner_def == NULL; + int vlen, i; + + vlen = btf_vlen(def_t); + m = btf_members(def_t); + for (i = 0; i < vlen; i++, m++) { + const char *name = btf__name_by_offset(btf, m->name_off); + + if (!name) { + pr_warn("map '%s': invalid field #%d.\n", map_name, i); + return -EINVAL; + } + if (strcmp(name, "type") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_TYPE; + } else if (strcmp(name, "max_entries") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAX_ENTRIES; + } else if (strcmp(name, "map_flags") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_FLAGS; + } else if (strcmp(name, "numa_node") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) + return -EINVAL; + map_def->parts |= MAP_DEF_NUMA_NODE; + } else if (strcmp(name, "key_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %u.\n", + map_name, map_def->key_size, sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; + } else if (strcmp(name, "key") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': key type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if (!btf_is_ptr(t)) { + pr_warn("map '%s': key spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map_def->key_size, (ssize_t)sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; + } else if (strcmp(name, "value_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %u.\n", + map_name, map_def->value_size, sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; + } else if (strcmp(name, "value") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': value type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if (!btf_is_ptr(t)) { + pr_warn("map '%s': value spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map_def->value_size, (ssize_t)sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; + } + else if (strcmp(name, "values") == 0) { + bool is_map_in_map = bpf_map_type__is_map_in_map(map_def->map_type); + bool is_prog_array = map_def->map_type == BPF_MAP_TYPE_PROG_ARRAY; + const char *desc = is_map_in_map ? "map-in-map inner" : "prog-array value"; + char inner_map_name[128]; + int err; + + if (is_inner) { + pr_warn("map '%s': multi-level inner maps not supported.\n", + map_name); + return -ENOTSUP; + } + if (i != vlen - 1) { + pr_warn("map '%s': '%s' member should be last.\n", + map_name, name); + return -EINVAL; + } + if (!is_map_in_map && !is_prog_array) { + pr_warn("map '%s': should be map-in-map or prog-array.\n", + map_name); + return -ENOTSUP; + } + if (map_def->value_size && map_def->value_size != 4) { + pr_warn("map '%s': conflicting value size %u != 4.\n", + map_name, map_def->value_size); + return -EINVAL; + } + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': %s type [%d] not found.\n", + map_name, desc, m->type); + return -EINVAL; + } + if (!btf_is_array(t) || btf_array(t)->nelems) { + pr_warn("map '%s': %s spec is not a zero-sized array.\n", + map_name, desc); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); + if (!btf_is_ptr(t)) { + pr_warn("map '%s': %s def is of unexpected kind %s.\n", + map_name, desc, btf_kind_str(t)); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, t->type, NULL); + if (is_prog_array) { + if (!btf_is_func_proto(t)) { + pr_warn("map '%s': prog-array value def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + continue; + } + if (!btf_is_struct(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); + if (err) + return err; + + map_def->parts |= MAP_DEF_INNER_MAP; + } else if (strcmp(name, "pinning") == 0) { + __u32 val; + + if (is_inner) { + pr_warn("map '%s': inner def can't be pinned.\n", map_name); + return -EINVAL; + } + if (!get_map_field_int(map_name, btf, m, &val)) + return -EINVAL; + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { + pr_warn("map '%s': invalid pinning value %u.\n", + map_name, val); + return -EINVAL; + } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; + } else if (strcmp(name, "map_extra") == 0) { + __u32 map_extra; + + if (!get_map_field_int(map_name, btf, m, &map_extra)) + return -EINVAL; + map_def->map_extra = map_extra; + map_def->parts |= MAP_DEF_MAP_EXTRA; + } else { + if (strict) { + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); + return -ENOTSUP; + } + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); + } + } + + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); + return -EINVAL; + } + + return 0; +} + +static size_t adjust_ringbuf_sz(size_t sz) +{ + __u32 page_sz = sysconf(_SC_PAGE_SIZE); + __u32 mul; + + /* if user forgot to set any size, make sure they see error */ + if (sz == 0) + return 0; + /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be + * a power-of-2 multiple of kernel's page size. If user diligently + * satisified these conditions, pass the size through. + */ + if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) + return sz; + + /* Otherwise find closest (page_sz * power_of_2) product bigger than + * user-set size to satisfy both user size request and kernel + * requirements and substitute correct max_entries for map creation. + */ + for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { + if (mul * page_sz > sz) + return mul * page_sz; + } + + /* if it's impossible to satisfy the conditions (i.e., user size is + * very close to UINT_MAX but is not a power-of-2 multiple of + * page_size) then just return original size and let kernel reject it + */ + return sz; +} + +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + map->map_extra = def->map_extra; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = 0x%x.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_MAP_EXTRA) + pr_debug("map '%s': found map_extra = 0x%llx.\n", map->name, + (unsigned long long)def->map_extra); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: return "global"; + case BTF_VAR_GLOBAL_EXTERN: return "extern"; + default: return "unknown"; + } +} + +static int bpf_object__init_user_btf_map(struct bpf_object *obj, + const struct btf_type *sec, + int var_idx, int sec_idx, + const Elf_Data *data, bool strict, + const char *pin_root_path) +{ + struct btf_map_def map_def = {}, inner_def = {}; + const struct btf_type *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const char *map_name; + struct bpf_map *map; + int err; + + vi = btf_var_secinfos(sec) + var_idx; + var = btf__type_by_id(obj->btf, vi->type); + var_extra = btf_var(var); + map_name = btf__name_by_offset(obj->btf, var->name_off); + + if (map_name == NULL || map_name[0] == '\0') { + pr_warn("map #%d: empty name.\n", var_idx); + return -EINVAL; + } + if ((__u64)vi->offset + vi->size > data->d_size) { + pr_warn("map '%s' BTF data is corrupted.\n", map_name); + return -EINVAL; + } + if (!btf_is_var(var)) { + pr_warn("map '%s': unexpected var kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + pr_warn("map '%s': unsupported map linkage %s.\n", + map_name, btf_var_linkage_str(var_extra->linkage)); + return -EOPNOTSUPP; + } + + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (!btf_is_struct(def)) { + pr_warn("map '%s': unexpected def kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (def->size > vi->size) { + pr_warn("map '%s': invalid def size.\n", map_name); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + map->name = strdup(map_name); + if (!map->name) { + pr_warn("map '%s': failed to alloc map name.\n", map_name); + return -ENOMEM; + } + map->libbpf_type = LIBBPF_MAP_UNSPEC; + map->def.type = BPF_MAP_TYPE_UNSPEC; + map->sec_idx = sec_idx; + map->sec_offset = vi->offset; + map->btf_var_idx = var_idx; + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", + map_name, map->sec_idx, map->sec_offset); + + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + err = map_fill_btf_type_info(obj, map); + if (err) + return err; + + return 0; +} + +static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, + const char *pin_root_path) +{ + const struct btf_type *sec = NULL; + int nr_types, i, vlen, err; + const struct btf_type *t; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + + if (obj->efile.btf_maps_shndx < 0) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx); + data = elf_sec_data(obj, scn); + if (!scn || !data) { + pr_warn("elf: failed to get %s map definitions for %s\n", + MAPS_ELF_SEC, obj->path); + return -EINVAL; + } + + nr_types = btf__type_cnt(obj->btf); + for (i = 1; i < nr_types; i++) { + t = btf__type_by_id(obj->btf, i); + if (!btf_is_datasec(t)) + continue; + name = btf__name_by_offset(obj->btf, t->name_off); + if (strcmp(name, MAPS_ELF_SEC) == 0) { + sec = t; + obj->efile.btf_maps_sec_btf_id = i; + break; + } + } + + if (!sec) { + pr_warn("DATASEC '%s' not found.\n", MAPS_ELF_SEC); + return -ENOENT; + } + + vlen = btf_vlen(sec); + for (i = 0; i < vlen; i++) { + err = bpf_object__init_user_btf_map(obj, sec, i, + obj->efile.btf_maps_shndx, + data, strict, + pin_root_path); + if (err) + return err; + } + + return 0; +} + +static int bpf_object__init_maps(struct bpf_object *obj, + const struct bpf_object_open_opts *opts) +{ + const char *pin_root_path; + bool strict; + int err = 0; + + strict = !OPTS_GET(opts, relaxed_maps, false); + pin_root_path = OPTS_GET(opts, pin_root_path, NULL); + + err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = err ?: bpf_object__init_global_data_maps(obj); + err = err ?: bpf_object__init_kconfig_map(obj); + err = err ?: bpf_object__init_struct_ops_maps(obj); + + return err; +} + +static bool section_have_execinstr(struct bpf_object *obj, int idx) +{ + Elf64_Shdr *sh; + + sh = elf_sec_hdr(obj, elf_sec_by_idx(obj, idx)); + if (!sh) + return false; + + return sh->sh_flags & SHF_EXECINSTR; +} + +static bool btf_needs_sanitization(struct bpf_object *obj) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + + return !has_func || !has_datasec || !has_func_global || !has_float || + !has_decl_tag || !has_type_tag || !has_enum64; +} + +static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + int enum64_placeholder_id = 0; + struct btf_type *t; + int i, j, vlen; + + for (i = 1; i < btf__type_cnt(btf); i++) { + t = (struct btf_type *)btf__type_by_id(btf, i); + + if ((!has_datasec && btf_is_var(t)) || (!has_decl_tag && btf_is_decl_tag(t))) { + /* replace VAR/DECL_TAG with INT */ + t->info = BTF_INFO_ENC(BTF_KIND_INT, 0, 0); + /* + * using size = 1 is the safest choice, 4 will be too + * big and cause kernel BTF validation failure if + * original variable took less than 4 bytes + */ + t->size = 1; + *(int *)(t + 1) = BTF_INT_ENC(0, 0, 8); + } else if (!has_datasec && btf_is_datasec(t)) { + /* replace DATASEC with STRUCT */ + const struct btf_var_secinfo *v = btf_var_secinfos(t); + struct btf_member *m = btf_members(t); + struct btf_type *vt; + char *name; + + name = (char *)btf__name_by_offset(btf, t->name_off); + while (*name) { + if (*name == '.') + *name = '_'; + name++; + } + + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, vlen); + for (j = 0; j < vlen; j++, v++, m++) { + /* order of field assignments is important */ + m->offset = v->offset * 8; + m->type = v->type; + /* preserve variable name as member name */ + vt = (void *)btf__type_by_id(btf, v->type); + m->name_off = vt->name_off; + } + } else if (!has_func && btf_is_func_proto(t)) { + /* replace FUNC_PROTO with ENUM */ + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_ENUM, 0, vlen); + t->size = sizeof(__u32); /* kernel enforced */ + } else if (!has_func && btf_is_func(t)) { + /* replace FUNC with TYPEDEF */ + t->info = BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0); + } else if (!has_func_global && btf_is_func(t)) { + /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ + t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. "float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); + } else if (!has_type_tag && btf_is_type_tag(t)) { + /* replace TYPE_TAG with a CONST */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0); + } else if (!has_enum64 && btf_is_enum(t)) { + /* clear the kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t), false); + } else if (!has_enum64 && btf_is_enum64(t)) { + /* replace ENUM64 with a union */ + struct btf_member *m; + + if (enum64_placeholder_id == 0) { + enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); + if (enum64_placeholder_id < 0) + return enum64_placeholder_id; + + t = (struct btf_type *)btf__type_by_id(btf, i); + } + + m = btf_members(t); + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_UNION, 0, vlen); + for (j = 0; j < vlen; j++, m++) { + m->type = enum64_placeholder_id; + m->offset = 0; + } + } + } + + return 0; +} + +static bool libbpf_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.btf_maps_shndx >= 0 || + obj->efile.st_ops_shndx >= 0 || + obj->nr_extern > 0; +} + +static bool kernel_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.st_ops_shndx >= 0; +} + +static int bpf_object__init_btf(struct bpf_object *obj, + Elf_Data *btf_data, + Elf_Data *btf_ext_data) +{ + int err = -ENOENT; + + if (btf_data) { + obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); + err = libbpf_get_error(obj->btf); + if (err) { + obj->btf = NULL; + pr_warn("Error loading ELF section %s: %d.\n", BTF_ELF_SEC, err); + goto out; + } + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + } + if (btf_ext_data) { + struct btf_ext_info *ext_segs[3]; + int seg_num, sec_num; + + if (!obj->btf) { + pr_debug("Ignore ELF section %s because its depending ELF section %s is not found.\n", + BTF_EXT_ELF_SEC, BTF_ELF_SEC); + goto out; + } + obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("Error loading ELF section %s: %d. Ignored and continue.\n", + BTF_EXT_ELF_SEC, err); + obj->btf_ext = NULL; + goto out; + } + + /* setup .BTF.ext to ELF section mapping */ + ext_segs[0] = &obj->btf_ext->func_info; + ext_segs[1] = &obj->btf_ext->line_info; + ext_segs[2] = &obj->btf_ext->core_relo_info; + for (seg_num = 0; seg_num < ARRAY_SIZE(ext_segs); seg_num++) { + struct btf_ext_info *seg = ext_segs[seg_num]; + const struct btf_ext_info_sec *sec; + const char *sec_name; + Elf_Scn *scn; + + if (seg->sec_cnt == 0) + continue; + + seg->sec_idxs = calloc(seg->sec_cnt, sizeof(*seg->sec_idxs)); + if (!seg->sec_idxs) { + err = -ENOMEM; + goto out; + } + + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + /* preventively increment index to avoid doing + * this before every continue below + */ + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) + continue; + scn = elf_sec_by_name(obj, sec_name); + if (!scn) + continue; + + seg->sec_idxs[sec_num - 1] = elf_ndxscn(scn); + } + } + } +out: + if (err && libbpf_needs_btf(obj)) { + pr_warn("BTF is required, but is missing or corrupted.\n"); + return err; + } + return 0; +} + +static int compare_vsi_off(const void *_a, const void *_b) +{ + const struct btf_var_secinfo *a = _a; + const struct btf_var_secinfo *b = _b; + + return a->offset - b->offset; +} + +static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, + struct btf_type *t) +{ + __u32 size = 0, i, vars = btf_vlen(t); + const char *sec_name = btf__name_by_offset(btf, t->name_off); + struct btf_var_secinfo *vsi; + bool fixup_offsets = false; + int err; + + if (!sec_name) { + pr_debug("No name found in string section for DATASEC kind.\n"); + return -ENOENT; + } + + /* Extern-backing datasecs (.ksyms, .kconfig) have their size and + * variable offsets set at the previous step. Further, not every + * extern BTF VAR has corresponding ELF symbol preserved, so we skip + * all fixups altogether for such sections and go straight to sorting + * VARs within their DATASEC. + */ + if (strcmp(sec_name, KCONFIG_SEC) == 0 || strcmp(sec_name, KSYMS_SEC) == 0) + goto sort_vars; + + /* Clang leaves DATASEC size and VAR offsets as zeroes, so we need to + * fix this up. But BPF static linker already fixes this up and fills + * all the sizes and offsets during static linking. So this step has + * to be optional. But the STV_HIDDEN handling is non-optional for any + * non-extern DATASEC, so the variable fixup loop below handles both + * functions at the same time, paying the cost of BTF VAR <-> ELF + * symbol matching just once. + */ + if (t->size == 0) { + err = find_elf_sec_sz(obj, sec_name, &size); + if (err || !size) { + pr_debug("sec '%s': failed to determine size from ELF: size %u, err %d\n", + sec_name, size, err); + return -ENOENT; + } + + t->size = size; + fixup_offsets = true; + } + + for (i = 0, vsi = btf_var_secinfos(t); i < vars; i++, vsi++) { + const struct btf_type *t_var; + struct btf_var *var; + const char *var_name; + Elf64_Sym *sym; + + t_var = btf__type_by_id(btf, vsi->type); + if (!t_var || !btf_is_var(t_var)) { + pr_debug("sec '%s': unexpected non-VAR type found\n", sec_name); + return -EINVAL; + } + + var = btf_var(t_var); + if (var->linkage == BTF_VAR_STATIC || var->linkage == BTF_VAR_GLOBAL_EXTERN) + continue; + + var_name = btf__name_by_offset(btf, t_var->name_off); + if (!var_name) { + pr_debug("sec '%s': failed to find name of DATASEC's member #%d\n", + sec_name, i); + return -ENOENT; + } + + sym = find_elf_var_sym(obj, var_name); + if (IS_ERR(sym)) { + pr_debug("sec '%s': failed to find ELF symbol for VAR '%s'\n", + sec_name, var_name); + return -ENOENT; + } + + if (fixup_offsets) + vsi->offset = sym->st_value; + + /* if variable is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF VAR + * as static. This follows similar logic for functions (BPF + * subprogs) and influences libbpf's further decisions about + * whether to make global data BPF array maps as + * BPF_F_MMAPABLE. + */ + if (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL) + var->linkage = BTF_VAR_STATIC; + } + +sort_vars: + qsort(btf_var_secinfos(t), vars, sizeof(*vsi), compare_vsi_off); + return 0; +} + +static int bpf_object_fixup_btf(struct bpf_object *obj) +{ + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_type *t = btf_type_by_id(obj->btf, i); + + /* Loader needs to fix up some of the things compiler + * couldn't get its hands on while emitting BTF. This + * is section size and global variable offset. We use + * the info from the ELF itself for this purpose. + */ + if (btf_is_datasec(t)) { + err = btf_fixup_datasec(obj, obj->btf, t); + if (err) + return err; + } + } + + return 0; +} + +static bool prog_needs_vmlinux_btf(struct bpf_program *prog) +{ + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_LSM) + return true; + + /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs + * also need vmlinux BTF + */ + if (prog->type == BPF_PROG_TYPE_TRACING && !prog->attach_prog_fd) + return true; + + return false; +} + +static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* CO-RE relocations need kernel BTF, only when btf_custom_path + * is not specified + */ + if (obj->btf_ext && obj->btf_ext->core_relo_info.len && !obj->btf_custom_path) + return true; + + /* Support for typed ksyms needs kernel BTF */ + for (i = 0; i < obj->nr_extern; i++) { + const struct extern_desc *ext; + + ext = &obj->externs[i]; + if (ext->type == EXT_KSYM && ext->ksym.type_id) + return true; + } + + bpf_object__for_each_program(prog, obj) { + if (!prog->autoload) + continue; + if (prog_needs_vmlinux_btf(prog)) + return true; + } + + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) +{ + int err; + + /* btf_vmlinux could be loaded earlier */ + if (obj->btf_vmlinux || obj->gen_loader) + return 0; + + if (!force && !obj_needs_vmlinux_btf(obj)) + return 0; + + obj->btf_vmlinux = btf__load_vmlinux_btf(); + err = libbpf_get_error(obj->btf_vmlinux); + if (err) { + pr_warn("Error loading vmlinux BTF: %d\n", err); + obj->btf_vmlinux = NULL; + return err; + } + return 0; +} + +static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) +{ + struct btf *kern_btf = obj->btf; + bool btf_mandatory, sanitize; + int i, err = 0; + + if (!obj->btf) + return 0; + + if (!kernel_supports(obj, FEAT_BTF)) { + if (kernel_needs_btf(obj)) { + err = -EOPNOTSUPP; + goto report; + } + pr_debug("Kernel doesn't support BTF, skipping uploading it.\n"); + return 0; + } + + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. + */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__type_cnt(obj->btf); + for (j = 1; j < n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + + sanitize = btf_needs_sanitization(obj); + if (sanitize) { + const void *raw_data; + __u32 sz; + + /* clone BTF to sanitize a copy and leave the original intact */ + raw_data = btf__raw_data(obj->btf, &sz); + kern_btf = btf__new(raw_data, sz); + err = libbpf_get_error(kern_btf); + if (err) + return err; + + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + err = bpf_object__sanitize_btf(obj, kern_btf); + if (err) + return err; + } + + if (obj->gen_loader) { + __u32 raw_size = 0; + const void *raw_data = btf__raw_data(kern_btf, &raw_size); + + if (!raw_data) + return -ENOMEM; + bpf_gen__load_btf(obj->gen_loader, raw_data, raw_size); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + btf__set_fd(kern_btf, 0); + } else { + /* currently BPF_BTF_LOAD only supports log_level 1 */ + err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, + obj->log_level ? 1 : 0); + } + if (sanitize) { + if (!err) { + /* move fd to libbpf's BTF */ + btf__set_fd(obj->btf, btf__fd(kern_btf)); + btf__set_fd(kern_btf, -1); + } + btf__free(kern_btf); + } +report: + if (err) { + btf_mandatory = kernel_needs_btf(obj); + pr_warn("Error loading .BTF into kernel: %d. %s\n", err, + btf_mandatory ? "BTF is mandatory, can't proceed." + : "BTF is optional, ignoring."); + if (!btf_mandatory) + err = 0; + } + return err; +} + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static const char *elf_sec_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.shstrndx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx) +{ + Elf_Scn *scn; + + scn = elf_getscn(obj->efile.elf, idx); + if (!scn) { + pr_warn("elf: failed to get section(%zu) from %s: %s\n", + idx, obj->path, elf_errmsg(-1)); + return NULL; + } + return scn; +} + +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name) +{ + Elf_Scn *scn = NULL; + Elf *elf = obj->efile.elf; + const char *sec_name; + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sec_name = elf_sec_name(obj, scn); + if (!sec_name) + return NULL; + + if (strcmp(sec_name, name) != 0) + continue; + + return scn; + } + return NULL; +} + +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf64_Shdr *shdr; + + if (!scn) + return NULL; + + shdr = elf64_getshdr(scn); + if (!shdr) { + pr_warn("elf: failed to get section(%zu) header from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return shdr; +} + +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn) +{ + const char *name; + Elf64_Shdr *sh; + + if (!scn) + return NULL; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return NULL; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) { + pr_warn("elf: failed to get section(%zu) name from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf_Data *data; + + if (!scn) + return NULL; + + data = elf_getdata(scn, 0); + if (!data) { + pr_warn("elf: failed to get section(%zu) %s data from %s: %s\n", + elf_ndxscn(scn), elf_sec_name(obj, scn) ?: "<?>", + obj->path, elf_errmsg(-1)); + return NULL; + } + + return data; +} + +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx) +{ + if (idx >= obj->efile.symbols->d_size / sizeof(Elf64_Sym)) + return NULL; + + return (Elf64_Sym *)obj->efile.symbols->d_buf + idx; +} + +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx) +{ + if (idx >= data->d_size / sizeof(Elf64_Rel)) + return NULL; + + return (Elf64_Rel *)data->d_buf + idx; +} + +static bool is_sec_name_dwarf(const char *name) +{ + /* approximation, but the actual list is too long */ + return str_has_pfx(name, ".debug_"); +} + +static bool ignore_elf_section(Elf64_Shdr *hdr, const char *name) +{ + /* no special handling of .strtab */ + if (hdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (hdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (hdr->sh_type == SHT_PROGBITS && hdr->sh_size == 0 && + strcmp(name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_sec_name_dwarf(name)) + return true; + + if (str_has_pfx(name, ".rel")) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_sec_name_dwarf(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static int cmp_progs(const void *_a, const void *_b) +{ + const struct bpf_program *a = _a; + const struct bpf_program *b = _b; + + if (a->sec_idx != b->sec_idx) + return a->sec_idx < b->sec_idx ? -1 : 1; + + /* sec_insn_off can't be the same within the section */ + return a->sec_insn_off < b->sec_insn_off ? -1 : 1; +} + +static int bpf_object__elf_collect(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + Elf *elf = obj->efile.elf; + Elf_Data *btf_ext_data = NULL; + Elf_Data *btf_data = NULL; + int idx = 0, err = 0; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + Elf64_Shdr *sh; + + /* ELF section indices are 0-based, but sec #0 is special "invalid" + * section. Since section count retrieved by elf_getshdrnum() does + * include sec #0, it is already the necessary size of an array to keep + * all the sections. + */ + if (elf_getshdrnum(obj->efile.elf, &obj->efile.sec_cnt)) { + pr_warn("elf: failed to get the number of sections for %s: %s\n", + obj->path, elf_errmsg(-1)); + return -LIBBPF_ERRNO__FORMAT; + } + obj->efile.secs = calloc(obj->efile.sec_cnt, sizeof(*obj->efile.secs)); + if (!obj->efile.secs) + return -ENOMEM; + + /* a bunch of ELF parsing functionality depends on processing symbols, + * so do the first pass and find the symbol table + */ + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + if (sh->sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warn("elf: multiple symbol tables in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + idx = elf_ndxscn(scn); + + obj->efile.symbols = data; + obj->efile.symbols_shndx = idx; + obj->efile.strtabidx = sh->sh_link; + } + } + + if (!obj->efile.symbols) { + pr_warn("elf: couldn't find symbol table in %s, stripped object file?\n", + obj->path); + return -ENOENT; + } + + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + idx = elf_ndxscn(scn); + sec_desc = &obj->efile.secs[idx]; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) + return -LIBBPF_ERRNO__FORMAT; + + if (ignore_elf_section(sh, name)) + continue; + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + pr_debug("elf: section(%d) %s, size %ld, link %d, flags %lx, type=%d\n", + idx, name, (unsigned long)data->d_size, + (int)sh->sh_link, (unsigned long)sh->sh_flags, + (int)sh->sh_type); + + if (strcmp(name, "license") == 0) { + err = bpf_object__init_license(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "version") == 0) { + err = bpf_object__init_kversion(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "maps") == 0) { + pr_warn("elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+\n"); + return -ENOTSUP; + } else if (strcmp(name, MAPS_ELF_SEC) == 0) { + obj->efile.btf_maps_shndx = idx; + } else if (strcmp(name, BTF_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_data = data; + } else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_ext_data = data; + } else if (sh->sh_type == SHT_SYMTAB) { + /* already processed during the first pass above */ + } else if (sh->sh_type == SHT_PROGBITS && data->d_size > 0) { + if (sh->sh_flags & SHF_EXECINSTR) { + if (strcmp(name, ".text") == 0) + obj->efile.text_shndx = idx; + err = bpf_object__add_programs(obj, data, name, idx); + if (err) + return err; + } else if (strcmp(name, DATA_SEC) == 0 || + str_has_pfx(name, DATA_SEC ".")) { + sec_desc->sec_type = SEC_DATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, RODATA_SEC) == 0 || + str_has_pfx(name, RODATA_SEC ".")) { + sec_desc->sec_type = SEC_RODATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, STRUCT_OPS_SEC) == 0) { + obj->efile.st_ops_data = data; + obj->efile.st_ops_shndx = idx; + } else { + pr_info("elf: skipping unrecognized data section(%d) %s\n", + idx, name); + } + } else if (sh->sh_type == SHT_REL) { + int targ_sec_idx = sh->sh_info; /* points to other section */ + + if (sh->sh_entsize != sizeof(Elf64_Rel) || + targ_sec_idx >= obj->efile.sec_cnt) + return -LIBBPF_ERRNO__FORMAT; + + /* Only do relo for section with exec instructions */ + if (!section_have_execinstr(obj, targ_sec_idx) && + strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" MAPS_ELF_SEC)) { + pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n", + idx, name, targ_sec_idx, + elf_sec_name(obj, elf_sec_by_idx(obj, targ_sec_idx)) ?: "<?>"); + continue; + } + + sec_desc->sec_type = SEC_RELO; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (sh->sh_type == SHT_NOBITS && (strcmp(name, BSS_SEC) == 0 || + str_has_pfx(name, BSS_SEC "."))) { + sec_desc->sec_type = SEC_BSS; + sec_desc->shdr = sh; + sec_desc->data = data; + } else { + pr_info("elf: skipping section(%d) %s (size %zu)\n", idx, name, + (size_t)sh->sh_size); + } + } + + if (!obj->efile.strtabidx || obj->efile.strtabidx > idx) { + pr_warn("elf: symbol strings section missing or invalid in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + /* sort BPF programs by section name and in-section instruction offset + * for faster search + */ + if (obj->nr_programs) + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + + return bpf_object__init_btf(obj, btf_data, btf_ext_data); +} + +static bool sym_is_extern(const Elf64_Sym *sym) +{ + int bind = ELF64_ST_BIND(sym->st_info); + /* externs are symbols w/ type=NOTYPE, bind=GLOBAL|WEAK, section=UND */ + return sym->st_shndx == SHN_UNDEF && + (bind == STB_GLOBAL || bind == STB_WEAK) && + ELF64_ST_TYPE(sym->st_info) == STT_NOTYPE; +} + +static bool sym_is_subprog(const Elf64_Sym *sym, int text_shndx) +{ + int bind = ELF64_ST_BIND(sym->st_info); + int type = ELF64_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + +static int find_extern_btf_id(const struct btf *btf, const char *ext_name) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_var(t) && !btf_is_func(t)) + continue; + + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) + continue; + + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) + return -EINVAL; + + return i; + } + + return -ENOENT; +} + +static int find_extern_sec_btf_id(struct btf *btf, int ext_btf_id) { + const struct btf_var_secinfo *vs; + const struct btf_type *t; + int i, j, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_datasec(t)) + continue; + + vs = btf_var_secinfos(t); + for (j = 0; j < btf_vlen(t); j++, vs++) { + if (vs->type == ext_btf_id) + return i; + } + } + + return -ENOENT; +} + +static enum kcfg_type find_kcfg_type(const struct btf *btf, int id, + bool *is_signed) +{ + const struct btf_type *t; + const char *name; + + t = skip_mods_and_typedefs(btf, id, NULL); + name = btf__name_by_offset(btf, t->name_off); + + if (is_signed) + *is_signed = false; + switch (btf_kind(t)) { + case BTF_KIND_INT: { + int enc = btf_int_encoding(t); + + if (enc & BTF_INT_BOOL) + return t->size == 1 ? KCFG_BOOL : KCFG_UNKNOWN; + if (is_signed) + *is_signed = enc & BTF_INT_SIGNED; + if (t->size == 1) + return KCFG_CHAR; + if (t->size < 1 || t->size > 8 || (t->size & (t->size - 1))) + return KCFG_UNKNOWN; + return KCFG_INT; + } + case BTF_KIND_ENUM: + if (t->size != 4) + return KCFG_UNKNOWN; + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ENUM64: + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ARRAY: + if (btf_array(t)->nelems == 0) + return KCFG_UNKNOWN; + if (find_kcfg_type(btf, btf_array(t)->type, NULL) != KCFG_CHAR) + return KCFG_UNKNOWN; + return KCFG_CHAR_ARR; + default: + return KCFG_UNKNOWN; + } +} + +static int cmp_externs(const void *_a, const void *_b) +{ + const struct extern_desc *a = _a; + const struct extern_desc *b = _b; + + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + if (a->type == EXT_KCFG) { + /* descending order by alignment requirements */ + if (a->kcfg.align != b->kcfg.align) + return a->kcfg.align > b->kcfg.align ? -1 : 1; + /* ascending order by size, within same alignment class */ + if (a->kcfg.sz != b->kcfg.sz) + return a->kcfg.sz < b->kcfg.sz ? -1 : 1; + } + + /* resolve ties by name */ + return strcmp(a->name, b->name); +} + +static int find_int_btf_id(const struct btf *btf) +{ + const struct btf_type *t; + int i, n; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (btf_is_int(t) && btf_int_bits(t) == 32) + return i; + } + + return 0; +} + +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + if (!btf) + return 0; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. */ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + +static int bpf_object__collect_externs(struct bpf_object *obj) +{ + struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; + const struct btf_type *t; + struct extern_desc *ext; + int i, n, off, dummy_var_btf_id; + const char *ext_name, *sec_name; + Elf_Scn *scn; + Elf64_Shdr *sh; + + if (!obj->efile.symbols) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.symbols_shndx); + sh = elf_sec_hdr(obj, scn); + if (!sh || sh->sh_entsize != sizeof(Elf64_Sym)) + return -LIBBPF_ERRNO__FORMAT; + + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + + n = sh->sh_size / sh->sh_entsize; + pr_debug("looking for externs among %d symbols...\n", n); + + for (i = 0; i < n; i++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, i); + + if (!sym) + return -LIBBPF_ERRNO__FORMAT; + if (!sym_is_extern(sym)) + continue; + ext_name = elf_sym_str(obj, sym->st_name); + if (!ext_name || !ext_name[0]) + continue; + + ext = obj->externs; + ext = libbpf_reallocarray(ext, obj->nr_extern + 1, sizeof(*ext)); + if (!ext) + return -ENOMEM; + obj->externs = ext; + ext = &ext[obj->nr_extern]; + memset(ext, 0, sizeof(*ext)); + obj->nr_extern++; + + ext->btf_id = find_extern_btf_id(obj->btf, ext_name); + if (ext->btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s': %d\n", + ext_name, ext->btf_id); + return ext->btf_id; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->sym_idx = i; + ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; + + ext->sec_btf_id = find_extern_sec_btf_id(obj->btf, ext->btf_id); + if (ext->sec_btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s' [%d] section: %d\n", + ext_name, ext->btf_id, ext->sec_btf_id); + return ext->sec_btf_id; + } + sec = (void *)btf__type_by_id(obj->btf, ext->sec_btf_id); + sec_name = btf__name_by_offset(obj->btf, sec->name_off); + + if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } + kcfg_sec = sec; + ext->type = EXT_KCFG; + ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); + if (ext->kcfg.sz <= 0) { + pr_warn("failed to resolve size of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.sz); + return ext->kcfg.sz; + } + ext->kcfg.align = btf__align_of(obj->btf, t->type); + if (ext->kcfg.align <= 0) { + pr_warn("failed to determine alignment of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.align); + return -EINVAL; + } + ext->kcfg.type = find_kcfg_type(obj->btf, t->type, + &ext->kcfg.is_signed); + if (ext->kcfg.type == KCFG_UNKNOWN) { + pr_warn("extern (kcfg) '%s': type is unsupported\n", ext_name); + return -ENOTSUP; + } + } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + ksym_sec = sec; + ext->type = EXT_KSYM; + skip_mods_and_typedefs(obj->btf, t->type, + &ext->ksym.type_id); + } else { + pr_warn("unrecognized extern section '%s'\n", sec_name); + return -ENOTSUP; + } + } + pr_debug("collected %d externs total\n", obj->nr_extern); + + if (!obj->nr_extern) + return 0; + + /* sort externs by type, for kcfg ones also by (align, size, name) */ + qsort(obj->externs, obj->nr_extern, sizeof(*ext), cmp_externs); + + /* for .ksyms section, we need to turn all externs into allocated + * variables in BTF to pass kernel verification; we do this by + * pretending that each extern is a 8-byte variable + */ + if (ksym_sec) { + /* find existing 4-byte integer type in BTF to use for fake + * extern variables in DATASEC + */ + int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM) + continue; + pr_debug("extern (ksym) #%d: symbol %d, name %s\n", + i, ext->sym_idx, ext->name); + } + + sec = ksym_sec; + n = btf_vlen(sec); + for (i = 0, off = 0; i < n; i++, off += sizeof(int)) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + struct btf_type *vt; + + vt = (void *)btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, vt->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); + return -ESRCH; + } + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. + */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } + vs->offset = off; + vs->size = sizeof(int); + } + sec->size = off; + } + + if (kcfg_sec) { + sec = kcfg_sec; + /* for kcfg externs calculate their offsets within a .kconfig map */ + off = 0; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KCFG) + continue; + + ext->kcfg.data_off = roundup(off, ext->kcfg.align); + off = ext->kcfg.data_off + ext->kcfg.sz; + pr_debug("extern (kcfg) #%d: symbol %d, off %u, name %s\n", + i, ext->sym_idx, ext->kcfg.data_off, ext->name); + } + sec->size = off; + n = btf_vlen(sec); + for (i = 0; i < n; i++) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + + t = btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, t->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF var '%s'\n", + ext_name); + return -ESRCH; + } + btf_var(t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vs->offset = ext->kcfg.data_off; + } + } + return 0; +} + +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) +{ + return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; +} + +struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + if (prog_is_subprog(obj, prog)) + continue; + if (!strcmp(prog->name, name)) + return prog; + } + return errno = ENOENT, NULL; +} + +static bool bpf_object__shndx_is_data(const struct bpf_object *obj, + int shndx) +{ + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + case SEC_DATA: + case SEC_RODATA: + return true; + default: + return false; + } +} + +static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, + int shndx) +{ + return shndx == obj->efile.btf_maps_shndx; +} + +static enum libbpf_map_type +bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) +{ + if (shndx == obj->efile.symbols_shndx) + return LIBBPF_MAP_KCONFIG; + + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + return LIBBPF_MAP_BSS; + case SEC_DATA: + return LIBBPF_MAP_DATA; + case SEC_RODATA: + return LIBBPF_MAP_RODATA; + default: + return LIBBPF_MAP_UNSPEC; + } +} + +static int bpf_program__record_reloc(struct bpf_program *prog, + struct reloc_desc *reloc_desc, + __u32 insn_idx, const char *sym_name, + const Elf64_Sym *sym, const Elf64_Rel *rel) +{ + struct bpf_insn *insn = &prog->insns[insn_idx]; + size_t map_idx, nr_maps = prog->obj->nr_maps; + struct bpf_object *obj = prog->obj; + __u32 shdr_idx = sym->st_shndx; + enum libbpf_map_type type; + const char *sym_sec_name; + struct bpf_map *map; + + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { + pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", + prog->name, sym_name, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + + if (sym_is_extern(sym)) { + int sym_idx = ELF64_R_SYM(rel->r_info); + int i, n = obj->nr_extern; + struct extern_desc *ext; + + for (i = 0; i < n; i++) { + ext = &obj->externs[i]; + if (ext->sym_idx == sym_idx) + break; + } + if (i >= n) { + pr_warn("prog '%s': extern relo failed to find extern for '%s' (%d)\n", + prog->name, sym_name, sym_idx); + return -LIBBPF_ERRNO__RELOC; + } + pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", + prog->name, i, ext->name, ext->sym_idx, insn_idx); + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_FUNC; + else + reloc_desc->type = RELO_EXTERN_VAR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = i; /* sym_off stores extern index */ + return 0; + } + + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { + pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", + prog->name, sym_name, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. + */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + + /* generic map reference relocation */ + if (type == LIBBPF_MAP_UNSPEC) { + if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { + pr_warn("prog '%s': bad map relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || + map->sec_idx != sym->st_shndx || + map->sec_offset != sym->st_value) + continue; + pr_debug("prog '%s': found map %zd (%s, sec %d, off %zu) for insn #%u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': map relo failed to find map for section '%s', off %zu\n", + prog->name, sym_sec_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = 0; /* sym->st_value determines map_idx */ + return 0; + } + + /* global data map relocation */ + if (!bpf_object__shndx_is_data(obj, shdr_idx)) { + pr_warn("prog '%s': bad data relo against section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || map->sec_idx != sym->st_shndx) + continue; + pr_debug("prog '%s': found data map %zd (%s, sec %d, off %zu) for insn %u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': data relo failed to find map for section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = sym->st_value; + return 0; +} + +static bool prog_contains_insn(const struct bpf_program *prog, size_t insn_idx) +{ + return insn_idx >= prog->sec_insn_off && + insn_idx < prog->sec_insn_off + prog->sec_insn_cnt; +} + +static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, + size_t sec_idx, size_t insn_idx) +{ + int l = 0, r = obj->nr_programs - 1, m; + struct bpf_program *prog; + + if (!obj->nr_programs) + return NULL; + + while (l < r) { + m = l + (r - l + 1) / 2; + prog = &obj->programs[m]; + + if (prog->sec_idx < sec_idx || + (prog->sec_idx == sec_idx && prog->sec_insn_off <= insn_idx)) + l = m; + else + r = m - 1; + } + /* matching program could be at index l, but it still might be the + * wrong one, so we need to double check conditions for the last time + */ + prog = &obj->programs[l]; + if (prog->sec_idx == sec_idx && prog_contains_insn(prog, insn_idx)) + return prog; + return NULL; +} + +static int +bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) +{ + const char *relo_sec_name, *sec_name; + size_t sec_idx = shdr->sh_info, sym_idx; + struct bpf_program *prog; + struct reloc_desc *relos; + int err, i, nrels; + const char *sym_name; + __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; + Elf64_Sym *sym; + Elf64_Rel *rel; + + if (sec_idx >= obj->efile.sec_cnt) + return -EINVAL; + + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + + relo_sec_name = elf_sec_str(obj, shdr->sh_name); + sec_name = elf_sec_name(obj, scn); + if (!relo_sec_name || !sec_name) + return -EINVAL; + + pr_debug("sec '%s': collecting relocation for section(%zu) '%s'\n", + relo_sec_name, sec_idx, sec_name); + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn("sec '%s': failed to get relo #%d\n", relo_sec_name, i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym_idx = ELF64_R_SYM(rel->r_info); + sym = elf_sym_by_idx(obj, sym_idx); + if (!sym) { + pr_warn("sec '%s': symbol #%zu not found for relo #%d\n", + relo_sec_name, sym_idx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sym->st_shndx >= obj->efile.sec_cnt) { + pr_warn("sec '%s': corrupted symbol #%zu pointing to invalid section #%zu for relo #%d\n", + relo_sec_name, sym_idx, (size_t)sym->st_shndx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (rel->r_offset % BPF_INSN_SZ || rel->r_offset >= scn_data->d_size) { + pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", + relo_sec_name, (size_t)rel->r_offset, i); + return -LIBBPF_ERRNO__FORMAT; + } + + insn_idx = rel->r_offset / BPF_INSN_SZ; + /* relocations against static functions are recorded as + * relocations against the section that contains a function; + * in such case, symbol will be STT_SECTION and sym.st_name + * will point to empty string (0), so fetch section name + * instead + */ + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && sym->st_name == 0) + sym_name = elf_sec_name(obj, elf_sec_by_idx(obj, sym->st_shndx)); + else + sym_name = elf_sym_str(obj, sym->st_name); + sym_name = sym_name ?: "<?"; + + pr_debug("sec '%s': relo #%d: insn #%u against '%s'\n", + relo_sec_name, i, insn_idx, sym_name); + + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", + relo_sec_name, i, sec_name, insn_idx); + continue; + } + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + prog->reloc_desc = relos; + + /* adjust insn_idx to local BPF program frame of reference */ + insn_idx -= prog->sec_insn_off; + err = bpf_program__record_reloc(prog, &relos[prog->nr_reloc], + insn_idx, sym_name, sym, rel); + if (err) + return err; + + prog->nr_reloc++; + } + return 0; +} + +static int map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map) +{ + int id; + + if (!obj->btf) + return -ENOENT; + + /* if it's BTF-defined map, we don't need to search for type IDs. + * For struct_ops map, it does not need btf_key_type_id and + * btf_value_type_id. + */ + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) + return 0; + + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; + + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; + + map->btf_key_type_id = 0; + map->btf_value_type_id = id; + return 0; +} + +static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) +{ + char file[PATH_MAX], buff[4096]; + FILE *fp; + __u32 val; + int err; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + memset(info, 0, sizeof(*info)); + + fp = fopen(file, "r"); + if (!fp) { + err = -errno; + pr_warn("failed to open %s: %d. No procfs support?\n", file, + err); + return err; + } + + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + info->type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + info->key_size = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + info->value_size = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + info->max_entries = val; + else if (sscanf(buff, "map_flags:\t%i", &val) == 1) + info->map_flags = val; + } + + fclose(fp); + + return 0; +} + +bool bpf_map__autocreate(const struct bpf_map *map) +{ + return map->autocreate; +} + +int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->autocreate = autocreate; + return 0; +} + +int bpf_map__reuse_fd(struct bpf_map *map, int fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info), name_len; + int new_fd, err; + char *new_name; + + memset(&info, 0, len); + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(fd, &info); + if (err) + return libbpf_err(err); + + name_len = strlen(info.name); + if (name_len == BPF_OBJ_NAME_LEN - 1 && strncmp(map->name, info.name, name_len) == 0) + new_name = strdup(map->name); + else + new_name = strdup(info.name); + + if (!new_name) + return libbpf_err(-errno); + + new_fd = open("/", O_RDONLY | O_CLOEXEC); + if (new_fd < 0) { + err = -errno; + goto err_free_new_name; + } + + new_fd = dup3(fd, new_fd, O_CLOEXEC); + if (new_fd < 0) { + err = -errno; + goto err_close_new_fd; + } + + err = zclose(map->fd); + if (err) { + err = -errno; + goto err_close_new_fd; + } + free(map->name); + + map->fd = new_fd; + map->name = new_name; + map->def.type = info.type; + map->def.key_size = info.key_size; + map->def.value_size = info.value_size; + map->def.max_entries = info.max_entries; + map->def.map_flags = info.map_flags; + map->btf_key_type_id = info.btf_key_type_id; + map->btf_value_type_id = info.btf_value_type_id; + map->reused = true; + map->map_extra = info.map_extra; + + return 0; + +err_close_new_fd: + close(new_fd); +err_free_new_name: + free(new_name); + return libbpf_err(err); +} + +__u32 bpf_map__max_entries(const struct bpf_map *map) +{ + return map->def.max_entries; +} + +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return errno = EINVAL, NULL; + + return map->inner_map; +} + +int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->def.max_entries = max_entries; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + return 0; +} + +static int +bpf_object__probe_loading(struct bpf_object *obj) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + if (obj->gen_loader) + return 0; + + ret = bump_rlimit_memlock(); + if (ret) + pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret); + + /* make sure basic loading works */ + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) { + ret = errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF " + "program. Make sure your kernel supports BPF " + "(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is " + "set to big enough value.\n", __func__, cp, ret); + return -ret; + } + close(ret); + + return 0; +} + +static int probe_fd(int fd) +{ + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static int probe_kern_prog_name(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); + + /* make sure loading with name works */ + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); + return probe_fd(ret); +} + +static int probe_kern_global_data(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + insns[0].imm = map; + + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + close(map); + return probe_fd(ret); +} + +static int probe_kern_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func_global(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_datasec(void) +{ + static const char strs[] = "\0x\0.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC val */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_decl_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* attr */ + BTF_TYPE_DECL_TAG_ENC(1, 2, -1), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_type_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* attr */ + BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ + /* ptr */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_array_mmap(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); +} + +static int probe_kern_exp_attach_type(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + /* use any valid combination of program type and (optional) + * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) + * to see if kernel supports expected_attach_type field for + * BPF_PROG_LOAD command + */ + fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_kern_probe_read_kernel(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ + BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ + BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(fd); +} + +static int probe_prog_bind_map(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. + */ + err = bpf_obj_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + +static int probe_perf_link(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", + insns, ARRAY_SIZE(insns), NULL); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(ret); +} + +static int probe_kern_btf_enum64(void) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_syscall_wrapper(void); + +enum kern_feature_result { + FEAT_UNKNOWN = 0, + FEAT_SUPPORTED = 1, + FEAT_MISSING = 2, +}; + +typedef int (*feature_probe_fn)(void); + +static struct kern_feature_desc { + const char *desc; + feature_probe_fn probe; + enum kern_feature_result res; +} feature_probes[__FEAT_CNT] = { + [FEAT_PROG_NAME] = { + "BPF program name", probe_kern_prog_name, + }, + [FEAT_GLOBAL_DATA] = { + "global variables", probe_kern_global_data, + }, + [FEAT_BTF] = { + "minimal BTF", probe_kern_btf, + }, + [FEAT_BTF_FUNC] = { + "BTF functions", probe_kern_btf_func, + }, + [FEAT_BTF_GLOBAL_FUNC] = { + "BTF global function", probe_kern_btf_func_global, + }, + [FEAT_BTF_DATASEC] = { + "BTF data section and variable", probe_kern_btf_datasec, + }, + [FEAT_ARRAY_MMAP] = { + "ARRAY map mmap()", probe_kern_array_mmap, + }, + [FEAT_EXP_ATTACH_TYPE] = { + "BPF_PROG_LOAD expected_attach_type attribute", + probe_kern_exp_attach_type, + }, + [FEAT_PROBE_READ_KERN] = { + "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + }, + [FEAT_BTF_DECL_TAG] = { + "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, + }, + [FEAT_BTF_TYPE_TAG] = { + "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, + }, + [FEAT_MEMCG_ACCOUNT] = { + "memcg-based memory accounting", probe_memcg_account, + }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, +}; + +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) +{ + struct kern_feature_desc *feat = &feature_probes[feat_id]; + int ret; + + if (obj && obj->gen_loader) + /* To generate loader program assume the latest kernel + * to avoid doing extra prog_load, map_create syscalls. + */ + return true; + + if (READ_ONCE(feat->res) == FEAT_UNKNOWN) { + ret = feat->probe(); + if (ret > 0) { + WRITE_ONCE(feat->res, FEAT_SUPPORTED); + } else if (ret == 0) { + WRITE_ONCE(feat->res, FEAT_MISSING); + } else { + pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); + WRITE_ONCE(feat->res, FEAT_MISSING); + } + } + + return READ_ONCE(feat->res) == FEAT_SUPPORTED; +} + +static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) +{ + struct bpf_map_info map_info; + char msg[STRERR_BUFSIZE]; + __u32 map_info_len = sizeof(map_info); + int err; + + memset(&map_info, 0, map_info_len); + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); + if (err) { + pr_warn("failed to get map info for map FD %d: %s\n", map_fd, + libbpf_strerror_r(errno, msg, sizeof(msg))); + return false; + } + + return (map_info.type == map->def.type && + map_info.key_size == map->def.key_size && + map_info.value_size == map->def.value_size && + map_info.max_entries == map->def.max_entries && + map_info.map_flags == map->def.map_flags && + map_info.map_extra == map->map_extra); +} + +static int +bpf_object__reuse_map(struct bpf_map *map) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err, pin_fd; + + pin_fd = bpf_obj_get(map->pin_path); + if (pin_fd < 0) { + err = -errno; + if (err == -ENOENT) { + pr_debug("found no pinned map to reuse at '%s'\n", + map->pin_path); + return 0; + } + + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("couldn't retrieve pinned map '%s': %s\n", + map->pin_path, cp); + return err; + } + + if (!map_is_reuse_compat(map, pin_fd)) { + pr_warn("couldn't reuse pinned map at '%s': parameter mismatch\n", + map->pin_path); + close(pin_fd); + return -EINVAL; + } + + err = bpf_map__reuse_fd(map, pin_fd); + close(pin_fd); + if (err) + return err; + + map->pinned = true; + pr_debug("reused pinned map at '%s'\n", map->pin_path); + + return 0; +} + +static int +bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) +{ + enum libbpf_map_type map_type = map->libbpf_type; + char *cp, errmsg[STRERR_BUFSIZE]; + int err, zero = 0; + + if (obj->gen_loader) { + bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps, + map->mmaped, map->def.value_size); + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) + bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); + return 0; + } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error setting initial map(%s) contents: %s\n", + map->name, cp); + return err; + } + + /* Freeze .rodata and .kconfig map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) { + err = bpf_map_freeze(map->fd); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error freezing map(%s) as read-only: %s\n", + map->name, cp); + return err; + } + } + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map); + +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) +{ + LIBBPF_OPTS(bpf_map_create_opts, create_attr); + struct bpf_map_def *def = &map->def; + const char *map_name = NULL; + int err = 0; + + if (kernel_supports(obj, FEAT_PROG_NAME)) + map_name = map->name; + create_attr.map_ifindex = map->map_ifindex; + create_attr.map_flags = def->map_flags; + create_attr.numa_node = map->numa_node; + create_attr.map_extra = map->map_extra; + + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; + + if (obj->btf && btf__fd(obj->btf) >= 0) { + create_attr.btf_fd = btf__fd(obj->btf); + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; + } + + if (bpf_map_type__is_map_in_map(def->type)) { + if (map->inner_map) { + err = bpf_object__create_map(obj, map->inner_map, true); + if (err) { + pr_warn("map '%s': failed to create inner map: %d\n", + map->name, err); + return err; + } + map->inner_map_fd = bpf_map__fd(map->inner_map); + } + if (map->inner_map_fd >= 0) + create_attr.inner_map_fd = map->inner_map_fd; + } + + switch (def->type) { + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + default: + break; + } + + if (obj->gen_loader) { + bpf_gen__map_create(obj->gen_loader, def->type, map_name, + def->key_size, def->value_size, def->max_entries, + &create_attr, is_inner ? -1 : map - obj->maps); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + map->fd = 0; + } else { + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + if (map->fd < 0 && (create_attr.btf_key_type_id || + create_attr.btf_value_type_id)) { + char *cp, errmsg[STRERR_BUFSIZE]; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", + map->name, cp, err); + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + + err = map->fd < 0 ? -errno : 0; + + if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { + if (obj->gen_loader) + map->inner_map->fd = -1; + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + return err; +} + +static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_map *targ_map; + unsigned int i; + int fd, err = 0; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_map = map->init_slots[i]; + fd = bpf_map__fd(targ_map); + + if (obj->gen_loader) { + bpf_gen__populate_outer_map(obj->gen_loader, + map - obj->maps, i, + targ_map - obj->maps); + } else { + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + } + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, i, targ_map->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, i, targ_map->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int init_prog_array_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_program *targ_prog; + unsigned int i; + int fd, err; + + if (obj->gen_loader) + return -ENOTSUP; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_prog = map->init_slots[i]; + fd = bpf_program__fd(targ_prog); + + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to prog '%s' fd=%d: %d\n", + map->name, i, targ_prog->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to prog '%s' fd=%d\n", + map->name, i, targ_prog->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int bpf_object_init_prog_arrays(struct bpf_object *obj) +{ + struct bpf_map *map; + int i, err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!map->init_slots_sz || map->def.type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + err = init_prog_array_slots(obj, map); + if (err < 0) { + zclose(map->fd); + return err; + } + } + return 0; +} + +static int map_set_def_max_entries(struct bpf_map *map) +{ + if (map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !map->def.max_entries) { + int nr_cpus; + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warn("map '%s': failed to determine number of system CPUs: %d\n", + map->name, nr_cpus); + return nr_cpus; + } + pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); + map->def.max_entries = nr_cpus; + } + + return 0; +} + +static int +bpf_object__create_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + char *cp, errmsg[STRERR_BUFSIZE]; + unsigned int i, j; + int err; + bool retried; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + /* To support old kernels, we skip creating global data maps + * (.rodata, .data, .kconfig, etc); later on, during program + * loading, if we detect that at least one of the to-be-loaded + * programs is referencing any global data map, we'll error + * out with program name and relocation index logged. + * This approach allows to accommodate Clang emitting + * unnecessary .rodata.str1.1 sections for string literals, + * but also it allows to have CO-RE applications that use + * global variables in some of BPF programs, but not others. + * If those global variable-using programs are not loaded at + * runtime due to bpf_program__set_autoload(prog, false), + * bpf_object loading will succeed just fine even on old + * kernels. + */ + if (bpf_map__is_internal(map) && !kernel_supports(obj, FEAT_GLOBAL_DATA)) + map->autocreate = false; + + if (!map->autocreate) { + pr_debug("map '%s': skipped auto-creating...\n", map->name); + continue; + } + + err = map_set_def_max_entries(map); + if (err) + goto err_out; + + retried = false; +retry: + if (map->pin_path) { + err = bpf_object__reuse_map(map); + if (err) { + pr_warn("map '%s': error reusing pinned map\n", + map->name); + goto err_out; + } + if (retried && map->fd < 0) { + pr_warn("map '%s': cannot find pinned map\n", + map->name); + err = -ENOENT; + goto err_out; + } + } + + if (map->fd >= 0) { + pr_debug("map '%s': skipping creation (preset fd=%d)\n", + map->name, map->fd); + } else { + err = bpf_object__create_map(obj, map, false); + if (err) + goto err_out; + + pr_debug("map '%s': created successfully, fd=%d\n", + map->name, map->fd); + + if (bpf_map__is_internal(map)) { + err = bpf_object__populate_internal_map(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + + if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { + err = init_map_in_map_slots(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + } + + if (map->pin_path && !map->pinned) { + err = bpf_map__pin(map, NULL); + if (err) { + zclose(map->fd); + if (!retried && err == -EEXIST) { + retried = true; + goto retry; + } + pr_warn("map '%s': failed to auto-pin at '%s': %d\n", + map->name, map->pin_path, err); + goto err_out; + } + } + } + + return 0; + +err_out: + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); + for (j = 0; j < i; j++) + zclose(obj->maps[j].fd); + return err; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +/* Given 'some_struct_name___with_flavor' return the length of a name prefix + * before last triple underscore. Struct name part after last triple + * underscore is ignored by BPF CO-RE relocation during relocation matching. + */ +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} + +void bpf_core_free_cands(struct bpf_core_cand_list *cands) +{ + if (!cands) + return; + + free(cands->cands); + free(cands); +} + +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands) +{ + struct bpf_core_cand *new_cands, *cand; + const struct btf_type *t, *local_t; + const char *targ_name, *local_name; + size_t targ_essent_len; + int n, i; + + local_t = btf__type_by_id(local_cand->btf, local_cand->id); + local_name = btf__str_by_offset(local_cand->btf, local_t->name_off); + + n = btf__type_cnt(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf__type_by_id(targ_btf, i); + if (!btf_kind_core_compat(t, local_t)) + continue; + + targ_name = btf__name_by_offset(targ_btf, t->name_off); + if (str_is_empty(targ_name)) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + + if (strncmp(local_name, targ_name, local_essent_len) != 0) + continue; + + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", + local_cand->id, btf_kind_str(local_t), + local_name, i, btf_kind_str(t), targ_name, + targ_btf_name); + new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, + sizeof(*cands->cands)); + if (!new_cands) + return -ENOMEM; + + cand = &new_cands[cands->len]; + cand->btf = targ_btf; + cand->id = i; + + cands->cands = new_cands; + cands->len++; + } + return 0; +} + +static int load_module_btfs(struct bpf_object *obj) +{ + struct bpf_btf_info info; + struct module_btf *mod_btf; + struct btf *btf; + char name[64]; + __u32 id = 0, len; + int err, fd; + + if (obj->btf_modules_loaded) + return 0; + + if (obj->gen_loader) + return 0; + + /* don't do this again, even if we find no module BTFs */ + obj->btf_modules_loaded = true; + + /* kernel too old to support module BTFs */ + if (!kernel_supports(obj, FEAT_MODULE_BTF)) + return 0; + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err && errno == ENOENT) + return 0; + if (err) { + err = -errno; + pr_warn("failed to iterate BTF objects: %d\n", err); + return err; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; /* expected race: BTF was unloaded */ + err = -errno; + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); + return err; + } + + len = sizeof(info); + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err = -errno; + pr_warn("failed to get BTF object #%d info: %d\n", id, err); + goto err_out; + } + + /* ignore non-module BTFs */ + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { + close(fd); + continue; + } + + btf = btf_get_from_fd(fd, obj->btf_vmlinux); + err = libbpf_get_error(btf); + if (err) { + pr_warn("failed to load module [%s]'s BTF object #%d: %d\n", + name, id, err); + goto err_out; + } + + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + if (err) + goto err_out; + + mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; + + mod_btf->btf = btf; + mod_btf->id = id; + mod_btf->fd = fd; + mod_btf->name = strdup(name); + if (!mod_btf->name) { + err = -ENOMEM; + goto err_out; + } + continue; + +err_out: + close(fd); + return err; + } + + return 0; +} + +static struct bpf_core_cand_list * +bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) +{ + struct bpf_core_cand local_cand = {}; + struct bpf_core_cand_list *cands; + const struct btf *main_btf; + const struct btf_type *local_t; + const char *local_name; + size_t local_essent_len; + int err, i; + + local_cand.btf = local_btf; + local_cand.id = local_type_id; + local_t = btf__type_by_id(local_btf, local_type_id); + if (!local_t) + return ERR_PTR(-EINVAL); + + local_name = btf__name_by_offset(local_btf, local_t->name_off); + if (str_is_empty(local_name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(local_name); + + cands = calloc(1, sizeof(*cands)); + if (!cands) + return ERR_PTR(-ENOMEM); + + /* Attempt to find target candidates in vmlinux BTF first */ + main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux; + err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands); + if (err) + goto err_out; + + /* if vmlinux BTF has any candidate, don't got for module BTFs */ + if (cands->len) + return cands; + + /* if vmlinux BTF was overridden, don't attempt to load module BTFs */ + if (obj->btf_vmlinux_override) + return cands; + + /* now look through module BTFs, trying to still find candidates */ + err = load_module_btfs(obj); + if (err) + goto err_out; + + for (i = 0; i < obj->btf_module_cnt; i++) { + err = bpf_core_add_cands(&local_cand, local_essent_len, + obj->btf_modules[i].btf, + obj->btf_modules[i].name, + btf__type_cnt(obj->btf_vmlinux), + cands); + if (err) + goto err_out; + } + + return cands; +err_out: + bpf_core_free_cands(cands); + return ERR_PTR(err); +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); +} + +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} + +static size_t bpf_core_hash_fn(const long key, void *ctx) +{ + return key; +} + +static bool bpf_core_equal_fn(const long k1, const long k2, void *ctx) +{ + return k1 == k2; +} + +static int record_relo_core(struct bpf_program *prog, + const struct bpf_core_relo *core_relo, int insn_idx) +{ + struct reloc_desc *relos, *relo; + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + relo = &relos[prog->nr_reloc]; + relo->type = RELO_CORE; + relo->insn_idx = insn_idx; + relo->core_relo = core_relo; + prog->reloc_desc = relos; + prog->nr_reloc++; + return 0; +} + +static const struct bpf_core_relo *find_relo_core(struct bpf_program *prog, int insn_idx) +{ + struct reloc_desc *relo; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + relo = &prog->reloc_desc[i]; + if (relo->type != RELO_CORE || relo->insn_idx != insn_idx) + continue; + + return relo->core_relo; + } + + return NULL; +} + +static int bpf_core_resolve_relo(struct bpf_program *prog, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct hashmap *cand_cache, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec specs_scratch[3] = {}; + struct bpf_core_cand_list *cands = NULL; + const char *prog_name = prog->name; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id = relo->type_id; + int err; + + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, local_id, &cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); + } + err = hashmap__set(cand_cache, local_id, cands, NULL, NULL); + if (err) { + bpf_core_free_cands(cands); + return err; + } + } + + return bpf_core_calc_relo_insn(prog_name, relo, relo_idx, local_btf, cands, specs_scratch, + targ_res); +} + +static int +bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) +{ + const struct btf_ext_info_sec *sec; + struct bpf_core_relo_res targ_res; + const struct bpf_core_relo *rec; + const struct btf_ext_info *seg; + struct hashmap_entry *entry; + struct hashmap *cand_cache = NULL; + struct bpf_program *prog; + struct bpf_insn *insn; + const char *sec_name; + int i, err = 0, insn_idx, sec_idx, sec_num; + + if (obj->btf_ext->core_relo_info.len == 0) + return 0; + + if (targ_btf_path) { + obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL); + err = libbpf_get_error(obj->btf_vmlinux_override); + if (err) { + pr_warn("failed to parse target BTF: %d\n", err); + return err; + } + } + + cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL); + if (IS_ERR(cand_cache)) { + err = PTR_ERR(cand_cache); + goto out; + } + + seg = &obj->btf_ext->core_relo_info; + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + sec_idx = seg->sec_idxs[sec_num]; + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) { + err = -EINVAL; + goto out; + } + + pr_debug("sec '%s': found %d CO-RE relocations\n", sec_name, sec->num_info); + + for_each_btf_ext_rec(seg, sec, i, rec) { + if (rec->insn_off % BPF_INSN_SZ) + return -EINVAL; + insn_idx = rec->insn_off / BPF_INSN_SZ; + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + /* When __weak subprog is "overridden" by another instance + * of the subprog from a different object file, linker still + * appends all the .BTF.ext info that used to belong to that + * eliminated subprogram. + * This is similar to what x86-64 linker does for relocations. + * So just ignore such relocations just like we ignore + * subprog instructions when discovering subprograms. + */ + pr_debug("sec '%s': skipping CO-RE relocation #%d for insn #%d belonging to eliminated weak subprogram\n", + sec_name, i, insn_idx); + continue; + } + /* no need to apply CO-RE relocation if the program is + * not going to be loaded + */ + if (!prog->autoload) + continue; + + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx >= prog->insns_cnt) + return -EINVAL; + insn = &prog->insns[insn_idx]; + + err = record_relo_core(prog, rec, insn_idx); + if (err) { + pr_warn("prog '%s': relo #%d: failed to record relocation: %d\n", + prog->name, i, err); + goto out; + } + + if (prog->obj->gen_loader) + continue; + + err = bpf_core_resolve_relo(prog, rec, i, obj->btf, cand_cache, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", + prog->name, i, err); + goto out; + } + + err = bpf_core_patch_insn(prog->name, insn, insn_idx, rec, i, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog->name, i, insn_idx, err); + goto out; + } + } + } + +out: + /* obj->btf_vmlinux and module BTFs are freed after object load */ + btf__free(obj->btf_vmlinux_override); + obj->btf_vmlinux_override = NULL; + + if (!IS_ERR_OR_NULL(cand_cache)) { + hashmap__for_each_entry(cand_cache, entry, i) { + bpf_core_free_cands(entry->pvalue); + } + hashmap__free(cand_cache); + } + return err; +} + +/* base map load ldimm64 special constant, used also for log fixup logic */ +#define MAP_LDIMM64_POISON_BASE 2001000000 +#define MAP_LDIMM64_POISON_PFX "200100" + +static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int map_idx, const struct bpf_map *map) +{ + int i; + + pr_debug("prog '%s': relo #%d: poisoning insn #%d that loads map #%d '%s'\n", + prog->name, relo_idx, insn_idx, map_idx, map->name); + + /* we turn single ldimm64 into two identical invalid calls */ + for (i = 0; i < 2; i++) { + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is map index into obj->maps[] array + */ + insn->imm = MAP_LDIMM64_POISON_BASE + map_idx; + + insn++; + } +} + +/* Relocate data references within program code: + * - map references; + * - global variable references; + * - extern references. + */ +static int +bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) +{ + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + const struct bpf_map *map; + struct extern_desc *ext; + + switch (relo->type) { + case RELO_LD64: + map = &obj->maps[relo->map_idx]; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_FD; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_DATA: + map = &obj->maps[relo->map_idx]; + insn[1].imm = insn[0].imm + relo->sym_off; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_EXTERN_VAR: + ext = &obj->externs[relo->sym_off]; + if (ext->type == EXT_KCFG) { + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = obj->kconfig_map_idx; + } else { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; + } + insn[1].imm = ext->kcfg.data_off; + } else /* EXT_KSYM */ { + if (ext->ksym.type_id && ext->is_set) { /* typed ksyms */ + insn[0].src_reg = BPF_PSEUDO_BTF_ID; + insn[0].imm = ext->ksym.kernel_btf_id; + insn[1].imm = ext->ksym.kernel_btf_obj_fd; + } else { /* typeless ksyms or unresolved typed ksyms */ + insn[0].imm = (__u32)ext->ksym.addr; + insn[1].imm = ext->ksym.addr >> 32; + } + } + break; + case RELO_EXTERN_FUNC: + ext = &obj->externs[relo->sym_off]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + if (ext->is_set) { + insn[0].imm = ext->ksym.kernel_btf_id; + insn[0].off = ext->ksym.btf_fd_idx; + } else { /* unresolved weak kfunc */ + insn[0].imm = 0; + insn[0].off = 0; + } + break; + case RELO_SUBPROG_ADDR: + if (insn[0].src_reg != BPF_PSEUDO_FUNC) { + pr_warn("prog '%s': relo #%d: bad insn\n", + prog->name, i); + return -EINVAL; + } + /* handled already */ + break; + case RELO_CALL: + /* handled already */ + break; + case RELO_CORE: + /* will be handled by bpf_program_record_relos() */ + break; + default: + pr_warn("prog '%s': relo #%d: bad relo type %d\n", + prog->name, i, relo->type); + return -EINVAL; + } + } + + return 0; +} + +static int adjust_prog_btf_ext_info(const struct bpf_object *obj, + const struct bpf_program *prog, + const struct btf_ext_info *ext_info, + void **prog_info, __u32 *prog_rec_cnt, + __u32 *prog_rec_sz) +{ + void *copy_start = NULL, *copy_end = NULL; + void *rec, *rec_end, *new_prog_info; + const struct btf_ext_info_sec *sec; + size_t old_sz, new_sz; + int i, sec_num, sec_idx, off_adj; + + sec_num = 0; + for_each_btf_ext_sec(ext_info, sec) { + sec_idx = ext_info->sec_idxs[sec_num]; + sec_num++; + if (prog->sec_idx != sec_idx) + continue; + + for_each_btf_ext_rec(ext_info, sec, i, rec) { + __u32 insn_off = *(__u32 *)rec / BPF_INSN_SZ; + + if (insn_off < prog->sec_insn_off) + continue; + if (insn_off >= prog->sec_insn_off + prog->sec_insn_cnt) + break; + + if (!copy_start) + copy_start = rec; + copy_end = rec + ext_info->rec_size; + } + + if (!copy_start) + return -ENOENT; + + /* append func/line info of a given (sub-)program to the main + * program func/line info + */ + old_sz = (size_t)(*prog_rec_cnt) * ext_info->rec_size; + new_sz = old_sz + (copy_end - copy_start); + new_prog_info = realloc(*prog_info, new_sz); + if (!new_prog_info) + return -ENOMEM; + *prog_info = new_prog_info; + *prog_rec_cnt = new_sz / ext_info->rec_size; + memcpy(new_prog_info + old_sz, copy_start, copy_end - copy_start); + + /* Kernel instruction offsets are in units of 8-byte + * instructions, while .BTF.ext instruction offsets generated + * by Clang are in units of bytes. So convert Clang offsets + * into kernel offsets and adjust offset according to program + * relocated position. + */ + off_adj = prog->sub_insn_off - prog->sec_insn_off; + rec = new_prog_info + old_sz; + rec_end = new_prog_info + new_sz; + for (; rec < rec_end; rec += ext_info->rec_size) { + __u32 *insn_off = rec; + + *insn_off = *insn_off / BPF_INSN_SZ + off_adj; + } + *prog_rec_sz = ext_info->rec_size; + return 0; + } + + return -ENOENT; +} + +static int +reloc_prog_func_and_line_info(const struct bpf_object *obj, + struct bpf_program *main_prog, + const struct bpf_program *prog) +{ + int err; + + /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't + * supprot func/line info + */ + if (!obj->btf_ext || !kernel_supports(obj, FEAT_BTF_FUNC)) + return 0; + + /* only attempt func info relocation if main program's func_info + * relocation was successful + */ + if (main_prog != prog && !main_prog->func_info) + goto line_info; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->func_info, + &main_prog->func_info, + &main_prog->func_info_cnt, + &main_prog->func_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext function info: %d\n", + prog->name, err); + return err; + } + if (main_prog->func_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext function info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext function info for the main program, skipping all of .BTF.ext func info.\n", + prog->name); + } + +line_info: + /* don't relocate line info if main program's relocation failed */ + if (main_prog != prog && !main_prog->line_info) + return 0; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->line_info, + &main_prog->line_info, + &main_prog->line_info_cnt, + &main_prog->line_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext line info: %d\n", + prog->name, err); + return err; + } + if (main_prog->line_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext line info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext line info for the main program, skipping all of .BTF.ext line info.\n", + prog->name); + } + return 0; +} + +static int cmp_relo_by_insn_idx(const void *key, const void *elem) +{ + size_t insn_idx = *(const size_t *)key; + const struct reloc_desc *relo = elem; + + if (insn_idx == relo->insn_idx) + return 0; + return insn_idx < relo->insn_idx ? -1 : 1; +} + +static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) +{ + if (!prog->nr_reloc) + return NULL; + return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, + sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); +} + +static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_program *subprog) +{ + int new_cnt = main_prog->nr_reloc + subprog->nr_reloc; + struct reloc_desc *relos; + int i; + + if (main_prog == subprog) + return 0; + relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); + if (!relos) + return -ENOMEM; + if (subprog->nr_reloc) + memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, + sizeof(*relos) * subprog->nr_reloc); + + for (i = main_prog->nr_reloc; i < new_cnt; i++) + relos[i].insn_idx += subprog->sub_insn_off; + /* After insn_idx adjustment the 'relos' array is still sorted + * by insn_idx and doesn't break bsearch. + */ + main_prog->reloc_desc = relos; + main_prog->nr_reloc = new_cnt; + return 0; +} + +static int +bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *prog) +{ + size_t sub_insn_idx, insn_idx, new_cnt; + struct bpf_program *subprog; + struct bpf_insn *insns, *insn; + struct reloc_desc *relo; + int err; + + err = reloc_prog_func_and_line_info(obj, main_prog, prog); + if (err) + return err; + + for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn)) + continue; + + relo = find_prog_insn_relo(prog, insn_idx); + if (relo && relo->type == RELO_EXTERN_FUNC) + /* kfunc relocations will be handled later + * in bpf_object__relocate_data() + */ + continue; + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { + pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", + prog->name, insn_idx, relo->type); + return -LIBBPF_ERRNO__RELOC; + } + if (relo) { + /* sub-program instruction index is a combination of + * an offset of a symbol pointed to by relocation and + * call instruction's imm field; for global functions, + * call always has imm = -1, but for static functions + * relocation is against STT_SECTION and insn->imm + * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. + */ + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. + */ + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } else { + /* if subprogram call is to a static function within + * the same ELF section, there won't be any relocation + * emitted, but it also means there is no additional + * offset necessary, insns->imm is relative to + * instruction's original position within the section + */ + sub_insn_idx = prog->sec_insn_off + insn_idx + insn->imm + 1; + } + + /* we enforce that sub-programs should be in .text section */ + subprog = find_prog_by_sec_insn(obj, obj->efile.text_shndx, sub_insn_idx); + if (!subprog) { + pr_warn("prog '%s': no .text section found yet sub-program call exists\n", + prog->name); + return -LIBBPF_ERRNO__RELOC; + } + + /* if it's the first call instruction calling into this + * subprogram (meaning this subprog hasn't been processed + * yet) within the context of current main program: + * - append it at the end of main program's instructions blog; + * - process is recursively, while current program is put on hold; + * - if that subprogram calls some other not yet processes + * subprogram, same thing will happen recursively until + * there are no more unprocesses subprograms left to append + * and relocate. + */ + if (subprog->sub_insn_off == 0) { + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + /* The subprog insns are now appended. Append its relos too. */ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + err = bpf_object__reloc_code(obj, main_prog, subprog); + if (err) + return err; + } + + /* main_prog->insns memory could have been re-allocated, so + * calculate pointer again + */ + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + /* calculate correct instruction position within current main + * prog; each main prog can have a different set of + * subprograms appended (potentially in different order as + * well), so position of any subprog can be different for + * different main programs + */ + insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; + + pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", + prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); + } + + return 0; +} + +/* + * Relocate sub-program calls. + * + * Algorithm operates as follows. Each entry-point BPF program (referred to as + * main prog) is processed separately. For each subprog (non-entry functions, + * that can be called from either entry progs or other subprogs) gets their + * sub_insn_off reset to zero. This serves as indicator that this subprogram + * hasn't been yet appended and relocated within current main prog. Once its + * relocated, sub_insn_off will point at the position within current main prog + * where given subprog was appended. This will further be used to relocate all + * the call instructions jumping into this subprog. + * + * We start with main program and process all call instructions. If the call + * is into a subprog that hasn't been processed (i.e., subprog->sub_insn_off + * is zero), subprog instructions are appended at the end of main program's + * instruction array. Then main program is "put on hold" while we recursively + * process newly appended subprogram. If that subprogram calls into another + * subprogram that hasn't been appended, new subprogram is appended again to + * the *main* prog's instructions (subprog's instructions are always left + * untouched, as they need to be in unmodified state for subsequent main progs + * and subprog instructions are always sent only as part of a main prog) and + * the process continues recursively. Once all the subprogs called from a main + * prog or any of its subprogs are appended (and relocated), all their + * positions within finalized instructions array are known, so it's easy to + * rewrite call instructions with correct relative offsets, corresponding to + * desired target subprog. + * + * Its important to realize that some subprogs might not be called from some + * main prog and any of its called/used subprogs. Those will keep their + * subprog->sub_insn_off as zero at all times and won't be appended to current + * main prog and won't be relocated within the context of current main prog. + * They might still be used from other main progs later. + * + * Visually this process can be shown as below. Suppose we have two main + * programs mainA and mainB and BPF object contains three subprogs: subA, + * subB, and subC. mainA calls only subA, mainB calls only subC, but subA and + * subC both call subB: + * + * +--------+ +-------+ + * | v v | + * +--+---+ +--+-+-+ +---+--+ + * | subA | | subB | | subC | + * +--+---+ +------+ +---+--+ + * ^ ^ + * | | + * +---+-------+ +------+----+ + * | mainA | | mainB | + * +-----------+ +-----------+ + * + * We'll start relocating mainA, will find subA, append it and start + * processing sub A recursively: + * + * +-----------+------+ + * | mainA | subA | + * +-----------+------+ + * + * At this point we notice that subB is used from subA, so we append it and + * relocate (there are no further subcalls from subB): + * + * +-----------+------+------+ + * | mainA | subA | subB | + * +-----------+------+------+ + * + * At this point, we relocate subA calls, then go one level up and finish with + * relocatin mainA calls. mainA is done. + * + * For mainB process is similar but results in different order. We start with + * mainB and skip subA and subB, as mainB never calls them (at least + * directly), but we see subC is needed, so we append and start processing it: + * + * +-----------+------+ + * | mainB | subC | + * +-----------+------+ + * Now we see subC needs subB, so we go back to it, append and relocate it: + * + * +-----------+------+------+ + * | mainB | subC | subB | + * +-----------+------+------+ + * + * At this point we unwind recursion, relocate calls in subC, then in mainB. + */ +static int +bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_program *subprog; + int i, err; + + /* mark all subprogs as not relocated (yet) within the context of + * current main program + */ + for (i = 0; i < obj->nr_programs; i++) { + subprog = &obj->programs[i]; + if (!prog_is_subprog(obj, subprog)) + continue; + + subprog->sub_insn_off = 0; + } + + err = bpf_object__reloc_code(obj, prog, prog); + if (err) + return err; + + return 0; +} + +static void +bpf_object__free_relocs(struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* free up relocation descriptors */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + } +} + +static int cmp_relocs(const void *_a, const void *_b) +{ + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; + + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + return 0; +} + +static void bpf_object__sort_relos(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } +} + +static int +bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) +{ + struct bpf_program *prog; + size_t i, j; + int err; + + if (obj->btf_ext) { + err = bpf_object__relocate_core(obj, targ_btf_path); + if (err) { + pr_warn("failed to perform CO-RE relocations: %d\n", + err); + return err; + } + bpf_object__sort_relos(obj); + } + + /* Before relocating calls pre-process relocations and mark + * few ld_imm64 instructions that points to subprogs. + * Otherwise bpf_object__reloc_code() later would have to consider + * all ld_imm64 insns as relocation candidates. That would + * reduce relocation speed, since amount of find_prog_insn_relo() + * would increase and most of them will fail to find a relo. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + for (j = 0; j < prog->nr_reloc; j++) { + struct reloc_desc *relo = &prog->reloc_desc[j]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + + /* mark the insn, so it's recognized by insn_is_pseudo_func() */ + if (relo->type == RELO_SUBPROG_ADDR) + insn[0].src_reg = BPF_PSEUDO_FUNC; + } + } + + /* relocate subprogram calls and append used subprograms to main + * programs; each copy of subprogram code needs to be relocated + * differently for each main program, because its code location might + * have changed. + * Append subprog relos to main programs to allow data relos to be + * processed after text is completely relocated. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + /* sub-program's sub-calls are relocated within the context of + * its main program only + */ + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + + err = bpf_object__relocate_calls(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate calls: %d\n", + prog->name, err); + return err; + } + } + /* Process data relos for main programs */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + err = bpf_object__relocate_data(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate data references: %d\n", + prog->name, err); + return err; + } + } + + return 0; +} + +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data); + +static int bpf_object__collect_map_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data) +{ + const int bpf_ptr_sz = 8, host_ptr_sz = sizeof(void *); + int i, j, nrels, new_sz; + const struct btf_var_secinfo *vi = NULL; + const struct btf_type *sec, *var, *def; + struct bpf_map *map = NULL, *targ_map = NULL; + struct bpf_program *targ_prog = NULL; + bool is_prog_array, is_map_in_map; + const struct btf_member *member; + const char *name, *mname, *type; + unsigned int moff; + Elf64_Sym *sym; + Elf64_Rel *rel; + void *tmp; + + if (!obj->efile.btf_maps_sec_btf_id || !obj->btf) + return -EINVAL; + sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id); + if (!sec) + return -EINVAL; + + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn(".maps relo #%d: failed to get ELF relo\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn(".maps relo #%d: symbol %zx not found\n", + i, (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + name = elf_sym_str(obj, sym->st_name) ?: "<?>"; + + pr_debug(".maps relo #%d: for %zd value %zd rel->r_offset %zu name %d ('%s')\n", + i, (ssize_t)(rel->r_info >> 32), (size_t)sym->st_value, + (size_t)rel->r_offset, sym->st_name, name); + + for (j = 0; j < obj->nr_maps; j++) { + map = &obj->maps[j]; + if (map->sec_idx != obj->efile.btf_maps_shndx) + continue; + + vi = btf_var_secinfos(sec) + map->btf_var_idx; + if (vi->offset <= rel->r_offset && + rel->r_offset + bpf_ptr_sz <= vi->offset + vi->size) + break; + } + if (j == obj->nr_maps) { + pr_warn(".maps relo #%d: cannot find map '%s' at rel->r_offset %zu\n", + i, name, (size_t)rel->r_offset); + return -EINVAL; + } + + is_map_in_map = bpf_map_type__is_map_in_map(map->def.type); + is_prog_array = map->def.type == BPF_MAP_TYPE_PROG_ARRAY; + type = is_map_in_map ? "map" : "prog"; + if (is_map_in_map) { + if (sym->st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) { + pr_warn(".maps relo #%d: '%s' isn't a valid map reference\n", + i, name); + return -ESRCH; + } + } else if (is_prog_array) { + targ_prog = bpf_object__find_program_by_name(obj, name); + if (!targ_prog) { + pr_warn(".maps relo #%d: '%s' isn't a valid program reference\n", + i, name); + return -ESRCH; + } + if (targ_prog->sec_idx != sym->st_shndx || + targ_prog->sec_insn_off * 8 != sym->st_value || + prog_is_subprog(obj, targ_prog)) { + pr_warn(".maps relo #%d: '%s' isn't an entry-point program\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + } else { + return -EINVAL; + } + + var = btf__type_by_id(obj->btf, vi->type); + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (btf_vlen(def) == 0) + return -EINVAL; + member = btf_members(def) + btf_vlen(def) - 1; + mname = btf__name_by_offset(obj->btf, member->name_off); + if (strcmp(mname, "values")) + return -EINVAL; + + moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8; + if (rel->r_offset - vi->offset < moff) + return -EINVAL; + + moff = rel->r_offset - vi->offset - moff; + /* here we use BPF pointer size, which is always 64 bit, as we + * are parsing ELF that was built for BPF target + */ + if (moff % bpf_ptr_sz) + return -EINVAL; + moff /= bpf_ptr_sz; + if (moff >= map->init_slots_sz) { + new_sz = moff + 1; + tmp = libbpf_reallocarray(map->init_slots, new_sz, host_ptr_sz); + if (!tmp) + return -ENOMEM; + map->init_slots = tmp; + memset(map->init_slots + map->init_slots_sz, 0, + (new_sz - map->init_slots_sz) * host_ptr_sz); + map->init_slots_sz = new_sz; + } + map->init_slots[moff] = is_map_in_map ? (void *)targ_map : (void *)targ_prog; + + pr_debug(".maps relo #%d: map '%s' slot [%d] points to %s '%s'\n", + i, map->name, moff, type, name); + } + + return 0; +} + +static int bpf_object__collect_relos(struct bpf_object *obj) +{ + int i, err; + + for (i = 0; i < obj->efile.sec_cnt; i++) { + struct elf_sec_desc *sec_desc = &obj->efile.secs[i]; + Elf64_Shdr *shdr; + Elf_Data *data; + int idx; + + if (sec_desc->sec_type != SEC_RELO) + continue; + + shdr = sec_desc->shdr; + data = sec_desc->data; + idx = shdr->sh_info; + + if (shdr->sh_type != SHT_REL) { + pr_warn("internal error at %d\n", __LINE__); + return -LIBBPF_ERRNO__INTERNAL; + } + + if (idx == obj->efile.st_ops_shndx) + err = bpf_object__collect_st_ops_relos(obj, shdr, data); + else if (idx == obj->efile.btf_maps_shndx) + err = bpf_object__collect_map_relos(obj, shdr, data); + else + err = bpf_object__collect_prog_relos(obj, shdr, data); + if (err) + return err; + } + + bpf_object__sort_relos(obj); + return 0; +} + +static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id) +{ + if (BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == 0 && + insn->dst_reg == 0) { + *func_id = insn->imm; + return true; + } + return false; +} + +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_insn *insn = prog->insns; + enum bpf_func_id func_id; + int i; + + if (obj->gen_loader) + return 0; + + for (i = 0; i < prog->insns_cnt; i++, insn++) { + if (!insn_is_helper_call(insn, &func_id)) + continue; + + /* on kernels that don't yet support + * bpf_probe_read_{kernel,user}[_str] helpers, fall back + * to bpf_probe_read() which works well for old kernels + */ + switch (func_id) { + case BPF_FUNC_probe_read_kernel: + case BPF_FUNC_probe_read_user: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read; + break; + case BPF_FUNC_probe_read_kernel_str: + case BPF_FUNC_probe_read_user_str: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read_str; + break; + default: + break; + } + } + return 0; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id); + +/* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */ +static int libbpf_prepare_prog_load(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie) +{ + enum sec_def_flags def = cookie; + + /* old kernels might not support specifying expected_attach_type */ + if ((def & SEC_EXP_ATTACH_OPT) && !kernel_supports(prog->obj, FEAT_EXP_ATTACH_TYPE)) + opts->expected_attach_type = 0; + + if (def & SEC_SLEEPABLE) + opts->prog_flags |= BPF_F_SLEEPABLE; + + if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) + opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; + + if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { + int btf_obj_fd = 0, btf_type_id = 0, err; + const char *attach_name; + + attach_name = strchr(prog->sec_name, '/'); + if (!attach_name) { + /* if BPF program is annotated with just SEC("fentry") + * (or similar) without declaratively specifying + * target, then it is expected that target will be + * specified with bpf_program__set_attach_target() at + * runtime before BPF object load step. If not, then + * there is nothing to load into the kernel as BPF + * verifier won't be able to validate BPF program + * correctness anyways. + */ + pr_warn("prog '%s': no BTF-based attach target is specified, use bpf_program__set_attach_target()\n", + prog->name); + return -EINVAL; + } + attach_name++; /* skip over / */ + + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id); + if (err) + return err; + + /* cache resolved BTF FD and BTF type ID in the prog */ + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_btf_id = btf_type_id; + + /* but by now libbpf common logic is not utilizing + * prog->atach_btf_obj_fd/prog->attach_btf_id anymore because + * this callback is called after opts were populated by + * libbpf, so this callback has to update opts explicitly here + */ + opts->attach_btf_obj_fd = btf_obj_fd; + opts->attach_btf_id = btf_type_id; + } + return 0; +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); + +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_attr); + const char *prog_name = NULL; + char *cp, errmsg[STRERR_BUFSIZE]; + size_t log_buf_size = 0; + char *log_buf = NULL, *tmp; + int btf_fd, ret, err; + bool own_log_buf = true; + __u32 log_level = prog->log_level; + + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* + * The program type must be set. Most likely we couldn't find a proper + * section definition at load time, and thus we didn't infer the type. + */ + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + if (!insns || !insns_cnt) + return -EINVAL; + + load_attr.expected_attach_type = prog->expected_attach_type; + if (kernel_supports(obj, FEAT_PROG_NAME)) + prog_name = prog->name; + load_attr.attach_prog_fd = prog->attach_prog_fd; + load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.kern_version = kern_version; + load_attr.prog_ifindex = prog->prog_ifindex; + + /* specify func_info/line_info only if kernel supports them */ + btf_fd = bpf_object__btf_fd(obj); + if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf_fd; + load_attr.func_info = prog->func_info; + load_attr.func_info_rec_size = prog->func_info_rec_size; + load_attr.func_info_cnt = prog->func_info_cnt; + load_attr.line_info = prog->line_info; + load_attr.line_info_rec_size = prog->line_info_rec_size; + load_attr.line_info_cnt = prog->line_info_cnt; + } + load_attr.log_level = log_level; + load_attr.prog_flags = prog->prog_flags; + load_attr.fd_array = obj->fd_array; + + /* adjust load_attr if sec_def provides custom preload callback */ + if (prog->sec_def && prog->sec_def->prog_prepare_load_fn) { + err = prog->sec_def->prog_prepare_load_fn(prog, &load_attr, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to prepare load attributes: %d\n", + prog->name, err); + return err; + } + insns = prog->insns; + insns_cnt = prog->insns_cnt; + } + + if (obj->gen_loader) { + bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name, + license, insns, insns_cnt, &load_attr, + prog - obj->programs); + *prog_fd = -1; + return 0; + } + +retry_load: + /* if log_level is zero, we don't request logs initially even if + * custom log_buf is specified; if the program load fails, then we'll + * bump log_level to 1 and use either custom log_buf or we'll allocate + * our own and retry the load to get details on what failed + */ + if (log_level) { + if (prog->log_buf) { + log_buf = prog->log_buf; + log_buf_size = prog->log_size; + own_log_buf = false; + } else if (obj->log_buf) { + log_buf = obj->log_buf; + log_buf_size = obj->log_size; + own_log_buf = false; + } else { + log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size * 2); + tmp = realloc(log_buf, log_buf_size); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + log_buf = tmp; + log_buf[0] = '\0'; + own_log_buf = true; + } + } + + load_attr.log_buf = log_buf; + load_attr.log_size = log_buf_size; + load_attr.log_level = log_level; + + ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); + if (ret >= 0) { + if (log_level && own_log_buf) { + pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + + if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) { + struct bpf_map *map; + int i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &prog->obj->maps[i]; + if (map->libbpf_type != LIBBPF_MAP_RODATA) + continue; + + if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to bind map '%s': %s\n", + prog->name, map->real_name, cp); + /* Don't fail hard if can't bind rodata. */ + } + } + } + + *prog_fd = ret; + ret = 0; + goto out; + } + + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* On ENOSPC, increase log buffer size and retry, unless custom + * log_buf is specified. + * Be careful to not overflow u32, though. Kernel's log buf size limit + * isn't part of UAPI so it can always be bumped to full 4GB. So don't + * multiply by 2 unless we are sure we'll fit within 32 bits. + * Currently, we'll get -EINVAL when we reach (UINT_MAX >> 2). + */ + if (own_log_buf && errno == ENOSPC && log_buf_size <= UINT_MAX / 2) + goto retry_load; + + ret = -errno; + + /* post-process verifier log to improve error descriptions */ + fixup_verifier_log(prog, log_buf, log_buf_size); + + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); + pr_perm_msg(ret); + + if (own_log_buf && log_buf && log_buf[0] != '\0') { + pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + +out: + if (own_log_buf) + free(log_buf); + return ret; +} + +static char *find_prev_line(char *buf, char *cur) +{ + char *p; + + if (cur == buf) /* end of a log buf */ + return NULL; + + p = cur - 1; + while (p - 1 >= buf && *(p - 1) != '\n') + p--; + + return p; +} + +static void patch_log(char *buf, size_t buf_sz, size_t log_sz, + char *orig, size_t orig_sz, const char *patch) +{ + /* size of the remaining log content to the right from the to-be-replaced part */ + size_t rem_sz = (buf + log_sz) - (orig + orig_sz); + size_t patch_sz = strlen(patch); + + if (patch_sz != orig_sz) { + /* If patch line(s) are longer than original piece of verifier log, + * shift log contents by (patch_sz - orig_sz) bytes to the right + * starting from after to-be-replaced part of the log. + * + * If patch line(s) are shorter than original piece of verifier log, + * shift log contents by (orig_sz - patch_sz) bytes to the left + * starting from after to-be-replaced part of the log + * + * We need to be careful about not overflowing available + * buf_sz capacity. If that's the case, we'll truncate the end + * of the original log, as necessary. + */ + if (patch_sz > orig_sz) { + if (orig + patch_sz >= buf + buf_sz) { + /* patch is big enough to cover remaining space completely */ + patch_sz -= (orig + patch_sz) - (buf + buf_sz) + 1; + rem_sz = 0; + } else if (patch_sz - orig_sz > buf_sz - log_sz) { + /* patch causes part of remaining log to be truncated */ + rem_sz -= (patch_sz - orig_sz) - (buf_sz - log_sz); + } + } + /* shift remaining log to the right by calculated amount */ + memmove(orig + patch_sz, orig + orig_sz, rem_sz); + } + + memcpy(orig, patch, patch_sz); +} + +static void fixup_log_failed_core_relo(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded CO-RE relocation: + * line1 -> 123: (85) call unknown#195896080 + * line2 -> invalid func unknown#195896080 + * line3 -> <anything else or end of buffer> + * + * "123" is the index of the instruction that was poisoned. We extract + * instruction index to find corresponding CO-RE relocation and + * replace this part of the log with more relevant information about + * failed CO-RE relocation. + */ + const struct bpf_core_relo *relo; + struct bpf_core_spec spec; + char patch[512], spec_buf[256]; + int insn_idx, err, spec_len; + + if (sscanf(line1, "%d: (%*d) call unknown#195896080\n", &insn_idx) != 1) + return; + + relo = find_relo_core(prog, insn_idx); + if (!relo) + return; + + err = bpf_core_parse_spec(prog->name, prog->obj->btf, relo, &spec); + if (err) + return; + + spec_len = bpf_core_format_spec(spec_buf, sizeof(spec_buf), &spec); + snprintf(patch, sizeof(patch), + "%d: <invalid CO-RE relocation>\n" + "failed to resolve CO-RE relocation %s%s\n", + insn_idx, spec_buf, spec_len >= sizeof(spec_buf) ? "..." : ""); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_map_load(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded CO-RE relocation: + * line1 -> 123: (85) call unknown#2001000345 + * line2 -> invalid func unknown#2001000345 + * line3 -> <anything else or end of buffer> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2001000345" are map index in obj->maps to fetch map name. + */ + struct bpf_object *obj = prog->obj; + const struct bpf_map *map; + int insn_idx, map_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &map_idx) != 2) + return; + + map_idx -= MAP_LDIMM64_POISON_BASE; + if (map_idx < 0 || map_idx >= obj->nr_maps) + return; + map = &obj->maps[map_idx]; + + snprintf(patch, sizeof(patch), + "%d: <invalid BPF map reference>\n" + "BPF map '%s' is referenced but wasn't created\n", + insn_idx, map->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) +{ + /* look for familiar error patterns in last N lines of the log */ + const size_t max_last_line_cnt = 10; + char *prev_line, *cur_line, *next_line; + size_t log_sz; + int i; + + if (!buf) + return; + + log_sz = strlen(buf) + 1; + next_line = buf + log_sz - 1; + + for (i = 0; i < max_last_line_cnt; i++, next_line = cur_line) { + cur_line = find_prev_line(buf, next_line); + if (!cur_line) + return; + + /* failed CO-RE relocation case */ + if (str_has_pfx(cur_line, "invalid func unknown#195896080\n")) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + fixup_log_failed_core_relo(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"MAP_LDIMM64_POISON_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + fixup_log_missing_map_load(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } + } +} + +static int bpf_program_record_relos(struct bpf_program *prog) +{ + struct bpf_object *obj = prog->obj; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct extern_desc *ext = &obj->externs[relo->sym_off]; + + switch (relo->type) { + case RELO_EXTERN_VAR: + if (ext->type != EXT_KSYM) + continue; + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, !ext->ksym.type_id, + BTF_KIND_VAR, relo->insn_idx); + break; + case RELO_EXTERN_FUNC: + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, false, BTF_KIND_FUNC, + relo->insn_idx); + break; + case RELO_CORE: { + struct bpf_core_relo cr = { + .insn_off = relo->insn_idx * 8, + .type_id = relo->core_relo->type_id, + .access_str_off = relo->core_relo->access_str_off, + .kind = relo->core_relo->kind, + }; + + bpf_gen__record_relo_core(obj->gen_loader, &cr); + break; + } + default: + continue; + } + } + return 0; +} + +static int +bpf_object__load_progs(struct bpf_object *obj, int log_level) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + err = bpf_object__sanitize_prog(obj, prog); + if (err) + return err; + } + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) { + pr_debug("prog '%s': skipped loading\n", prog->name); + continue; + } + prog->log_level |= log_level; + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); + return err; + } + } + + bpf_object__free_relocs(obj); + return 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name); + +static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts) +{ + struct bpf_program *prog; + int err; + + bpf_object__for_each_program(prog, obj) { + prog->sec_def = find_sec_def(prog->sec_name); + if (!prog->sec_def) { + /* couldn't guess, but user might manually specify */ + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", + prog->name, prog->sec_name); + continue; + } + + prog->type = prog->sec_def->prog_type; + prog->expected_attach_type = prog->sec_def->expected_attach_type; + + /* sec_def can have custom callback which should be called + * after bpf_program is initialized to adjust its properties + */ + if (prog->sec_def->prog_setup_fn) { + err = prog->sec_def->prog_setup_fn(prog, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to initialize: %d\n", + prog->name, err); + return err; + } + } + } + + return 0; +} + +static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + const char *obj_name, *kconfig, *btf_tmp_path; + struct bpf_object *obj; + char tmp_name[64]; + int err; + char *log_buf; + size_t log_size; + __u32 log_level; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", + path ? : "(mem buf)"); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + if (!OPTS_VALID(opts, bpf_object_open_opts)) + return ERR_PTR(-EINVAL); + + obj_name = OPTS_GET(opts, object_name, NULL); + if (obj_buf) { + if (!obj_name) { + snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", + (unsigned long)obj_buf, + (unsigned long)obj_buf_sz); + obj_name = tmp_name; + } + path = obj_name; + pr_debug("loading object '%s' from buffer\n", obj_name); + } + + log_buf = OPTS_GET(opts, kernel_log_buf, NULL); + log_size = OPTS_GET(opts, kernel_log_size, 0); + log_level = OPTS_GET(opts, kernel_log_level, 0); + if (log_size > UINT_MAX) + return ERR_PTR(-EINVAL); + if (log_size && !log_buf) + return ERR_PTR(-EINVAL); + + obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); + if (IS_ERR(obj)) + return obj; + + obj->log_buf = log_buf; + obj->log_size = log_size; + obj->log_level = log_level; + + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); + if (btf_tmp_path) { + if (strlen(btf_tmp_path) >= PATH_MAX) { + err = -ENAMETOOLONG; + goto out; + } + obj->btf_custom_path = strdup(btf_tmp_path); + if (!obj->btf_custom_path) { + err = -ENOMEM; + goto out; + } + } + + kconfig = OPTS_GET(opts, kconfig, NULL); + if (kconfig) { + obj->kconfig = strdup(kconfig); + if (!obj->kconfig) { + err = -ENOMEM; + goto out; + } + } + + err = bpf_object__elf_init(obj); + err = err ? : bpf_object__check_endianness(obj); + err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__collect_externs(obj); + err = err ? : bpf_object_fixup_btf(obj); + err = err ? : bpf_object__init_maps(obj, opts); + err = err ? : bpf_object_init_progs(obj, opts); + err = err ? : bpf_object__collect_relos(obj); + if (err) + goto out; + + bpf_object__elf_finish(obj); + + return obj; +out: + bpf_object__close(obj); + return ERR_PTR(err); +} + +struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) +{ + if (!path) + return libbpf_err_ptr(-EINVAL); + + pr_debug("loading %s\n", path); + + return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); +} + +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + +struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + if (!obj_buf || obj_buf_sz == 0) + return libbpf_err_ptr(-EINVAL); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); +} + +static int bpf_object_unload(struct bpf_object *obj) +{ + size_t i; + + if (!obj) + return libbpf_err(-EINVAL); + + for (i = 0; i < obj->nr_maps; i++) { + zclose(obj->maps[i].fd); + if (obj->maps[i].st_ops) + zfree(&obj->maps[i].st_ops->kern_vdata); + } + + for (i = 0; i < obj->nr_programs; i++) + bpf_program__unload(&obj->programs[i]); + + return 0; +} + +static int bpf_object__sanitize_maps(struct bpf_object *obj) +{ + struct bpf_map *m; + + bpf_object__for_each_map(m, obj) { + if (!bpf_map__is_internal(m)) + continue; + if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) + m->def.map_flags ^= BPF_F_MMAPABLE; + } + + return 0; +} + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +{ + char sym_type, sym_name[500]; + unsigned long long sym_addr; + int ret, err = 0; + FILE *f; + + f = fopen("/proc/kallsyms", "r"); + if (!f) { + err = -errno; + pr_warn("failed to open /proc/kallsyms: %d\n", err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %c %499s%*[^\n]\n", + &sym_addr, &sym_type, sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 3) { + pr_warn("failed to read kallsyms entry: %d\n", ret); + err = -EINVAL; + break; + } + + err = cb(sym_addr, sym_type, sym_name, ctx); + if (err) + break; + } + + fclose(f); + return err; +} + +static int kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct bpf_object *obj = ctx; + const struct btf_type *t; + struct extern_desc *ext; + + ext = find_extern_by_name(obj, sym_name); + if (!ext || ext->type != EXT_KSYM) + return 0; + + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + return 0; + + if (ext->is_set && ext->ksym.addr != sym_addr) { + pr_warn("extern (ksym) '%s': resolution is ambiguous: 0x%llx or 0x%llx\n", + sym_name, ext->ksym.addr, sym_addr); + return -EINVAL; + } + if (!ext->is_set) { + ext->is_set = true; + ext->ksym.addr = sym_addr; + pr_debug("extern (ksym) '%s': set to 0x%llx\n", sym_name, sym_addr); + } + return 0; +} + +static int bpf_object__read_kallsyms_file(struct bpf_object *obj) +{ + return libbpf_kallsyms_parse(kallsyms_cb, obj); +} + +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + struct module_btf **res_mod_btf) +{ + struct module_btf *mod_btf; + struct btf *btf; + int i, id, err; + + btf = obj->btf_vmlinux; + mod_btf = NULL; + id = btf__find_by_name_kind(btf, ksym_name, kind); + + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; + + for (i = 0; i < obj->btf_module_cnt; i++) { + /* we assume module_btf's BTF FD is always >0 */ + mod_btf = &obj->btf_modules[i]; + btf = mod_btf->btf; + id = btf__find_by_name_kind_own(btf, ksym_name, kind); + if (id != -ENOENT) + break; + } + } + if (id <= 0) + return -ESRCH; + + *res_btf = btf; + *res_mod_btf = mod_btf; + return id; +} + +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + struct module_btf *mod_btf = NULL; + const char *targ_var_name; + struct btf *btf = NULL; + int id, err; + + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &mod_btf); + if (id < 0) { + if (id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (var ksym) '%s': not found in kernel BTF\n", + ext->name); + return id; + } + + /* find local type_id */ + local_type_id = ext->ksym.type_id; + + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); + + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if (err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); + + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = mod_btf ? mod_btf->fd : 0; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + + return 0; +} + +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + struct module_btf *mod_btf = NULL; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, &kern_btf, &mod_btf); + if (kfunc_id < 0) { + if (kfunc_id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (func ksym) '%s': not found in kernel or module BTFs\n", + ext->name); + return kfunc_id; + } + + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", + ext->name, local_func_proto_id, kfunc_proto_id); + return -EINVAL; + } + + /* set index for module BTF fd in fd_array, if unset */ + if (mod_btf && !mod_btf->fd_array_idx) { + /* insn->off is s16 */ + if (obj->fd_array_cnt == INT16_MAX) { + pr_warn("extern (func ksym) '%s': module BTF fd index %d too big to fit in bpf_insn offset\n", + ext->name, mod_btf->fd_array_idx); + return -E2BIG; + } + /* Cannot use index 0 for module BTF fd */ + if (!obj->fd_array_cnt) + obj->fd_array_cnt = 1; + + ret = libbpf_ensure_mem((void **)&obj->fd_array, &obj->fd_array_cap, sizeof(int), + obj->fd_array_cnt + 1); + if (ret) + return ret; + mod_btf->fd_array_idx = obj->fd_array_cnt; + /* we assume module BTF FD is always >0 */ + obj->fd_array[obj->fd_array_cnt++] = mod_btf->fd; + } + + ext->is_set = true; + ext->ksym.kernel_btf_id = kfunc_id; + ext->ksym.btf_fd_idx = mod_btf ? mod_btf->fd_array_idx : 0; + pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", + ext->name, kfunc_id); + + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + if (obj->gen_loader) { + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = 0; + ext->ksym.kernel_btf_id = 0; + continue; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); + if (err) + return err; + } + return 0; +} + +static int bpf_object__resolve_externs(struct bpf_object *obj, + const char *extra_kconfig) +{ + bool need_config = false, need_kallsyms = false; + bool need_vmlinux_btf = false; + struct extern_desc *ext; + void *kcfg_data = NULL; + int err, i; + + if (obj->nr_extern == 0) + return 0; + + if (obj->kconfig_map_idx >= 0) + kcfg_data = obj->maps[obj->kconfig_map_idx].mmaped; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (ext->type == EXT_KSYM) { + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; + continue; + } else if (ext->type == EXT_KCFG) { + void *ext_ptr = kcfg_data + ext->kcfg.data_off; + __u64 value = 0; + + /* Kconfig externs need actual /proc/config.gz */ + if (str_has_pfx(ext->name, "CONFIG_")) { + need_config = true; + continue; + } + + /* Virtual kcfg externs are customly handled by libbpf */ + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + value = get_kernel_version(); + if (!value) { + pr_warn("extern (kcfg) '%s': failed to get kernel version\n", ext->name); + return -EINVAL; + } + } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { + value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (strcmp(ext->name, "LINUX_HAS_SYSCALL_WRAPPER") == 0) { + value = kernel_supports(obj, FEAT_SYSCALL_WRAPPER); + } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { + /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed + * __kconfig externs, where LINUX_ ones are virtual and filled out + * customly by libbpf (their values don't come from Kconfig). + * If LINUX_xxx variable is not recognized by libbpf, but is marked + * __weak, it defaults to zero value, just like for CONFIG_xxx + * externs. + */ + pr_warn("extern (kcfg) '%s': unrecognized virtual extern\n", ext->name); + return -EINVAL; + } + + err = set_kcfg_value_num(ext, ext_ptr, value); + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to 0x%llx\n", + ext->name, (long long)value); + } else { + pr_warn("extern '%s': unrecognized extern kind\n", ext->name); + return -EINVAL; + } + } + if (need_config && extra_kconfig) { + err = bpf_object__read_kconfig_mem(obj, extra_kconfig, kcfg_data); + if (err) + return -EINVAL; + need_config = false; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG && !ext->is_set) { + need_config = true; + break; + } + } + } + if (need_config) { + err = bpf_object__read_kconfig_file(obj, kcfg_data); + if (err) + return -EINVAL; + } + if (need_kallsyms) { + err = bpf_object__read_kallsyms_file(obj); + if (err) + return -EINVAL; + } + if (need_vmlinux_btf) { + err = bpf_object__resolve_ksyms_btf_id(obj); + if (err) + return -EINVAL; + } + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (!ext->is_set && !ext->is_weak) { + pr_warn("extern '%s' (strong): not resolved\n", ext->name); + return -ESRCH; + } else if (!ext->is_set) { + pr_debug("extern '%s' (weak): not resolved, defaulting to zero\n", + ext->name); + } + } + + return 0; +} + +static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) +{ + int err, i; + + if (!obj) + return libbpf_err(-EINVAL); + + if (obj->loaded) { + pr_warn("object '%s': load can't be attempted twice\n", obj->name); + return libbpf_err(-EINVAL); + } + + if (obj->gen_loader) + bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps); + + err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__load_progs(obj, extra_log_level); + err = err ? : bpf_object_init_prog_arrays(obj); + + if (obj->gen_loader) { + /* reset FDs */ + if (obj->btf) + btf__set_fd(obj->btf, -1); + for (i = 0; i < obj->nr_maps; i++) + obj->maps[i].fd = -1; + if (!err) + err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); + } + + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + free(obj->btf_modules); + + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; + + obj->loaded = true; /* doesn't matter if successfully or not */ + + if (err) + goto out; + + return 0; +out: + /* unpin any maps that were auto-pinned during load */ + for (i = 0; i < obj->nr_maps; i++) + if (obj->maps[i].pinned && !obj->maps[i].reused) + bpf_map__unpin(&obj->maps[i], NULL); + + bpf_object_unload(obj); + pr_warn("failed to load object '%s'\n", obj->path); + return libbpf_err(err); +} + +int bpf_object__load(struct bpf_object *obj) +{ + return bpf_object_load(obj, 0, NULL); +} + +static int make_parent_dir(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + char *dname, *dir; + int err = 0; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (mkdir(dir, 0700) && errno != EEXIST) + err = -errno; + + free(dname); + if (err) { + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to mkdir %s: %s\n", path, cp); + } + return err; +} + +static int check_path(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct statfs st_fs; + char *dname, *dir; + int err = 0; + + if (path == NULL) + return -EINVAL; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (statfs(dir, &st_fs)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("failed to statfs %s: %s\n", dir, cp); + err = -errno; + } + free(dname); + + if (!err && st_fs.f_type != BPF_FS_MAGIC) { + pr_warn("specified path %s is not on BPF FS\n", path); + err = -EINVAL; + } + + return err; +} + +int bpf_program__pin(struct bpf_program *prog, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + + err = check_path(path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(prog->fd, path)) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); + return libbpf_err(err); + } + + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); + return 0; +} + +int bpf_program__unpin(struct bpf_program *prog, const char *path) +{ + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err) + return libbpf_err(-errno); + + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); + return 0; +} + +int bpf_map__pin(struct bpf_map *map, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_debug("map '%s' already pinned at '%s'; not re-pinning\n", + bpf_map__name(map), map->pin_path); + return 0; + } + } else { + if (!path) { + pr_warn("missing a path to pin map '%s' at\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_warn("map '%s' already pinned\n", bpf_map__name(map)); + return libbpf_err(-EEXIST); + } + + map->pin_path = strdup(path); + if (!map->pin_path) { + err = -errno; + goto out_err; + } + } + + err = make_parent_dir(map->pin_path); + if (err) + return libbpf_err(err); + + err = check_path(map->pin_path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(map->fd, map->pin_path)) { + err = -errno; + goto out_err; + } + + map->pinned = true; + pr_debug("pinned map '%s'\n", map->pin_path); + + return 0; + +out_err: + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to pin map: %s\n", cp); + return libbpf_err(err); +} + +int bpf_map__unpin(struct bpf_map *map, const char *path) +{ + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } + path = map->pin_path; + } else if (!path) { + pr_warn("no path to unpin map '%s' from\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err != 0) + return libbpf_err(-errno); + + map->pinned = false; + pr_debug("unpinned map '%s' from '%s'\n", bpf_map__name(map), path); + + return 0; +} + +int bpf_map__set_pin_path(struct bpf_map *map, const char *path) +{ + char *new = NULL; + + if (path) { + new = strdup(path); + if (!new) + return libbpf_err(-errno); + } + + free(map->pin_path); + map->pin_path = new; + return 0; +} + +__alias(bpf_map__pin_path) +const char *bpf_map__get_pin_path(const struct bpf_map *map); + +const char *bpf_map__pin_path(const struct bpf_map *map) +{ + return map->pin_path; +} + +bool bpf_map__is_pinned(const struct bpf_map *map) +{ + return map->pinned; +} + +static void sanitize_pin_path(char *s) +{ + /* bpffs disallows periods in path names */ + while (*s) { + if (*s == '.') + *s = '_'; + s++; + } +} + +int bpf_object__pin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (!map->autocreate) + continue; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + goto err_unpin_maps; + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__pin(map, pin_path); + if (err) + goto err_unpin_maps; + } + + return 0; + +err_unpin_maps: + while ((map = bpf_object__prev_map(obj, map))) { + if (!map->pin_path) + continue; + + bpf_map__unpin(map, NULL); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__unpin(map, pin_path); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + char buf[PATH_MAX]; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_program(prog, obj) { + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + goto err_unpin_programs; + + err = bpf_program__pin(prog, buf); + if (err) + goto err_unpin_programs; + } + + return 0; + +err_unpin_programs: + while ((prog = bpf_object__prev_program(obj, prog))) { + if (pathname_concat(buf, sizeof(buf), path, prog->name)) + continue; + + bpf_program__unpin(prog, buf); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + bpf_object__for_each_program(prog, obj) { + char buf[PATH_MAX]; + + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); + + err = bpf_program__unpin(prog, buf); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin(struct bpf_object *obj, const char *path) +{ + int err; + + err = bpf_object__pin_maps(obj, path); + if (err) + return libbpf_err(err); + + err = bpf_object__pin_programs(obj, path); + if (err) { + bpf_object__unpin_maps(obj, path); + return libbpf_err(err); + } + + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map) +{ + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + if (map->mmaped) { + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; + } + + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + + zfree(&map->name); + zfree(&map->real_name); + zfree(&map->pin_path); + + if (map->fd >= 0) + zclose(map->fd); +} + +void bpf_object__close(struct bpf_object *obj) +{ + size_t i; + + if (IS_ERR_OR_NULL(obj)) + return; + + usdt_manager_free(obj->usdt_man); + obj->usdt_man = NULL; + + bpf_gen__free(obj->gen_loader); + bpf_object__elf_finish(obj); + bpf_object_unload(obj); + btf__free(obj->btf); + btf_ext__free(obj->btf_ext); + + for (i = 0; i < obj->nr_maps; i++) + bpf_map__destroy(&obj->maps[i]); + + zfree(&obj->btf_custom_path); + zfree(&obj->kconfig); + zfree(&obj->externs); + obj->nr_extern = 0; + + zfree(&obj->maps); + obj->nr_maps = 0; + + if (obj->programs && obj->nr_programs) { + for (i = 0; i < obj->nr_programs; i++) + bpf_program__exit(&obj->programs[i]); + } + zfree(&obj->programs); + + free(obj); +} + +const char *bpf_object__name(const struct bpf_object *obj) +{ + return obj ? obj->name : libbpf_err_ptr(-EINVAL); +} + +unsigned int bpf_object__kversion(const struct bpf_object *obj) +{ + return obj ? obj->kern_version : 0; +} + +struct btf *bpf_object__btf(const struct bpf_object *obj) +{ + return obj ? obj->btf : NULL; +} + +int bpf_object__btf_fd(const struct bpf_object *obj) +{ + return obj->btf ? btf__fd(obj->btf) : -1; +} + +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return libbpf_err(-EINVAL); + + obj->kern_version = kern_version; + + return 0; +} + +int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) +{ + struct bpf_gen *gen; + + if (!opts) + return -EFAULT; + if (!OPTS_VALID(opts, gen_loader_opts)) + return -EINVAL; + gen = calloc(sizeof(*gen), 1); + if (!gen) + return -ENOMEM; + gen->opts = opts; + obj->gen_loader = gen; + return 0; +} + +static struct bpf_program * +__bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, + bool forward) +{ + size_t nr_programs = obj->nr_programs; + ssize_t idx; + + if (!nr_programs) + return NULL; + + if (!p) + /* Iter from the beginning */ + return forward ? &obj->programs[0] : + &obj->programs[nr_programs - 1]; + + if (p->obj != obj) { + pr_warn("error: program handler doesn't match object\n"); + return errno = EINVAL, NULL; + } + + idx = (p - obj->programs) + (forward ? 1 : -1); + if (idx >= obj->nr_programs || idx < 0) + return NULL; + return &obj->programs[idx]; +} + +struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) +{ + struct bpf_program *prog = prev; + + do { + prog = __bpf_program__iter(prog, obj, true); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) +{ + struct bpf_program *prog = next; + + do { + prog = __bpf_program__iter(prog, obj, false); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) +{ + prog->prog_ifindex = ifindex; +} + +const char *bpf_program__name(const struct bpf_program *prog) +{ + return prog->name; +} + +const char *bpf_program__section_name(const struct bpf_program *prog) +{ + return prog->sec_name; +} + +bool bpf_program__autoload(const struct bpf_program *prog) +{ + return prog->autoload; +} + +int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) +{ + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + prog->autoload = autoload; + return 0; +} + +bool bpf_program__autoattach(const struct bpf_program *prog) +{ + return prog->autoattach; +} + +void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach) +{ + prog->autoattach = autoattach; +} + +const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) +{ + return prog->insns; +} + +size_t bpf_program__insn_cnt(const struct bpf_program *prog) +{ + return prog->insns_cnt; +} + +int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt) +{ + struct bpf_insn *insns; + + if (prog->obj->loaded) + return -EBUSY; + + insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", prog->name); + return -ENOMEM; + } + memcpy(insns, new_insns, new_insn_cnt * sizeof(*insns)); + + prog->insns = insns; + prog->insns_cnt = new_insn_cnt; + return 0; +} + +int bpf_program__fd(const struct bpf_program *prog) +{ + if (!prog) + return libbpf_err(-EINVAL); + + if (prog->fd < 0) + return libbpf_err(-ENOENT); + + return prog->fd; +} + +__alias(bpf_program__type) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); + +enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) +{ + return prog->type; +} + +int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->type = type; + return 0; +} + +__alias(bpf_program__expected_attach_type) +enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); + +enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program *prog) +{ + return prog->expected_attach_type; +} + +int bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->expected_attach_type = type; + return 0; +} + +__u32 bpf_program__flags(const struct bpf_program *prog) +{ + return prog->prog_flags; +} + +int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->prog_flags = flags; + return 0; +} + +__u32 bpf_program__log_level(const struct bpf_program *prog) +{ + return prog->log_level; +} + +int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->log_level = log_level; + return 0; +} + +const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size) +{ + *log_size = prog->log_size; + return prog->log_buf; +} + +int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size) +{ + if (log_size && !log_buf) + return -EINVAL; + if (prog->log_size > UINT_MAX) + return -EINVAL; + if (prog->obj->loaded) + return -EBUSY; + + prog->log_buf = log_buf; + prog->log_size = log_size; + return 0; +} + +#define SEC_DEF(sec_pfx, ptype, atype, flags, ...) { \ + .sec = (char *)sec_pfx, \ + .prog_type = BPF_PROG_TYPE_##ptype, \ + .expected_attach_type = atype, \ + .cookie = (long)(flags), \ + .prog_prepare_load_fn = libbpf_prepare_prog_load, \ + __VA_ARGS__ \ +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); + +static const struct bpf_sec_def section_defs[] = { + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), + SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), + SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), + SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), + SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), + SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), + SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), + SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), + SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), + SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), +}; + +static size_t custom_sec_def_cnt; +static struct bpf_sec_def *custom_sec_defs; +static struct bpf_sec_def custom_fallback_def; +static bool has_custom_fallback_def; + +static int last_custom_sec_def_handler_id; + +int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts) +{ + struct bpf_sec_def *sec_def; + + if (!OPTS_VALID(opts, libbpf_prog_handler_opts)) + return libbpf_err(-EINVAL); + + if (last_custom_sec_def_handler_id == INT_MAX) /* prevent overflow */ + return libbpf_err(-E2BIG); + + if (sec) { + sec_def = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt + 1, + sizeof(*sec_def)); + if (!sec_def) + return libbpf_err(-ENOMEM); + + custom_sec_defs = sec_def; + sec_def = &custom_sec_defs[custom_sec_def_cnt]; + } else { + if (has_custom_fallback_def) + return libbpf_err(-EBUSY); + + sec_def = &custom_fallback_def; + } + + sec_def->sec = sec ? strdup(sec) : NULL; + if (sec && !sec_def->sec) + return libbpf_err(-ENOMEM); + + sec_def->prog_type = prog_type; + sec_def->expected_attach_type = exp_attach_type; + sec_def->cookie = OPTS_GET(opts, cookie, 0); + + sec_def->prog_setup_fn = OPTS_GET(opts, prog_setup_fn, NULL); + sec_def->prog_prepare_load_fn = OPTS_GET(opts, prog_prepare_load_fn, NULL); + sec_def->prog_attach_fn = OPTS_GET(opts, prog_attach_fn, NULL); + + sec_def->handler_id = ++last_custom_sec_def_handler_id; + + if (sec) + custom_sec_def_cnt++; + else + has_custom_fallback_def = true; + + return sec_def->handler_id; +} + +int libbpf_unregister_prog_handler(int handler_id) +{ + struct bpf_sec_def *sec_defs; + int i; + + if (handler_id <= 0) + return libbpf_err(-EINVAL); + + if (has_custom_fallback_def && custom_fallback_def.handler_id == handler_id) { + memset(&custom_fallback_def, 0, sizeof(custom_fallback_def)); + has_custom_fallback_def = false; + return 0; + } + + for (i = 0; i < custom_sec_def_cnt; i++) { + if (custom_sec_defs[i].handler_id == handler_id) + break; + } + + if (i == custom_sec_def_cnt) + return libbpf_err(-ENOENT); + + free(custom_sec_defs[i].sec); + for (i = i + 1; i < custom_sec_def_cnt; i++) + custom_sec_defs[i - 1] = custom_sec_defs[i]; + custom_sec_def_cnt--; + + /* try to shrink the array, but it's ok if we couldn't */ + sec_defs = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt, sizeof(*sec_defs)); + if (sec_defs) + custom_sec_defs = sec_defs; + + return 0; +} + +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) +{ + size_t len = strlen(sec_def->sec); + + /* "type/" always has to have proper SEC("type/extras") form */ + if (sec_def->sec[len - 1] == '/') { + if (str_has_pfx(sec_name, sec_def->sec)) + return true; + return false; + } + + /* "type+" means it can be either exact SEC("type") or + * well-formed SEC("type/extras") with proper '/' separator + */ + if (sec_def->sec[len - 1] == '+') { + len--; + /* not even a prefix */ + if (strncmp(sec_name, sec_def->sec, len) != 0) + return false; + /* exact match or has '/' separator */ + if (sec_name[len] == '\0' || sec_name[len] == '/') + return true; + return false; + } + + return strcmp(sec_name, sec_def->sec) == 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name) +{ + const struct bpf_sec_def *sec_def; + int i, n; + + n = custom_sec_def_cnt; + for (i = 0; i < n; i++) { + sec_def = &custom_sec_defs[i]; + if (sec_def_matches(sec_def, sec_name)) + return sec_def; + } + + n = ARRAY_SIZE(section_defs); + for (i = 0; i < n; i++) { + sec_def = §ion_defs[i]; + if (sec_def_matches(sec_def, sec_name)) + return sec_def; + } + + if (has_custom_fallback_def) + return &custom_fallback_def; + + return NULL; +} + +#define MAX_TYPE_NAME_SIZE 32 + +static char *libbpf_get_type_names(bool attach_type) +{ + int i, len = ARRAY_SIZE(section_defs) * MAX_TYPE_NAME_SIZE; + char *buf; + + buf = malloc(len); + if (!buf) + return NULL; + + buf[0] = '\0'; + /* Forge string buf with all available names */ + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + const struct bpf_sec_def *sec_def = §ion_defs[i]; + + if (attach_type) { + if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load) + continue; + + if (!(sec_def->cookie & SEC_ATTACHABLE)) + continue; + } + + if (strlen(buf) + strlen(section_defs[i].sec) + 2 > len) { + free(buf); + return NULL; + } + strcat(buf, " "); + strcat(buf, section_defs[i].sec); + } + + return buf; +} + +int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type) +{ + const struct bpf_sec_def *sec_def; + char *type_names; + + if (!name) + return libbpf_err(-EINVAL); + + sec_def = find_sec_def(name); + if (sec_def) { + *prog_type = sec_def->prog_type; + *expected_attach_type = sec_def->expected_attach_type; + return 0; + } + + pr_debug("failed to guess program type from ELF section '%s'\n", name); + type_names = libbpf_get_type_names(false); + if (type_names != NULL) { + pr_debug("supported section(type) names are:%s\n", type_names); + free(type_names); + } + + return libbpf_err(-ESRCH); +} + +const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(attach_type_name)) + return NULL; + + return attach_type_name[t]; +} + +const char *libbpf_bpf_link_type_str(enum bpf_link_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(link_type_name)) + return NULL; + + return link_type_name[t]; +} + +const char *libbpf_bpf_map_type_str(enum bpf_map_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(map_type_name)) + return NULL; + + return map_type_name[t]; +} + +const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(prog_type_name)) + return NULL; + + return prog_type_name[t]; +} + +static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, + size_t offset) +{ + struct bpf_map *map; + size_t i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + if (!bpf_map__is_struct_ops(map)) + continue; + if (map->sec_offset <= offset && + offset - map->sec_offset < map->def.value_size) + return map; + } + + return NULL; +} + +/* Collect the reloc from ELF and populate the st_ops->progs[] */ +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data) +{ + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_program *prog; + unsigned int shdr_idx; + const struct btf *btf; + struct bpf_map *map; + unsigned int moff, insn_idx; + const char *name; + __u32 member_idx; + Elf64_Sym *sym; + Elf64_Rel *rel; + int i, nrels; + + btf = obj->btf; + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn("struct_ops reloc: failed to get %d reloc\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn("struct_ops reloc: symbol %zx not found\n", + (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + + name = elf_sym_str(obj, sym->st_name) ?: "<?>"; + map = find_struct_ops_map_by_offset(obj, rel->r_offset); + if (!map) { + pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n", + (size_t)rel->r_offset); + return -EINVAL; + } + + moff = rel->r_offset - map->sec_offset; + shdr_idx = sym->st_shndx; + st_ops = map->st_ops; + pr_debug("struct_ops reloc %s: for %lld value %lld shdr_idx %u rel->r_offset %zu map->sec_offset %zu name %d (\'%s\')\n", + map->name, + (long long)(rel->r_info >> 32), + (long long)sym->st_value, + shdr_idx, (size_t)rel->r_offset, + map->sec_offset, sym->st_name, name); + + if (shdr_idx >= SHN_LORESERVE) { + pr_warn("struct_ops reloc %s: rel->r_offset %zu shdr_idx %u unsupported non-static function\n", + map->name, (size_t)rel->r_offset, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", + map->name, (unsigned long long)sym->st_value); + return -LIBBPF_ERRNO__FORMAT; + } + insn_idx = sym->st_value / BPF_INSN_SZ; + + member = find_member_by_offset(st_ops->type, moff * 8); + if (!member) { + pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", + map->name, moff); + return -EINVAL; + } + member_idx = member - btf_members(st_ops->type); + name = btf__name_by_offset(btf, member->name_off); + + if (!resolve_func_ptr(btf, member->type, NULL)) { + pr_warn("struct_ops reloc %s: cannot relocate non func ptr %s\n", + map->name, name); + return -EINVAL; + } + + prog = find_prog_by_sec_insn(obj, shdr_idx, insn_idx); + if (!prog) { + pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", + map->name, shdr_idx, name); + return -EINVAL; + } + + /* prevent the use of BPF prog with invalid type */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("struct_ops reloc %s: prog %s is not struct_ops BPF program\n", + map->name, prog->name); + return -EINVAL; + } + + /* if we haven't yet processed this BPF program, record proper + * attach_btf_id and member_idx + */ + if (!prog->attach_btf_id) { + prog->attach_btf_id = st_ops->type_id; + prog->expected_attach_type = member_idx; + } + + /* struct_ops BPF prog can be re-used between multiple + * .struct_ops as long as it's the same struct_ops struct + * definition and the same function pointer field + */ + if (prog->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != member_idx) { + pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", + map->name, prog->name, prog->sec_name, prog->type, + prog->attach_btf_id, prog->expected_attach_type, name); + return -EINVAL; + } + + st_ops->progs[member_idx] = prog; + } + + return 0; +} + +#define BTF_TRACE_PREFIX "btf_trace_" +#define BTF_LSM_PREFIX "bpf_lsm_" +#define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_MAX_NAME_SIZE 128 + +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind) +{ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + *prefix = BTF_TRACE_PREFIX; + *kind = BTF_KIND_TYPEDEF; + break; + case BPF_LSM_MAC: + case BPF_LSM_CGROUP: + *prefix = BTF_LSM_PREFIX; + *kind = BTF_KIND_FUNC; + break; + case BPF_TRACE_ITER: + *prefix = BTF_ITER_PREFIX; + *kind = BTF_KIND_FUNC; + break; + default: + *prefix = ""; + *kind = BTF_KIND_FUNC; + } +} + +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind) +{ + char btf_type_name[BTF_MAX_NAME_SIZE]; + int ret; + + ret = snprintf(btf_type_name, sizeof(btf_type_name), + "%s%s", prefix, name); + /* snprintf returns the number of characters written excluding the + * terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * indicates truncation. + */ + if (ret < 0 || ret >= sizeof(btf_type_name)) + return -ENAMETOOLONG; + return btf__find_by_name_kind(btf, btf_type_name, kind); +} + +static inline int find_attach_btf_id(struct btf *btf, const char *name, + enum bpf_attach_type attach_type) +{ + const char *prefix; + int kind; + + btf_get_kernel_prefix_kind(attach_type, &prefix, &kind); + return find_btf_by_prefix_kind(btf, prefix, name, kind); +} + +int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type) +{ + struct btf *btf; + int err; + + btf = btf__load_vmlinux_btf(); + err = libbpf_get_error(btf); + if (err) { + pr_warn("vmlinux BTF is not found\n"); + return libbpf_err(err); + } + + err = find_attach_btf_id(btf, name, attach_type); + if (err <= 0) + pr_warn("%s is not found in vmlinux BTF\n", name); + + btf__free(btf); + return libbpf_err(err); +} + +static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + struct btf *btf; + int err; + + memset(&info, 0, info_len); + err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len); + if (err) { + pr_warn("failed bpf_obj_get_info_by_fd for FD %d: %d\n", + attach_prog_fd, err); + return err; + } + + err = -EINVAL; + if (!info.btf_id) { + pr_warn("The target program doesn't have BTF\n"); + goto out; + } + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); + if (err) { + pr_warn("Failed to get BTF %d of the program: %d\n", info.btf_id, err); + goto out; + } + err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + btf__free(btf); + if (err <= 0) { + pr_warn("%s is not found in prog's BTF\n", name); + goto out; + } +out: + return err; +} + +static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, + enum bpf_attach_type attach_type, + int *btf_obj_fd, int *btf_type_id) +{ + int ret, i; + + ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; + + ret = load_module_btfs(obj); + if (ret) + return ret; + + for (i = 0; i < obj->btf_module_cnt; i++) { + const struct module_btf *mod = &obj->btf_modules[i]; + + ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = mod->fd; + *btf_type_id = ret; + return 0; + } + if (ret == -ENOENT) + continue; + + return ret; + } + + return -ESRCH; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id) +{ + enum bpf_attach_type attach_type = prog->expected_attach_type; + __u32 attach_prog_fd = prog->attach_prog_fd; + int err = 0; + + /* BPF program's BTF ID */ + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); + if (err < 0) { + pr_warn("prog '%s': failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); + return err; + } + *btf_obj_fd = 0; + *btf_type_id = err; + return 0; + } + + /* kernel/module BTF ID */ + if (prog->obj->gen_loader) { + bpf_gen__record_attach_target(prog->obj->gen_loader, attach_name, attach_type); + *btf_obj_fd = 0; + *btf_type_id = 1; + } else { + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + } + if (err) { + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); + return err; + } + return 0; +} + +int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type) +{ + char *type_names; + const struct bpf_sec_def *sec_def; + + if (!name) + return libbpf_err(-EINVAL); + + sec_def = find_sec_def(name); + if (!sec_def) { + pr_debug("failed to guess attach type based on ELF section name '%s'\n", name); + type_names = libbpf_get_type_names(true); + if (type_names != NULL) { + pr_debug("attachable section(type) names are:%s\n", type_names); + free(type_names); + } + + return libbpf_err(-EINVAL); + } + + if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load) + return libbpf_err(-EINVAL); + if (!(sec_def->cookie & SEC_ATTACHABLE)) + return libbpf_err(-EINVAL); + + *attach_type = sec_def->expected_attach_type; + return 0; +} + +int bpf_map__fd(const struct bpf_map *map) +{ + return map ? map->fd : libbpf_err(-EINVAL); +} + +static bool map_uses_real_name(const struct bpf_map *map) +{ + /* Since libbpf started to support custom .data.* and .rodata.* maps, + * their user-visible name differs from kernel-visible name. Users see + * such map's corresponding ELF section name as a map name. + * This check distinguishes .data/.rodata from .data.* and .rodata.* + * maps to know which name has to be returned to the user. + */ + if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0) + return true; + if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0) + return true; + return false; +} + +const char *bpf_map__name(const struct bpf_map *map) +{ + if (!map) + return NULL; + + if (map_uses_real_name(map)) + return map->real_name; + + return map->name; +} + +enum bpf_map_type bpf_map__type(const struct bpf_map *map) +{ + return map->def.type; +} + +int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.type = type; + return 0; +} + +__u32 bpf_map__map_flags(const struct bpf_map *map) +{ + return map->def.map_flags; +} + +int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.map_flags = flags; + return 0; +} + +__u64 bpf_map__map_extra(const struct bpf_map *map) +{ + return map->map_extra; +} + +int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_extra = map_extra; + return 0; +} + +__u32 bpf_map__numa_node(const struct bpf_map *map) +{ + return map->numa_node; +} + +int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->numa_node = numa_node; + return 0; +} + +__u32 bpf_map__key_size(const struct bpf_map *map) +{ + return map->def.key_size; +} + +int bpf_map__set_key_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.key_size = size; + return 0; +} + +__u32 bpf_map__value_size(const struct bpf_map *map) +{ + return map->def.value_size; +} + +int bpf_map__set_value_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.value_size = size; + return 0; +} + +__u32 bpf_map__btf_key_type_id(const struct bpf_map *map) +{ + return map ? map->btf_key_type_id : 0; +} + +__u32 bpf_map__btf_value_type_id(const struct bpf_map *map) +{ + return map ? map->btf_value_type_id : 0; +} + +int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size) +{ + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || + size != map->def.value_size || map->fd >= 0) + return libbpf_err(-EINVAL); + + memcpy(map->mmaped, data, size); + return 0; +} + +const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +{ + if (!map->mmaped) + return NULL; + *psize = map->def.value_size; + return map->mmaped; +} + +bool bpf_map__is_internal(const struct bpf_map *map) +{ + return map->libbpf_type != LIBBPF_MAP_UNSPEC; +} + +__u32 bpf_map__ifindex(const struct bpf_map *map) +{ + return map->map_ifindex; +} + +int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_ifindex = ifindex; + return 0; +} + +int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) { + pr_warn("error: unsupported map type\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map_fd != -1) { + pr_warn("error: inner_map_fd already specified\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + map->inner_map_fd = fd; + return 0; +} + +static struct bpf_map * +__bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) +{ + ssize_t idx; + struct bpf_map *s, *e; + + if (!obj || !obj->maps) + return errno = EINVAL, NULL; + + s = obj->maps; + e = obj->maps + obj->nr_maps; + + if ((m < s) || (m >= e)) { + pr_warn("error in %s: map handler doesn't belong to object\n", + __func__); + return errno = EINVAL, NULL; + } + + idx = (m - obj->maps) + i; + if (idx >= obj->nr_maps || idx < 0) + return NULL; + return &obj->maps[idx]; +} + +struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) +{ + if (prev == NULL) + return obj->maps; + + return __bpf_map__iter(prev, obj, 1); +} + +struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *next) +{ + if (next == NULL) { + if (!obj->nr_maps) + return NULL; + return obj->maps + obj->nr_maps - 1; + } + + return __bpf_map__iter(next, obj, -1); +} + +struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name) +{ + struct bpf_map *pos; + + bpf_object__for_each_map(pos, obj) { + /* if it's a special internal map name (which always starts + * with dot) then check if that special name matches the + * real map name (ELF section name) + */ + if (name[0] == '.') { + if (pos->real_name && strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + /* otherwise map name has to be an exact match */ + if (map_uses_real_name(pos)) { + if (strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + if (strcmp(pos->name, name) == 0) + return pos; + } + return errno = ENOENT, NULL; +} + +int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) +{ + return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); +} + +static int validate_map_op(const struct bpf_map *map, size_t key_sz, + size_t value_sz, bool check_value_sz) +{ + if (map->fd <= 0) + return -ENOENT; + + if (map->def.key_size != key_sz) { + pr_warn("map '%s': unexpected key size %zu provided, expected %u\n", + map->name, key_sz, map->def.key_size); + return -EINVAL; + } + + if (!check_value_sz) + return 0; + + switch (map->def.type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: { + int num_cpu = libbpf_num_possible_cpus(); + size_t elem_sz = roundup(map->def.value_size, 8); + + if (value_sz != num_cpu * elem_sz) { + pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", + map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); + return -EINVAL; + } + break; + } + default: + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + return 0; +} + +int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_update_elem(map->fd, key, value, flags); +} + +int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return libbpf_err(err); + + return bpf_map_delete_elem_flags(map->fd, key, flags); +} + +int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_and_delete_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return libbpf_err(err); + + return bpf_map_get_next_key(map->fd, cur_key, next_key); +} + +long libbpf_get_error(const void *ptr) +{ + if (!IS_ERR_OR_NULL(ptr)) + return 0; + + if (IS_ERR(ptr)) + errno = -PTR_ERR(ptr); + + /* If ptr == NULL, then errno should be already set by the failing + * API, because libbpf never returns NULL on success and it now always + * sets errno on error. So no extra errno handling for ptr == NULL + * case. + */ + return -errno; +} + +/* Replace link's underlying BPF program with the new one */ +int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) +{ + int ret; + + ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + return libbpf_err_errno(ret); +} + +/* Release "ownership" of underlying BPF resource (typically, BPF program + * attached to some BPF hook, e.g., tracepoint, kprobe, etc). Disconnected + * link, when destructed through bpf_link__destroy() call won't attempt to + * detach/unregisted that BPF resource. This is useful in situations where, + * say, attached BPF program has to outlive userspace program that attached it + * in the system. Depending on type of BPF program, though, there might be + * additional steps (like pinning BPF program in BPF FS) necessary to ensure + * exit of userspace program doesn't trigger automatic detachment and clean up + * inside the kernel. + */ +void bpf_link__disconnect(struct bpf_link *link) +{ + link->disconnected = true; +} + +int bpf_link__destroy(struct bpf_link *link) +{ + int err = 0; + + if (IS_ERR_OR_NULL(link)) + return 0; + + if (!link->disconnected && link->detach) + err = link->detach(link); + if (link->pin_path) + free(link->pin_path); + if (link->dealloc) + link->dealloc(link); + else + free(link); + + return libbpf_err(err); +} + +int bpf_link__fd(const struct bpf_link *link) +{ + return link->fd; +} + +const char *bpf_link__pin_path(const struct bpf_link *link) +{ + return link->pin_path; +} + +static int bpf_link__detach_fd(struct bpf_link *link) +{ + return libbpf_err_errno(close(link->fd)); +} + +struct bpf_link *bpf_link__open(const char *path) +{ + struct bpf_link *link; + int fd; + + fd = bpf_obj_get(path); + if (fd < 0) { + fd = -errno; + pr_warn("failed to open link at %s: %d\n", path, fd); + return libbpf_err_ptr(fd); + } + + link = calloc(1, sizeof(*link)); + if (!link) { + close(fd); + return libbpf_err_ptr(-ENOMEM); + } + link->detach = &bpf_link__detach_fd; + link->fd = fd; + + link->pin_path = strdup(path); + if (!link->pin_path) { + bpf_link__destroy(link); + return libbpf_err_ptr(-ENOMEM); + } + + return link; +} + +int bpf_link__detach(struct bpf_link *link) +{ + return bpf_link_detach(link->fd) ? -errno : 0; +} + +int bpf_link__pin(struct bpf_link *link, const char *path) +{ + int err; + + if (link->pin_path) + return libbpf_err(-EBUSY); + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + err = check_path(path); + if (err) + return libbpf_err(err); + + link->pin_path = strdup(path); + if (!link->pin_path) + return libbpf_err(-ENOMEM); + + if (bpf_obj_pin(link->fd, link->pin_path)) { + err = -errno; + zfree(&link->pin_path); + return libbpf_err(err); + } + + pr_debug("link fd=%d: pinned at %s\n", link->fd, link->pin_path); + return 0; +} + +int bpf_link__unpin(struct bpf_link *link) +{ + int err; + + if (!link->pin_path) + return libbpf_err(-EINVAL); + + err = unlink(link->pin_path); + if (err != 0) + return -errno; + + pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path); + zfree(&link->pin_path); + return 0; +} + +struct bpf_link_perf { + struct bpf_link link; + int perf_event_fd; + /* legacy kprobe support: keep track of probe identifier and type */ + char *legacy_probe_name; + bool legacy_is_kprobe; + bool legacy_is_retprobe; +}; + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe); +static int remove_uprobe_event_legacy(const char *probe_name, bool retprobe); + +static int bpf_link_perf_detach(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + int err = 0; + + if (ioctl(perf_link->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0) < 0) + err = -errno; + + if (perf_link->perf_event_fd != link->fd) + close(perf_link->perf_event_fd); + close(link->fd); + + /* legacy uprobe/kprobe needs to be removed after perf event fd closure */ + if (perf_link->legacy_probe_name) { + if (perf_link->legacy_is_kprobe) { + err = remove_kprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } else { + err = remove_uprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } + } + + return err; +} + +static void bpf_link_perf_dealloc(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + free(perf_link->legacy_probe_name); + free(perf_link); +} + +struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link_perf *link; + int prog_fd, link_fd = -1, err; + + if (!OPTS_VALID(opts, bpf_perf_event_opts)) + return libbpf_err_ptr(-EINVAL); + + if (pfd < 0) { + pr_warn("prog '%s': invalid perf event FD %d\n", + prog->name, pfd); + return libbpf_err_ptr(-EINVAL); + } + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->link.detach = &bpf_link_perf_detach; + link->link.dealloc = &bpf_link_perf_dealloc; + link->perf_event_fd = pfd; + + if (kernel_supports(prog->obj, FEAT_PERF_LINK)) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, + .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); + + link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n", + prog->name, pfd, + err, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link->link.fd = link_fd; + } else { + if (OPTS_GET(opts, bpf_cookie, 0)) { + pr_warn("prog '%s': user context value is not supported\n", prog->name); + err = -EOPNOTSUPP; + goto err_out; + } + + if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + if (err == -EPROTO) + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); + goto err_out; + } + link->link.fd = pfd; + } + if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + return &link->link; +err_out: + if (link_fd >= 0) + close(link_fd); + free(link); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd) +{ + return bpf_program__attach_perf_event_opts(prog, pfd, NULL); +} + +/* + * this function is expected to parse integer in the range of [0, 2^31-1] from + * given file using scanf format string fmt. If actual parsed value is + * negative, the result might be indistinguishable from error + */ +static int parse_uint_from_file(const char *file, const char *fmt) +{ + char buf[STRERR_BUFSIZE]; + int err, ret; + FILE *f; + + f = fopen(file, "r"); + if (!f) { + err = -errno; + pr_debug("failed to open '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + return err; + } + err = fscanf(f, fmt, &ret); + if (err != 1) { + err = err == EOF ? -EIO : -errno; + pr_debug("failed to parse '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + fclose(f); + return err; + } + fclose(f); + return ret; +} + +static int determine_kprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_uprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_kprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +static int determine_uprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +#define PERF_UPROBE_REF_CTR_OFFSET_BITS 32 +#define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 + +static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, + uint64_t offset, int pid, size_t ref_ctr_off) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd; + + if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) + return -EINVAL; + + memset(&attr, 0, attr_sz); + + type = uprobe ? determine_uprobe_perf_type() + : determine_kprobe_perf_type(); + if (type < 0) { + pr_warn("failed to determine %s perf type: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(type, errmsg, sizeof(errmsg))); + return type; + } + if (retprobe) { + int bit = uprobe ? determine_uprobe_retprobe_bit() + : determine_kprobe_retprobe_bit(); + + if (bit < 0) { + pr_warn("failed to determine %s retprobe bit: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(bit, errmsg, sizeof(errmsg))); + return bit; + } + attr.config |= 1 << bit; + } + attr.size = attr_sz; + attr.type = type; + attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ + attr.config2 = offset; /* kprobe_addr or probe_offset */ + + /* pid filter is meaningful only for uprobes */ + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid /* pid */, + pid == -1 ? 0 : -1 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + return pfd >= 0 ? pfd : -errno; +} + +static int append_to_file(const char *file, const char *fmt, ...) +{ + int fd, n, err = 0; + va_list ap; + + fd = open(file, O_WRONLY | O_APPEND | O_CLOEXEC, 0); + if (fd < 0) + return -errno; + + va_start(ap, fmt); + n = vdprintf(fd, fmt, ap); + va_end(ap); + + if (n < 0) + err = -errno; + + close(fd); + return err; +} + +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) +{ + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) +{ + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_kprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/kprobe_events" : TRACEFS"/kprobe_events"; +} + +static const char *tracefs_uprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; +} + +static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *kfunc_name, size_t offset) +{ + static int index = 0; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, + __sync_fetch_and_add(&index, 1)); +} + +static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset) +{ + return append_to_file(tracefs_kprobe_events(), "%c:%s/%s %s+0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "kretprobes" : "kprobes", + probe_name, kfunc_name, offset); +} + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_kprobe_events(), "-:%s/%s", + retprobe ? "kretprobes" : "kprobes", probe_name); +} + +static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[256]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "kretprobes" : "kprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd, err; + + err = add_kprobe_event_legacy(probe_name, retprobe, kfunc_name, offset); + if (err < 0) { + pr_warn("failed to add legacy kprobe event for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + type = determine_kprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy kprobe perf_event_open() failed: %s\n", + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; +} + +static const char *arch_specific_syscall_pfx(void) +{ +#if defined(__x86_64__) + return "x64"; +#elif defined(__i386__) + return "ia32"; +#elif defined(__s390x__) + return "s390x"; +#elif defined(__s390__) + return "s390"; +#elif defined(__arm__) + return "arm"; +#elif defined(__aarch64__) + return "arm64"; +#elif defined(__mips__) + return "mips"; +#elif defined(__riscv) + return "riscv"; +#elif defined(__powerpc__) + return "powerpc"; +#elif defined(__powerpc64__) + return "powerpc64"; +#else + return NULL; +#endif +} + +static int probe_kern_syscall_wrapper(void) +{ + char syscall_name[64]; + const char *ksys_pfx; + + ksys_pfx = arch_specific_syscall_pfx(); + if (!ksys_pfx) + return 0; + + snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx); + + if (determine_kprobe_perf_type() >= 0) { + int pfd; + + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + if (pfd >= 0) + close(pfd); + + return pfd >= 0 ? 1 : 0; + } else { /* legacy mode */ + char probe_name[128]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0); + if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0) + return 0; + + (void)remove_kprobe_event_legacy(probe_name, false); + return 1; + } +} + +struct bpf_link * +bpf_program__attach_kprobe_opts(const struct bpf_program *prog, + const char *func_name, + const struct bpf_kprobe_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + char errmsg[STRERR_BUFSIZE]; + char *legacy_probe = NULL; + struct bpf_link *link; + size_t offset; + bool retprobe, legacy; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_kprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + retprobe = OPTS_GET(opts, retprobe, false); + offset = OPTS_GET(opts, offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + legacy = determine_kprobe_perf_type() < 0; + if (!legacy) { + pfd = perf_event_open_probe(false /* uprobe */, retprobe, + func_name, offset, + -1 /* pid */, 0 /* ref_ctr_off */); + } else { + char probe_name[256]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), + func_name, offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_kprobe_open_legacy(legacy_probe, retprobe, func_name, + offset, -1 /* pid */); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s+0x%zx' perf event: %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s+0x%zx': %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = true; + perf_link->legacy_is_retprobe = retprobe; + } + + return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog, + bool retprobe, + const char *func_name) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe, + ); + + return bpf_program__attach_kprobe_opts(prog, func_name, &opts); +} + +struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts) +{ + LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts); + char func_name[128]; + + if (!OPTS_VALID(opts, bpf_ksyscall_opts)) + return libbpf_err_ptr(-EINVAL); + + if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) { + /* arch_specific_syscall_pfx() should never return NULL here + * because it is guarded by kernel_supports(). However, since + * compiler does not know that we have an explicit conditional + * as well. + */ + snprintf(func_name, sizeof(func_name), "__%s_sys_%s", + arch_specific_syscall_pfx() ? : "", syscall_name); + } else { + snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name); + } + + kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false); + kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts); +} + +/* Adapted from perf/util/string.c */ +static bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*pat == '?') { /* Matches any single character */ + str++; + pat++; + continue; + } + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_match(str++, pat)) + return true; + } + return !*str && !*pat; +} + +struct kprobe_multi_resolve { + const char *pattern; + unsigned long *addrs; + size_t cap; + size_t cnt; +}; + +static int +resolve_kprobe_multi_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct kprobe_multi_resolve *res = ctx; + int err; + + if (!glob_match(sym_name, res->pattern)) + return 0; + + err = libbpf_ensure_mem((void **) &res->addrs, &res->cap, sizeof(unsigned long), + res->cnt + 1); + if (err) + return err; + + res->addrs[res->cnt++] = (unsigned long) sym_addr; + return 0; +} + +struct bpf_link * +bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, + const char *pattern, + const struct bpf_kprobe_multi_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct kprobe_multi_resolve res = { + .pattern = pattern, + }; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + const unsigned long *addrs; + int err, link_fd, prog_fd; + const __u64 *cookies; + const char **syms; + bool retprobe; + size_t cnt; + + if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + syms = OPTS_GET(opts, syms, false); + addrs = OPTS_GET(opts, addrs, false); + cnt = OPTS_GET(opts, cnt, false); + cookies = OPTS_GET(opts, cookies, false); + + if (!pattern && !addrs && !syms) + return libbpf_err_ptr(-EINVAL); + if (pattern && (addrs || syms || cookies || cnt)) + return libbpf_err_ptr(-EINVAL); + if (!pattern && !cnt) + return libbpf_err_ptr(-EINVAL); + if (addrs && syms) + return libbpf_err_ptr(-EINVAL); + + if (pattern) { + err = libbpf_kallsyms_parse(resolve_kprobe_multi_cb, &res); + if (err) + goto error; + if (!res.cnt) { + err = -ENOENT; + goto error; + } + addrs = res.addrs; + cnt = res.cnt; + } + + retprobe = OPTS_GET(opts, retprobe, false); + + lopts.kprobe_multi.syms = syms; + lopts.kprobe_multi.addrs = addrs; + lopts.kprobe_multi.cookies = cookies; + lopts.kprobe_multi.cnt = cnt; + lopts.kprobe_multi.flags = retprobe ? BPF_F_KPROBE_MULTI_RETURN : 0; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + prog_fd = bpf_program__fd(prog); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto error; + } + link->fd = link_fd; + free(res.addrs); + return link; + +error: + free(link); + free(res.addrs); + return libbpf_err_ptr(err); +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + unsigned long offset = 0; + const char *func_name; + char *func; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe") and SEC("kretprobe") */ + if (strcmp(prog->sec_name, "kprobe") == 0 || strcmp(prog->sec_name, "kretprobe") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe/"); + if (opts.retprobe) + func_name = prog->sec_name + sizeof("kretprobe/") - 1; + else + func_name = prog->sec_name + sizeof("kprobe/") - 1; + + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); + if (n < 1) { + pr_warn("kprobe name is invalid: %s\n", func_name); + return -EINVAL; + } + if (opts.retprobe && offset != 0) { + free(func); + pr_warn("kretprobes do not support offset specification\n"); + return -EINVAL; + } + + opts.offset = offset; + *link = bpf_program__attach_kprobe_opts(prog, func, &opts); + free(func); + return libbpf_get_error(*link); +} + +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_ksyscall_opts, opts); + const char *syscall_name; + + *link = NULL; + + /* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */ + if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/"); + if (opts.retprobe) + syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1; + else + syscall_name = prog->sec_name + sizeof("ksyscall/") - 1; + + *link = bpf_program__attach_ksyscall(prog, syscall_name, &opts); + return *link ? 0 : -errno; +} + +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.multi") and SEC("kretprobe.multi") */ + if (strcmp(prog->sec_name, "kprobe.multi") == 0 || + strcmp(prog->sec_name, "kretprobe.multi") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe.multi/"); + if (opts.retprobe) + spec = prog->sec_name + sizeof("kretprobe.multi/") - 1; + else + spec = prog->sec_name + sizeof("kprobe.multi/") - 1; + + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return libbpf_get_error(*link); +} + +static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *binary_path, uint64_t offset) +{ + int i; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), binary_path, (size_t)offset); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } +} + +static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset) +{ + return append_to_file(tracefs_uprobe_events(), "%c:%s/%s %s:0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "uretprobes" : "uprobes", + probe_name, binary_path, offset); +} + +static inline int remove_uprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_uprobe_events(), "-:%s/%s", + retprobe ? "uretprobes" : "uprobes", probe_name); +} + +static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[512]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "uretprobes" : "uprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + int type, pfd, err; + + err = add_uprobe_event_legacy(probe_name, retprobe, binary_path, offset); + if (err < 0) { + pr_warn("failed to add legacy uprobe event for %s:0x%zx: %d\n", + binary_path, (size_t)offset, err); + return err; + } + type = determine_uprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", + binary_path, offset, err); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} + +/* Find offset of function name in object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +static long elf_find_func_offset(const char *binary_path, const char *name) +{ + int fd, i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + bool is_shared_lib, is_name_qualified; + char errmsg[STRERR_BUFSIZE]; + long ret = -ENOENT; + size_t name_len; + GElf_Ehdr ehdr; + Elf *elf; + + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; + + name_len = strlen(name); + /* Does name specify "@@LIB"? */ + is_name_qualified = strstr(name, "@@") != NULL; + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. + */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + size_t nr_syms, strtabidx, idx; + Elf_Data *symbols = NULL; + Elf_Scn *scn = NULL; + int last_bind = -1; + const char *sname; + GElf_Shdr sh; + + scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); + if (!scn) { + pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", + binary_path); + continue; + } + if (!gelf_getshdr(scn, &sh)) + continue; + strtabidx = sh.sh_link; + symbols = elf_getdata(scn, 0); + if (!symbols) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + nr_syms = symbols->d_size / sh.sh_entsize; + + for (idx = 0; idx < nr_syms; idx++) { + int curr_bind; + GElf_Sym sym; + Elf_Scn *sym_scn; + GElf_Shdr sym_sh; + + if (!gelf_getsym(symbols, idx, &sym)) + continue; + + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; + + sname = elf_strptr(elf, strtabidx, sym.st_name); + if (!sname) + continue; + + curr_bind = GELF_ST_BIND(sym.st_info); + + /* User can specify func, func@@LIB or func@@LIB_VERSION. */ + if (strncmp(sname, name, name_len) != 0) + continue; + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') + continue; + + if (ret >= 0) { + /* handle multiple matches */ + if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sname, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (curr_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + + /* Transform symbol's virtual address (absolute for + * binaries and relative for shared libs) into file + * offset, which is what kernel is expecting for + * uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more + * details. + * This is done by looking up symbol's containing + * section's header and using it's virtual address + * (sh_addr) and corresponding file offset (sh_offset) + * to transform sym.st_value (virtual address) into + * desired final file offset. + */ + sym_scn = elf_getscn(elf, sym.st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &sym_sh)) + continue; + + ret = sym.st_value - sym_sh.sh_addr + sym_sh.sh_offset; + last_bind = curr_bind; + } + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + elf_end(elf); + close(fd); + return ret; +} + +static const char *arch_specific_lib_paths(void) +{ + /* + * Based on https://packages.debian.org/sid/libc6. + * + * Assume that the traced program is built for the same architecture + * as libbpf, which should cover the vast majority of cases. + */ +#if defined(__x86_64__) + return "/lib/x86_64-linux-gnu"; +#elif defined(__i386__) + return "/lib/i386-linux-gnu"; +#elif defined(__s390x__) + return "/lib/s390x-linux-gnu"; +#elif defined(__s390__) + return "/lib/s390-linux-gnu"; +#elif defined(__arm__) && defined(__SOFTFP__) + return "/lib/arm-linux-gnueabi"; +#elif defined(__arm__) && !defined(__SOFTFP__) + return "/lib/arm-linux-gnueabihf"; +#elif defined(__aarch64__) + return "/lib/aarch64-linux-gnu"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 64 + return "/lib/mips64el-linux-gnuabi64"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 32 + return "/lib/mipsel-linux-gnu"; +#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return "/lib/powerpc64le-linux-gnu"; +#elif defined(__sparc__) && defined(__arch64__) + return "/lib/sparc64-linux-gnu"; +#elif defined(__riscv) && __riscv_xlen == 64 + return "/lib/riscv64-linux-gnu"; +#else + return NULL; +#endif +} + +/* Get full path to program/shared library. */ +static int resolve_full_path(const char *file, char *result, size_t result_sz) +{ + const char *search_paths[3] = {}; + int i, perm; + + if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { + search_paths[0] = getenv("LD_LIBRARY_PATH"); + search_paths[1] = "/usr/lib64:/usr/lib"; + search_paths[2] = arch_specific_lib_paths(); + perm = R_OK; + } else { + search_paths[0] = getenv("PATH"); + search_paths[1] = "/usr/bin:/usr/sbin"; + perm = R_OK | X_OK; + } + + for (i = 0; i < ARRAY_SIZE(search_paths); i++) { + const char *s; + + if (!search_paths[i]) + continue; + for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { + char *next_path; + int seg_len; + + if (s[0] == ':') + s++; + next_path = strchr(s, ':'); + seg_len = next_path ? next_path - s : strlen(s); + if (!seg_len) + continue; + snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); + /* ensure it has required permissions */ + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) + continue; + pr_debug("resolved '%s' to '%s'\n", file, result); + return 0; + } + } + return -ENOENT; +} + +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; + char full_binary_path[PATH_MAX]; + struct bpf_link *link; + size_t ref_ctr_off; + int pfd, err; + bool retprobe, legacy; + const char *func_name; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + retprobe = OPTS_GET(opts, retprobe, false); + ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_binary_path, + sizeof(full_binary_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = full_binary_path; + } + func_name = OPTS_GET(opts, func_name, NULL); + if (func_name) { + long sym_off; + + sym_off = elf_find_func_offset(binary_path, func_name); + if (sym_off < 0) + return libbpf_err_ptr(sym_off); + func_offset += sym_off; + } + + legacy = determine_uprobe_perf_type() < 0; + if (!legacy) { + pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, + func_offset, pid, ref_ctr_off); + } else { + char probe_name[PATH_MAX + 64]; + + if (ref_ctr_off) + return libbpf_err_ptr(-EINVAL); + + gen_uprobe_legacy_event_name(probe_name, sizeof(probe_name), + binary_path, func_offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_uprobe_open_legacy(legacy_probe, retprobe, + binary_path, func_offset, pid); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = false; + perf_link->legacy_is_retprobe = retprobe; + } + return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +/* Format of u[ret]probe section definition supporting auto-attach: + * u[ret]probe/binary:function[+offset] + * + * binary can be an absolute/relative path or a filename; the latter is resolved to a + * full binary path via bpf_program__attach_uprobe_opts. + * + * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be + * specified (and auto-attach is not possible) or the above format is specified for + * auto-attach. + */ +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + int n, ret = -EINVAL; + long offset = 0; + + *link = NULL; + + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", + &probe_type, &binary_path, &func_name, &offset); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 2: + pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n", + prog->name, prog->sec_name); + break; + case 3: + case 4: + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; + if (opts.retprobe && offset != 0) { + pr_warn("prog '%s': uretprobes do not support offset specification\n", + prog->name); + break; + } + opts.func_name = func_name; + *link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, + prog->sec_name); + break; + } + free(probe_type); + free(binary_path); + free(func_name); + + return ret; +} + +struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog, + bool retprobe, pid_t pid, + const char *binary_path, + size_t func_offset) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts, .retprobe = retprobe); + + return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts); +} + +struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts) +{ + char resolved_path[512]; + struct bpf_object *obj = prog->obj; + struct bpf_link *link; + __u64 usdt_cookie; + int err; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = resolved_path; + } + + /* USDT manager is instantiated lazily on first USDT attach. It will + * be destroyed together with BPF object in bpf_object__close(). + */ + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + if (!obj->usdt_man) { + obj->usdt_man = usdt_manager_new(obj); + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + } + + usdt_cookie = OPTS_GET(opts, usdt_cookie, 0); + link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path, + usdt_provider, usdt_name, usdt_cookie); + err = libbpf_get_error(link); + if (err) + return libbpf_err_ptr(err); + return link; +} + +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *path = NULL, *provider = NULL, *name = NULL; + const char *sec_name; + int n, err; + + sec_name = bpf_program__section_name(prog); + if (strcmp(sec_name, "usdt") == 0) { + /* no auto-attach for just SEC("usdt") */ + *link = NULL; + return 0; + } + + n = sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name); + if (n != 3) { + pr_warn("invalid section '%s', expected SEC(\"usdt/<path>:<provider>:<name>\")\n", + sec_name); + err = -EINVAL; + } else { + *link = bpf_program__attach_usdt(prog, -1 /* any process */, path, + provider, name, NULL); + err = libbpf_get_error(*link); + } + free(path); + free(provider); + free(name); + return err; +} + +static int determine_tracepoint_id(const char *tp_category, + const char *tp_name) +{ + char file[PATH_MAX]; + int ret; + + ret = snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), tp_category, tp_name); + if (ret < 0) + return -errno; + if (ret >= sizeof(file)) { + pr_debug("tracepoint %s/%s path is too long\n", + tp_category, tp_name); + return -E2BIG; + } + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_open_tracepoint(const char *tp_category, + const char *tp_name) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int tp_id, pfd, err; + + tp_id = determine_tracepoint_id(tp_category, tp_name); + if (tp_id < 0) { + pr_warn("failed to determine tracepoint '%s/%s' perf event ID: %s\n", + tp_category, tp_name, + libbpf_strerror_r(tp_id, errmsg, sizeof(errmsg))); + return tp_id; + } + + memset(&attr, 0, attr_sz); + attr.type = PERF_TYPE_TRACEPOINT; + attr.size = attr_sz; + attr.config = tp_id; + + pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("tracepoint '%s/%s' perf_event_open() failed: %s\n", + tp_category, tp_name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + return pfd; +} + +struct bpf_link *bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + pfd = perf_event_open_tracepoint(tp_category, tp_name); + if (pfd < 0) { + pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(err); + } + return link; +} + +struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name) +{ + return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); +} + +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *sec_name, *tp_cat, *tp_name; + + *link = NULL; + + /* no auto-attach for SEC("tp") or SEC("tracepoint") */ + if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + return 0; + + sec_name = strdup(prog->sec_name); + if (!sec_name) + return -ENOMEM; + + /* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */ + if (str_has_pfx(prog->sec_name, "tp/")) + tp_cat = sec_name + sizeof("tp/") - 1; + else + tp_cat = sec_name + sizeof("tracepoint/") - 1; + tp_name = strchr(tp_cat, '/'); + if (!tp_name) { + free(sec_name); + return -EINVAL; + } + *tp_name = '\0'; + tp_name++; + + *link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name); + free(sec_name); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n", + prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + static const char *const prefixes[] = { + "raw_tp", + "raw_tracepoint", + "raw_tp.w", + "raw_tracepoint.w", + }; + size_t i; + const char *tp_name = NULL; + + *link = NULL; + + for (i = 0; i < ARRAY_SIZE(prefixes); i++) { + size_t pfx_len; + + if (!str_has_pfx(prog->sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + /* no auto-attach case of, e.g., SEC("raw_tp") */ + if (prog->sec_name[pfx_len] == '\0') + return 0; + + if (prog->sec_name[pfx_len] != '/') + continue; + + tp_name = prog->sec_name + pfx_len + 1; + break; + } + + if (!tp_name) { + pr_warn("prog '%s': invalid section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + *link = bpf_program__attach_raw_tracepoint(prog, tp_name); + return libbpf_get_error(*link); +} + +/* Common logic for all BPF program types that attach to a btf_id */ +static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + if (!OPTS_VALID(opts, bpf_trace_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + /* libbpf is smart enough to redirect to BPF_RAW_TRACEPOINT_OPEN on old kernels */ + link_opts.tracing.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +struct bpf_link *bpf_program__attach_trace(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +struct bpf_link *bpf_program__attach_trace_opts(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + return bpf_program__attach_btf_id(prog, opts); +} + +struct bpf_link *bpf_program__attach_lsm(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_trace(prog); + return libbpf_get_error(*link); +} + +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_lsm(prog); + return libbpf_get_error(*link); +} + +static struct bpf_link * +bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id, + const char *target_name) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .target_btf_id = btf_id); + enum bpf_attach_type attach_type; + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + attach_type = bpf_program__expected_attach_type(prog); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to %s: %s\n", + prog->name, target_name, + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program__attach_fd(prog, cgroup_fd, 0, "cgroup"); +} + +struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) +{ + return bpf_program__attach_fd(prog, netns_fd, 0, "netns"); +} + +struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) +{ + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program__attach_fd(prog, ifindex, 0, "xdp"); +} + +struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, + const char *attach_func_name) +{ + int btf_id; + + if (!!target_fd != !!attach_func_name) { + pr_warn("prog '%s': supply none or both of target_fd and attach_func_name\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (prog->type != BPF_PROG_TYPE_EXT) { + pr_warn("prog '%s': only BPF_PROG_TYPE_EXT can attach as freplace", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (target_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd); + if (btf_id < 0) + return libbpf_err_ptr(btf_id); + + return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace"); + } else { + /* no target, so use raw_tracepoint_open for compatibility + * with old kernels + */ + return bpf_program__attach_trace(prog); + } +} + +struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + __u32 target_fd = 0; + + if (!OPTS_VALID(opts, bpf_iter_attach_opts)) + return libbpf_err_ptr(-EINVAL); + + link_create_opts.iter_info = OPTS_GET(opts, link_info, (void *)0); + link_create_opts.iter_info_len = OPTS_GET(opts, link_info_len, 0); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER, + &link_create_opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to iterator: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_iter(prog, NULL); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach(const struct bpf_program *prog) +{ + struct bpf_link *link = NULL; + int err; + + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + return libbpf_err_ptr(-EOPNOTSUPP); + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); + if (err) + return libbpf_err_ptr(err); + + /* When calling bpf_program__attach() explicitly, auto-attach support + * is expected to work, so NULL returned link is considered an error. + * This is different for skeleton's attach, see comment in + * bpf_object__attach_skeleton(). + */ + if (!link) + return libbpf_err_ptr(-EOPNOTSUPP); + + return link; +} + +static int bpf_link__detach_struct_ops(struct bpf_link *link) +{ + __u32 zero = 0; + + if (bpf_map_delete_elem(link->fd, &zero)) + return -errno; + + return 0; +} + +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + struct bpf_link *link; + __u32 i, zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map) || map->fd == -1) + return libbpf_err_ptr(-EINVAL); + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-EINVAL); + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } + + err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0); + if (err) { + err = -errno; + free(link); + return libbpf_err_ptr(err); + } + + link->detach = bpf_link__detach_struct_ops; + link->fd = map->fd; + + return link; +} + +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + +static enum bpf_perf_event_ret +perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) +{ + struct perf_event_mmap_page *header = mmap_mem; + __u64 data_head = ring_buffer_read_head(header); + __u64 data_tail = header->data_tail; + void *base = ((__u8 *)header) + page_size; + int ret = LIBBPF_PERF_EVENT_CONT; + struct perf_event_header *ehdr; + size_t ehdr_size; + + while (data_head != data_tail) { + ehdr = base + (data_tail & (mmap_size - 1)); + ehdr_size = ehdr->size; + + if (((void *)ehdr) + ehdr_size > base + mmap_size) { + void *copy_start = ehdr; + size_t len_first = base + mmap_size - copy_start; + size_t len_secnd = ehdr_size - len_first; + + if (*copy_size < ehdr_size) { + free(*copy_mem); + *copy_mem = malloc(ehdr_size); + if (!*copy_mem) { + *copy_size = 0; + ret = LIBBPF_PERF_EVENT_ERROR; + break; + } + *copy_size = ehdr_size; + } + + memcpy(*copy_mem, copy_start, len_first); + memcpy(*copy_mem + len_first, base, len_secnd); + ehdr = *copy_mem; + } + + ret = fn(ehdr, private_data); + data_tail += ehdr_size; + if (ret != LIBBPF_PERF_EVENT_CONT) + break; + } + + ring_buffer_write_tail(header, data_tail); + return libbpf_err(ret); +} + +struct perf_buffer; + +struct perf_buffer_params { + struct perf_event_attr *attr; + /* if event_cb is specified, it takes precendence */ + perf_buffer_event_fn event_cb; + /* sample_cb and lost_cb are higher-level common-case callbacks */ + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; + int cpu_cnt; + int *cpus; + int *map_keys; +}; + +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void perf_buffer__free_cpu_buf(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + if (!cpu_buf) + return; + if (cpu_buf->base && + munmap(cpu_buf->base, pb->mmap_size + pb->page_size)) + pr_warn("failed to munmap cpu_buf #%d\n", cpu_buf->cpu); + if (cpu_buf->fd >= 0) { + ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0); + close(cpu_buf->fd); + } + free(cpu_buf->buf); + free(cpu_buf); +} + +void perf_buffer__free(struct perf_buffer *pb) +{ + int i; + + if (IS_ERR_OR_NULL(pb)) + return; + if (pb->cpu_bufs) { + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); + perf_buffer__free_cpu_buf(pb, cpu_buf); + } + free(pb->cpu_bufs); + } + if (pb->epoll_fd >= 0) + close(pb->epoll_fd); + free(pb->events); + free(pb); +} + +static struct perf_cpu_buf * +perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, + int cpu, int map_key) +{ + struct perf_cpu_buf *cpu_buf; + char msg[STRERR_BUFSIZE]; + int err; + + cpu_buf = calloc(1, sizeof(*cpu_buf)); + if (!cpu_buf) + return ERR_PTR(-ENOMEM); + + cpu_buf->pb = pb; + cpu_buf->cpu = cpu; + cpu_buf->map_key = map_key; + + cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu, + -1, PERF_FLAG_FD_CLOEXEC); + if (cpu_buf->fd < 0) { + err = -errno; + pr_warn("failed to open perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu_buf->fd, 0); + if (cpu_buf->base == MAP_FAILED) { + cpu_buf->base = NULL; + err = -errno; + pr_warn("failed to mmap perf buffer on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("failed to enable perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + return cpu_buf; + +error: + perf_buffer__free_cpu_buf(pb, cpu_buf); + return (struct perf_cpu_buf *)ERR_PTR(err); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p); + +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_buffer_params p = {}; + struct perf_event_attr attr; + + if (!OPTS_VALID(opts, perf_buffer_opts)) + return libbpf_err_ptr(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = PERF_COUNT_SW_BPF_OUTPUT; + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + + p.attr = &attr; + p.sample_cb = sample_cb; + p.lost_cb = lost_cb; + p.ctx = ctx; + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) +{ + struct perf_buffer_params p = {}; + + if (!attr) + return libbpf_err_ptr(-EINVAL); + + if (!OPTS_VALID(opts, perf_buffer_raw_opts)) + return libbpf_err_ptr(-EINVAL); + + p.attr = attr; + p.event_cb = event_cb; + p.ctx = ctx; + p.cpu_cnt = OPTS_GET(opts, cpu_cnt, 0); + p.cpus = OPTS_GET(opts, cpus, NULL); + p.map_keys = OPTS_GET(opts, map_keys, NULL); + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p) +{ + const char *online_cpus_file = "/sys/devices/system/cpu/online"; + struct bpf_map_info map; + char msg[STRERR_BUFSIZE]; + struct perf_buffer *pb; + bool *online = NULL; + __u32 map_info_len; + int err, i, j, n; + + if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) { + pr_warn("page count should be power of two, but is %zu\n", + page_cnt); + return ERR_PTR(-EINVAL); + } + + /* best-effort sanity checks */ + memset(&map, 0, sizeof(map)); + map_info_len = sizeof(map); + err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len); + if (err) { + err = -errno; + /* if BPF_OBJ_GET_INFO_BY_FD is supported, will return + * -EBADFD, -EFAULT, or -E2BIG on real error + */ + if (err != -EINVAL) { + pr_warn("failed to get map info for map FD %d: %s\n", + map_fd, libbpf_strerror_r(err, msg, sizeof(msg))); + return ERR_PTR(err); + } + pr_debug("failed to get map info for FD %d; API not supported? Ignoring...\n", + map_fd); + } else { + if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_warn("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map.name); + return ERR_PTR(-EINVAL); + } + } + + pb = calloc(1, sizeof(*pb)); + if (!pb) + return ERR_PTR(-ENOMEM); + + pb->event_cb = p->event_cb; + pb->sample_cb = p->sample_cb; + pb->lost_cb = p->lost_cb; + pb->ctx = p->ctx; + + pb->page_size = getpagesize(); + pb->mmap_size = pb->page_size * page_cnt; + pb->map_fd = map_fd; + + pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (pb->epoll_fd < 0) { + err = -errno; + pr_warn("failed to create epoll instance: %s\n", + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (p->cpu_cnt > 0) { + pb->cpu_cnt = p->cpu_cnt; + } else { + pb->cpu_cnt = libbpf_num_possible_cpus(); + if (pb->cpu_cnt < 0) { + err = pb->cpu_cnt; + goto error; + } + if (map.max_entries && map.max_entries < pb->cpu_cnt) + pb->cpu_cnt = map.max_entries; + } + + pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events)); + if (!pb->events) { + err = -ENOMEM; + pr_warn("failed to allocate events: out of memory\n"); + goto error; + } + pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs)); + if (!pb->cpu_bufs) { + err = -ENOMEM; + pr_warn("failed to allocate buffers: out of memory\n"); + goto error; + } + + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warn("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf; + int cpu, map_key; + + cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; + map_key = p->cpu_cnt > 0 ? p->map_keys[i] : i; + + /* in case user didn't explicitly requested particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); + if (IS_ERR(cpu_buf)) { + err = PTR_ERR(cpu_buf); + goto error; + } + + pb->cpu_bufs[j] = cpu_buf; + + err = bpf_map_update_elem(pb->map_fd, &map_key, + &cpu_buf->fd, 0); + if (err) { + err = -errno; + pr_warn("failed to set cpu #%d, key %d -> perf FD %d: %s\n", + cpu, map_key, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; + if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, + &pb->events[j]) < 0) { + err = -errno; + pr_warn("failed to epoll_ctl cpu #%d perf FD %d: %s\n", + cpu, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + j++; + } + pb->cpu_cnt = j; + free(online); + + return pb; + +error: + free(online); + if (pb) + perf_buffer__free(pb); + return ERR_PTR(err); +} + +struct perf_sample_raw { + struct perf_event_header header; + uint32_t size; + char data[]; +}; + +struct perf_sample_lost { + struct perf_event_header header; + uint64_t id; + uint64_t lost; + uint64_t sample_id; +}; + +static enum bpf_perf_event_ret +perf_buffer__process_record(struct perf_event_header *e, void *ctx) +{ + struct perf_cpu_buf *cpu_buf = ctx; + struct perf_buffer *pb = cpu_buf->pb; + void *data = e; + + /* user wants full control over parsing perf event */ + if (pb->event_cb) + return pb->event_cb(pb->ctx, cpu_buf->cpu, e); + + switch (e->type) { + case PERF_RECORD_SAMPLE: { + struct perf_sample_raw *s = data; + + if (pb->sample_cb) + pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size); + break; + } + case PERF_RECORD_LOST: { + struct perf_sample_lost *s = data; + + if (pb->lost_cb) + pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost); + break; + } + default: + pr_warn("unknown perf sample type %d\n", e->type); + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static int perf_buffer__process_records(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + enum bpf_perf_event_ret ret; + + ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); + if (ret != LIBBPF_PERF_EVENT_CONT) + return ret; + return 0; +} + +int perf_buffer__epoll_fd(const struct perf_buffer *pb) +{ + return pb->epoll_fd; +} + +int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) +{ + int i, cnt, err; + + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); + if (cnt < 0) + return -errno; + + for (i = 0; i < cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("error while processing records: %d\n", err); + return libbpf_err(err); + } + } + return cnt; +} + +/* Return number of PERF_EVENT_ARRAY map slots set up by this perf_buffer + * manager. + */ +size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb) +{ + return pb->cpu_cnt; +} + +/* + * Return perf_event FD of a ring buffer in *buf_idx* slot of + * PERF_EVENT_ARRAY BPF map. This FD can be polled for new data using + * select()/poll()/epoll() Linux syscalls. + */ +int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return cpu_buf->fd; +} + +int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, size_t *buf_size) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + *buf = cpu_buf->base; + *buf_size = pb->mmap_size; + return 0; +} + +/* + * Consume data from perf ring buffer corresponding to slot *buf_idx* in + * PERF_EVENT_ARRAY BPF map without waiting/polling. If there is no data to + * consume, do nothing and return success. + * Returns: + * - 0 on success; + * - <0 on failure. + */ +int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return perf_buffer__process_records(pb, cpu_buf); +} + +int perf_buffer__consume(struct perf_buffer *pb) +{ + int i, err; + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("perf_buffer: failed to process records in buffer #%d: %d\n", i, err); + return libbpf_err(err); + } + } + return 0; +} + +int bpf_program__set_attach_target(struct bpf_program *prog, + int attach_prog_fd, + const char *attach_func_name) +{ + int btf_obj_fd = 0, btf_id = 0, err; + + if (!prog || attach_prog_fd < 0) + return libbpf_err(-EINVAL); + + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + if (attach_prog_fd && !attach_func_name) { + /* remember attach_prog_fd and let bpf_program__load() find + * BTF ID during the program load + */ + prog->attach_prog_fd = attach_prog_fd; + return 0; + } + + if (attach_prog_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, + attach_prog_fd); + if (btf_id < 0) + return libbpf_err(btf_id); + } else { + if (!attach_func_name) + return libbpf_err(-EINVAL); + + /* load btf_vmlinux, if not yet */ + err = bpf_object__load_vmlinux_btf(prog->obj, true); + if (err) + return libbpf_err(err); + err = find_kernel_btf_id(prog->obj, attach_func_name, + prog->expected_attach_type, + &btf_obj_fd, &btf_id); + if (err) + return libbpf_err(err); + } + + prog->attach_btf_id = btf_id; + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_prog_fd = attach_prog_fd; + return 0; +} + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) +{ + int err = 0, n, len, start, end = -1; + bool *tmp; + + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warn("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warn("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + s += len; + } + if (!*mask_sz) { + pr_warn("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; + + fd = open(fcpu, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; + } + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len <= 0) { + err = len ? -errno : -EINVAL; + pr_warn("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; + } + if (len >= sizeof(buf)) { + pr_warn("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; + } + buf[len] = '\0'; + + return parse_cpu_mask_str(buf, mask, mask_sz); +} + +int libbpf_num_possible_cpus(void) +{ + static const char *fcpu = "/sys/devices/system/cpu/possible"; + static int cpus; + int err, n, i, tmp_cpus; + bool *mask; + + tmp_cpus = READ_ONCE(cpus); + if (tmp_cpus > 0) + return tmp_cpus; + + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return libbpf_err(err); + + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; + } + free(mask); + + WRITE_ONCE(cpus, tmp_cpus); + return tmp_cpus; +} + +static int populate_skeleton_maps(const struct bpf_object *obj, + struct bpf_map_skeleton *maps, + size_t map_cnt) +{ + int i; + + for (i = 0; i < map_cnt; i++) { + struct bpf_map **map = maps[i].map; + const char *name = maps[i].name; + void **mmaped = maps[i].mmaped; + + *map = bpf_object__find_map_by_name(obj, name); + if (!*map) { + pr_warn("failed to find skeleton map '%s'\n", name); + return -ESRCH; + } + + /* externs shouldn't be pre-setup from user code */ + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_KCONFIG) + *mmaped = (*map)->mmaped; + } + return 0; +} + +static int populate_skeleton_progs(const struct bpf_object *obj, + struct bpf_prog_skeleton *progs, + size_t prog_cnt) +{ + int i; + + for (i = 0; i < prog_cnt; i++) { + struct bpf_program **prog = progs[i].prog; + const char *name = progs[i].name; + + *prog = bpf_object__find_program_by_name(obj, name); + if (!*prog) { + pr_warn("failed to find skeleton program '%s'\n", name); + return -ESRCH; + } + } + return 0; +} + +int bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, + .object_name = s->name, + ); + struct bpf_object *obj; + int err; + + /* Attempt to preserve opts->object_name, unless overriden by user + * explicitly. Overwriting object name for skeletons is discouraged, + * as it breaks global data maps, because they contain object name + * prefix as their own map name prefix. When skeleton is generated, + * bpftool is making an assumption that this name will stay the same. + */ + if (opts) { + memcpy(&skel_opts, opts, sizeof(*opts)); + if (!opts->object_name) + skel_opts.object_name = s->name; + } + + obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); + err = libbpf_get_error(obj); + if (err) { + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", + s->name, err); + return libbpf_err(err); + } + + *s->obj = obj; + err = populate_skeleton_maps(obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate skeleton maps for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate skeleton progs for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__open_subskeleton(struct bpf_object_subskeleton *s) +{ + int err, len, var_idx, i; + const char *var_name; + const struct bpf_map *map; + struct btf *btf; + __u32 map_type_id; + const struct btf_type *map_type, *var_type; + const struct bpf_var_skeleton *var_skel; + struct btf_var_secinfo *var; + + if (!s->obj) + return libbpf_err(-EINVAL); + + btf = bpf_object__btf(s->obj); + if (!btf) { + pr_warn("subskeletons require BTF at runtime (object %s)\n", + bpf_object__name(s->obj)); + return libbpf_err(-errno); + } + + err = populate_skeleton_maps(s->obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(s->obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + for (var_idx = 0; var_idx < s->var_cnt; var_idx++) { + var_skel = &s->vars[var_idx]; + map = *var_skel->map; + map_type_id = bpf_map__btf_value_type_id(map); + map_type = btf__type_by_id(btf, map_type_id); + + if (!btf_is_datasec(map_type)) { + pr_warn("type for map '%1$s' is not a datasec: %2$s", + bpf_map__name(map), + __btf_kind_str(btf_kind(map_type))); + return libbpf_err(-EINVAL); + } + + len = btf_vlen(map_type); + var = btf_var_secinfos(map_type); + for (i = 0; i < len; i++, var++) { + var_type = btf__type_by_id(btf, var->type); + var_name = btf__name_by_offset(btf, var_type->name_off); + if (strcmp(var_name, var_skel->name) == 0) { + *var_skel->addr = map->mmaped + var->offset; + break; + } + } + } + return 0; +} + +void bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s) +{ + if (!s) + return; + free(s->maps); + free(s->progs); + free(s->vars); + free(s); +} + +int bpf_object__load_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + err = bpf_object__load(*s->obj); + if (err) { + pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map *map = *s->maps[i].map; + size_t mmap_sz = bpf_map_mmap_sz(map); + int prot, map_fd = bpf_map__fd(map); + void **mmaped = s->maps[i].mmaped; + + if (!mmaped) + continue; + + if (!(map->def.map_flags & BPF_F_MMAPABLE)) { + *mmaped = NULL; + continue; + } + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure, so we don't need to worry + * about it from skeleton's clean up perspective. + */ + *mmaped = mmap(map->mmaped, mmap_sz, prot, + MAP_SHARED | MAP_FIXED, map_fd, 0); + if (*mmaped == MAP_FAILED) { + err = -errno; + *mmaped = NULL; + pr_warn("failed to re-mmap() map '%s': %d\n", + bpf_map__name(map), err); + return libbpf_err(err); + } + } + + return 0; +} + +int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program *prog = *s->progs[i].prog; + struct bpf_link **link = s->progs[i].link; + + if (!prog->autoload || !prog->autoattach) + continue; + + /* auto-attaching not supported for this program */ + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + continue; + + /* if user already set the link manually, don't attempt auto-attach */ + if (*link) + continue; + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link); + if (err) { + pr_warn("prog '%s': failed to auto-attach: %d\n", + bpf_program__name(prog), err); + return libbpf_err(err); + } + + /* It's possible that for some SEC() definitions auto-attach + * is supported in some cases (e.g., if definition completely + * specifies target information), but is not in other cases. + * SEC("uprobe") is one such case. If user specified target + * binary and function name, such BPF program can be + * auto-attached. But if not, it shouldn't trigger skeleton's + * attach to fail. It should just be skipped. + * attach_fn signals such case with returning 0 (no error) and + * setting link to NULL. + */ + } + + return 0; +} + +void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_link **link = s->progs[i].link; + + bpf_link__destroy(*link); + *link = NULL; + } +} + +void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) +{ + if (!s) + return; + + if (s->progs) + bpf_object__detach_skeleton(s); + if (s->obj) + bpf_object__close(*s->obj); + free(s->maps); + free(s->progs); + free(s); +} diff --git a/src/libbpf.h b/src/libbpf.h new file mode 100644 index 0000000..eee883f --- /dev/null +++ b/src/libbpf.h @@ -0,0 +1,1529 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + */ +#ifndef __LIBBPF_LIBBPF_H +#define __LIBBPF_LIBBPF_H + +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <sys/types.h> // for size_t +#include <linux/bpf.h> + +#include "libbpf_common.h" +#include "libbpf_legacy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +LIBBPF_API __u32 libbpf_major_version(void); +LIBBPF_API __u32 libbpf_minor_version(void); +LIBBPF_API const char *libbpf_version_string(void); + +enum libbpf_errno { + __LIBBPF_ERRNO__START = 4000, + + /* Something wrong in libelf */ + LIBBPF_ERRNO__LIBELF = __LIBBPF_ERRNO__START, + LIBBPF_ERRNO__FORMAT, /* BPF object format invalid */ + LIBBPF_ERRNO__KVERSION, /* Incorrect or no 'version' section */ + LIBBPF_ERRNO__ENDIAN, /* Endian mismatch */ + LIBBPF_ERRNO__INTERNAL, /* Internal error in libbpf */ + LIBBPF_ERRNO__RELOC, /* Relocation failed */ + LIBBPF_ERRNO__LOAD, /* Load program failure for unknown reason */ + LIBBPF_ERRNO__VERIFY, /* Kernel verifier blocks program loading */ + LIBBPF_ERRNO__PROG2BIG, /* Program too big */ + LIBBPF_ERRNO__KVER, /* Incorrect kernel version */ + LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ + LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ + LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ + LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */ + __LIBBPF_ERRNO__END, +}; + +LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); + +/** + * @brief **libbpf_bpf_attach_type_str()** converts the provided attach type + * value into a textual representation. + * @param t The attach type. + * @return Pointer to a static string identifying the attach type. NULL is + * returned for unknown **bpf_attach_type** values. + */ +LIBBPF_API const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t); + +/** + * @brief **libbpf_bpf_link_type_str()** converts the provided link type value + * into a textual representation. + * @param t The link type. + * @return Pointer to a static string identifying the link type. NULL is + * returned for unknown **bpf_link_type** values. + */ +LIBBPF_API const char *libbpf_bpf_link_type_str(enum bpf_link_type t); + +/** + * @brief **libbpf_bpf_map_type_str()** converts the provided map type value + * into a textual representation. + * @param t The map type. + * @return Pointer to a static string identifying the map type. NULL is + * returned for unknown **bpf_map_type** values. + */ +LIBBPF_API const char *libbpf_bpf_map_type_str(enum bpf_map_type t); + +/** + * @brief **libbpf_bpf_prog_type_str()** converts the provided program type + * value into a textual representation. + * @param t The program type. + * @return Pointer to a static string identifying the program type. NULL is + * returned for unknown **bpf_prog_type** values. + */ +LIBBPF_API const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t); + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, + const char *, va_list ap); + +LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); + +/* Hide internal to user */ +struct bpf_object; + +struct bpf_object_open_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* object name override, if provided: + * - for object open from file, this will override setting object + * name from file path's base name; + * - for object open from memory buffer, this will specify an object + * name and will override default "<addr>-<buf-size>" name; + */ + const char *object_name; + /* parse map definitions non-strictly, allowing extra attributes/data */ + bool relaxed_maps; + /* maps that set the 'pinning' attribute in their definition will have + * their pin_path attribute set to a file in this directory, and be + * auto-pinned to that path on load; defaults to "/sys/fs/bpf". + */ + const char *pin_root_path; + + __u32 :32; /* stub out now removed attach_prog_fd */ + + /* Additional kernel config content that augments and overrides + * system Kconfig for CONFIG_xxx externs. + */ + const char *kconfig; + /* Path to the custom BTF to be used for BPF CO-RE relocations. + * This custom BTF completely replaces the use of vmlinux BTF + * for the purpose of CO-RE relocations. + * NOTE: any other BPF feature (e.g., fentry/fexit programs, + * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux. + */ + const char *btf_custom_path; + /* Pointer to a buffer for storing kernel logs for applicable BPF + * commands. Valid kernel_log_size has to be specified as well and are + * passed-through to bpf() syscall. Keep in mind that kernel might + * fail operation with -ENOSPC error if provided buffer is too small + * to contain entire log output. + * See the comment below for kernel_log_level for interaction between + * log_buf and log_level settings. + * + * If specified, this log buffer will be passed for: + * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden + * with bpf_program__set_log() on per-program level, to get + * BPF verifier log output. + * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get + * BTF sanity checking log. + * + * Each BPF command (BPF_BTF_LOAD or BPF_PROG_LOAD) will overwrite + * previous contents, so if you need more fine-grained control, set + * per-program buffer with bpf_program__set_log_buf() to preserve each + * individual program's verification log. Keep using kernel_log_buf + * for BTF verification log, if necessary. + */ + char *kernel_log_buf; + size_t kernel_log_size; + /* + * Log level can be set independently from log buffer. Log_level=0 + * means that libbpf will attempt loading BTF or program without any + * logging requested, but will retry with either its own or custom log + * buffer, if provided, and log_level=1 on any error. + * And vice versa, setting log_level>0 will request BTF or prog + * loading with verbose log from the first attempt (and as such also + * for successfully loaded BTF or program), and the actual log buffer + * could be either libbpf's own auto-allocated log buffer, if + * kernel_log_buffer is NULL, or user-provided custom kernel_log_buf. + * If user didn't provide custom log buffer, libbpf will emit captured + * logs through its print callback. + */ + __u32 kernel_log_level; + + size_t :0; +}; +#define bpf_object_open_opts__last_field kernel_log_level + +LIBBPF_API struct bpf_object *bpf_object__open(const char *path); + +/** + * @brief **bpf_object__open_file()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path + * @param opts options for how to load the bpf object, this parameter is + * optional and can be set to NULL + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__open_mem()** creates a bpf_object by reading + * the BPF objects raw bytes from a memory buffer containing a valid + * BPF ELF object file. + * @param obj_buf pointer to the buffer containing ELF file bytes + * @param obj_buf_sz number of bytes in the buffer + * @param opts options for how to load the bpf object + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts); + +/* Load/unload object into/from kernel */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); + +LIBBPF_API void bpf_object__close(struct bpf_object *object); + +/* pin_maps and unpin_maps can both be called with a NULL path, in which case + * they will use the pin_path attribute of each map (and ignore all maps that + * don't have a pin_path set). + */ +LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path); +LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); + +LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); +LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); + +struct btf; +LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); +LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); + +LIBBPF_API struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name); + +LIBBPF_API int +libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type); +LIBBPF_API int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type); +LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type); + +/* Accessors of bpf_program */ +struct bpf_program; + +LIBBPF_API struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); + +#define bpf_object__for_each_program(pos, obj) \ + for ((pos) = bpf_object__next_program((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_program((obj), (pos))) + +LIBBPF_API struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); + +LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, + __u32 ifindex); + +LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); +LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); +LIBBPF_API bool bpf_program__autoattach(const struct bpf_program *prog); +LIBBPF_API void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach); + +struct bpf_insn; + +/** + * @brief **bpf_program__insns()** gives read-only access to BPF program's + * underlying BPF instructions. + * @param prog BPF program for which to return instructions + * @return a pointer to an array of BPF instructions that belong to the + * specified BPF program + * + * Returned pointer is always valid and not NULL. Number of `struct bpf_insn` + * pointed to can be fetched using **bpf_program__insn_cnt()** API. + * + * Keep in mind, libbpf can modify and append/delete BPF program's + * instructions as it processes BPF object file and prepares everything for + * uploading into the kernel. So depending on the point in BPF object + * lifetime, **bpf_program__insns()** can return different sets of + * instructions. As an example, during BPF object load phase BPF program + * instructions will be CO-RE-relocated, BPF subprograms instructions will be + * appended, ldimm64 instructions will have FDs embedded, etc. So instructions + * returned before **bpf_object__load()** and after it might be quite + * different. + */ +LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_insns()** can set BPF program's underlying + * BPF instructions. + * + * WARNING: This is a very advanced libbpf API and users need to know + * what they are doing. This should be used from prog_prepare_load_fn + * callback only. + * + * @param prog BPF program for which to return instructions + * @param new_insns a pointer to an array of BPF instructions + * @param new_insn_cnt number of `struct bpf_insn`'s that form + * specified BPF program + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt); + +/** + * @brief **bpf_program__insn_cnt()** returns number of `struct bpf_insn`'s + * that form specified BPF program. + * @param prog BPF program for which to return number of BPF instructions + * + * See **bpf_program__insns()** documentation for notes on how libbpf can + * change instructions and their count during different phases of + * **bpf_object** lifetime. + */ +LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); + +LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); + +/** + * @brief **bpf_program__pin()** pins the BPF program to a file + * in the BPF FS specified by a path. This increments the programs + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param prog BPF program to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); + +/** + * @brief **bpf_program__unpin()** unpins the BPF program from a file + * in the BPFFS specified by a path. This decrements the programs + * reference count. + * + * The file pinning the BPF program can also be unlinked by a different + * process in which case this function will return an error. + * + * @param prog BPF program to unpin + * @param path file path to the pin in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__unpin(struct bpf_program *prog, const char *path); +LIBBPF_API void bpf_program__unload(struct bpf_program *prog); + +struct bpf_link; + +LIBBPF_API struct bpf_link *bpf_link__open(const char *path); +LIBBPF_API int bpf_link__fd(const struct bpf_link *link); +LIBBPF_API const char *bpf_link__pin_path(const struct bpf_link *link); +/** + * @brief **bpf_link__pin()** pins the BPF link to a file + * in the BPF FS specified by a path. This increments the links + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param link BPF link to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ + +LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); + +/** + * @brief **bpf_link__unpin()** unpins the BPF link from a file + * in the BPFFS specified by a path. This decrements the links + * reference count. + * + * The file pinning the BPF link can also be unlinked by a different + * process in which case this function will return an error. + * + * @param prog BPF program to unpin + * @param path file path to the pin in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_link__unpin(struct bpf_link *link); +LIBBPF_API int bpf_link__update_program(struct bpf_link *link, + struct bpf_program *prog); +LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); +LIBBPF_API int bpf_link__detach(struct bpf_link *link); +LIBBPF_API int bpf_link__destroy(struct bpf_link *link); + +/** + * @brief **bpf_program__attach()** is a generic function for attaching + * a BPF program based on auto-detection of program type, attach type, + * and extra paremeters, where applicable. + * + * @param prog BPF program to attach + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + * + * This is supported for: + * - kprobe/kretprobe (depends on SEC() definition) + * - uprobe/uretprobe (depends on SEC() definition) + * - tracepoint + * - raw tracepoint + * - tracing programs (typed raw TP/fentry/fexit/fmod_ret) + */ +LIBBPF_API struct bpf_link * +bpf_program__attach(const struct bpf_program *prog); + +struct bpf_perf_event_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_perf_event_opts__last_field bpf_cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd); + +LIBBPF_API struct bpf_link * +bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts); + +struct bpf_kprobe_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* function's offset to install kprobe to */ + size_t offset; + /* kprobe is return probe */ + bool retprobe; + size_t :0; +}; +#define bpf_kprobe_opts__last_field retprobe + +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe, + const char *func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_opts(const struct bpf_program *prog, + const char *func_name, + const struct bpf_kprobe_opts *opts); + +struct bpf_kprobe_multi_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* array of function symbols to attach */ + const char **syms; + /* array of function addresses to attach */ + const unsigned long *addrs; + /* array of user-provided values fetchable through bpf_get_attach_cookie */ + const __u64 *cookies; + /* number of elements in syms/addrs/cookies arrays */ + size_t cnt; + /* create return kprobes */ + bool retprobe; + size_t :0; +}; + +#define bpf_kprobe_multi_opts__last_field retprobe + +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, + const char *pattern, + const struct bpf_kprobe_multi_opts *opts); + +struct bpf_ksyscall_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* attach as return probe? */ + bool retprobe; + size_t :0; +}; +#define bpf_ksyscall_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_ksyscall()** attaches a BPF program + * to kernel syscall handler of a specified syscall. Optionally it's possible + * to request to install retprobe that will be triggered at syscall exit. It's + * also possible to associate BPF cookie (though options). + * + * Libbpf automatically will determine correct full kernel function name, + * which depending on system architecture and kernel version/configuration + * could be of the form __<arch>_sys_<syscall> or __se_sys_<syscall>, and will + * attach specified program using kprobe/kretprobe mechanism. + * + * **bpf_program__attach_ksyscall()** is an API counterpart of declarative + * **SEC("ksyscall/<syscall>")** annotation of BPF programs. + * + * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do + * not handle all the calling convention quirks for mmap(), clone() and compat + * syscalls. It also only attaches to "native" syscall interfaces. If host + * system supports compat syscalls or defines 32-bit syscalls in 64-bit + * kernel, such syscall interfaces won't be attached to by libbpf. + * + * These limitations may or may not change in the future. Therefore it is + * recommended to use SEC("kprobe") for these syscalls or if working with + * compat and 32-bit interfaces is required. + * + * @param prog BPF program to attach + * @param syscall_name Symbolic name of the syscall (e.g., "bpf") + * @param opts Additional options (see **struct bpf_ksyscall_opts**) + * @return Reference to the newly created BPF link; or NULL is returned on + * error, error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts); + +struct bpf_uprobe_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* offset of kernel reference counted USDT semaphore, added in + * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + size_t ref_ctr_offset; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* uprobe is return probe, invoked at function return time */ + bool retprobe; + /* Function name to attach to. Could be an unqualified ("abc") or library-qualified + * "abc@LIBXYZ" name. To specify function entry, func_name should be set while + * func_offset argument to bpf_prog__attach_uprobe_opts() should be 0. To trace an + * offset within a function, specify func_name and use func_offset argument to specify + * offset within the function. Shared library functions must specify the shared library + * binary_path. + */ + const char *func_name; + size_t :0; +}; +#define bpf_uprobe_opts__last_field func_name + +/** + * @brief **bpf_program__attach_uprobe()** attaches a BPF program + * to the userspace function which is found by binary path and + * offset. You can optionally specify a particular proccess to attach + * to. You can also optionally attach the program to the function + * exit instead of entry. + * + * @param prog BPF program to attach + * @param retprobe Attach to function exit + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe, + pid_t pid, const char *binary_path, + size_t func_offset); + +/** + * @brief **bpf_program__attach_uprobe_opts()** is just like + * bpf_program__attach_uprobe() except with a options struct + * for various configurations. + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts); + +struct bpf_usdt_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value accessible through usdt_cookie() */ + __u64 usdt_cookie; + size_t :0; +}; +#define bpf_usdt_opts__last_field usdt_cookie + +/** + * @brief **bpf_program__attach_usdt()** is just like + * bpf_program__attach_uprobe_opts() except it covers USDT (User-space + * Statically Defined Tracepoint) attachment, instead of attaching to + * user-space function entry or exit. + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains provided USDT probe + * @param usdt_provider USDT provider name + * @param usdt_name USDT probe name + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts); + +struct bpf_tracepoint_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_tracepoint_opts__last_field bpf_cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name); + +struct bpf_trace_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 cookie; +}; +#define bpf_trace_opts__last_field cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_trace(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_trace_opts(const struct bpf_program *prog, const struct bpf_trace_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_lsm(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); +LIBBPF_API struct bpf_link * +bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, const char *attach_func_name); + +struct bpf_map; + +LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map); + +struct bpf_iter_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + union bpf_iter_link_info *link_info; + __u32 link_info_len; +}; +#define bpf_iter_attach_opts__last_field link_info_len + +LIBBPF_API struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts); + +LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_type()** sets the program + * type of the passed BPF program. + * @param prog BPF program to set the program type for + * @param type program type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int bpf_program__set_type(struct bpf_program *prog, + enum bpf_prog_type type); + +LIBBPF_API enum bpf_attach_type +bpf_program__expected_attach_type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_expected_attach_type()** sets the + * attach type of the passed BPF program. This is used for + * auto-detection of attachment when programs are loaded. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int +bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type); + +LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); + +/* Per-program log level and log buffer getters/setters. + * See bpf_object_open_opts comments regarding log_level and log_buf + * interactions. + */ +LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level); +LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size); +LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size); + +/** + * @brief **bpf_program__set_attach_target()** sets BTF-based attach target + * for supported BPF program types: + * - BTF-aware raw tracepoints (tp_btf); + * - fentry/fexit/fmod_ret; + * - lsm; + * - freplace. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error occurred. + */ +LIBBPF_API int +bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, + const char *attach_func_name); + +/** + * @brief **bpf_object__find_map_by_name()** returns BPF map of + * the given name, if it exists within the passed BPF object + * @param obj BPF object + * @param name name of the BPF map + * @return BPF map instance, if such map exists within the BPF object; + * or NULL otherwise. + */ +LIBBPF_API struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); + +#define bpf_object__for_each_map(pos, obj) \ + for ((pos) = bpf_object__next_map((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_map((obj), (pos))) +#define bpf_map__for_each bpf_object__for_each_map + +LIBBPF_API struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); + +/** + * @brief **bpf_map__set_autocreate()** sets whether libbpf has to auto-create + * BPF map during BPF object load phase. + * @param map the BPF map instance + * @param autocreate whether to create BPF map during BPF object load + * @return 0 on success; -EBUSY if BPF object was already loaded + * + * **bpf_map__set_autocreate()** allows to opt-out from libbpf auto-creating + * BPF map. By default, libbpf will attempt to create every single BPF map + * defined in BPF object file using BPF_MAP_CREATE command of bpf() syscall + * and fill in map FD in BPF instructions. + * + * This API allows to opt-out of this process for specific map instance. This + * can be useful if host kernel doesn't support such BPF map type or used + * combination of flags and user application wants to avoid creating such + * a map in the first place. User is still responsible to make sure that their + * BPF-side code that expects to use such missing BPF map is recognized by BPF + * verifier as dead code, otherwise BPF verifier will reject such BPF program. + */ +LIBBPF_API int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate); +LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); + +/** + * @brief **bpf_map__fd()** gets the file descriptor of the passed + * BPF map + * @param map the BPF map instance + * @return the file descriptor; or -EINVAL in case of an error + */ +LIBBPF_API int bpf_map__fd(const struct bpf_map *map); +LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +/* get map name */ +LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); +/* get/set map type */ +LIBBPF_API enum bpf_map_type bpf_map__type(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); +/* get/set map size (max_entries) */ +LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); +/* get/set map flags */ +LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); +/* get/set map NUMA node */ +LIBBPF_API __u32 bpf_map__numa_node(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node); +/* get/set map key size */ +LIBBPF_API __u32 bpf_map__key_size(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_key_size(struct bpf_map *map, __u32 size); +/* get/set map value size */ +LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_value_size(struct bpf_map *map, __u32 size); +/* get map key/value BTF type IDs */ +LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); +/* get/set map if_index */ +LIBBPF_API __u32 bpf_map__ifindex(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +/* get/set map map_extra flags */ +LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); + +LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size); +LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); + +/** + * @brief **bpf_map__is_internal()** tells the caller whether or not the + * passed map is a special map created by libbpf automatically for things like + * global variables, __ksym externs, Kconfig values, etc + * @param map the bpf_map + * @return true, if the map is an internal map; false, otherwise + */ +LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); +LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); +LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); +LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); +LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); + +LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); + +/** + * @brief **bpf_map__lookup_elem()** allows to lookup BPF map value + * corresponding to provided key. + * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_elem()** is high-level equivalent of + * **bpf_map_lookup_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__update_elem()** allows to insert or update value in BPF + * map that corresponds to provided key. + * @param map BPF map to insert to or update element in + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory containing bytes of the value + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__update_elem()** is high-level equivalent of + * **bpf_map_update_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__delete_elem()** allows to delete element in BPF map that + * corresponds to provided key. + * @param map BPF map to delete element from + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__delete_elem()** is high-level equivalent of + * **bpf_map_delete_elem()** API with added check for key size. + */ +LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags); + +/** + * @brief **bpf_map__lookup_and_delete_elem()** allows to lookup BPF map value + * corresponding to provided key and atomically delete it afterwards. + * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of + * **bpf_map_lookup_and_delete_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__get_next_key()** allows to iterate BPF map keys by + * fetching next key that follows current key. + * @param map BPF map to fetch next key from + * @param cur_key pointer to memory containing bytes of current key or NULL to + * fetch the first key + * @param next_key pointer to memory to write next key into + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @return 0, on success; -ENOENT if **cur_key** is the last key in BPF map; + * negative error, otherwise + * + * **bpf_map__get_next_key()** is high-level equivalent of + * **bpf_map_get_next_key()** API with added check for key size. + */ +LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz); + +struct bpf_xdp_set_link_opts { + size_t sz; + int old_fd; + size_t :0; +}; +#define bpf_xdp_set_link_opts__last_field old_fd + +struct bpf_xdp_attach_opts { + size_t sz; + int old_prog_fd; + size_t :0; +}; +#define bpf_xdp_attach_opts__last_field old_prog_fd + +struct bpf_xdp_query_opts { + size_t sz; + __u32 prog_id; /* output */ + __u32 drv_prog_id; /* output */ + __u32 hw_prog_id; /* output */ + __u32 skb_prog_id; /* output */ + __u8 attach_mode; /* output */ + size_t :0; +}; +#define bpf_xdp_query_opts__last_field attach_mode + +LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_detach(int ifindex, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_query(int ifindex, int flags, struct bpf_xdp_query_opts *opts); +LIBBPF_API int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id); + +/* TC related API */ +enum bpf_tc_attach_point { + BPF_TC_INGRESS = 1 << 0, + BPF_TC_EGRESS = 1 << 1, + BPF_TC_CUSTOM = 1 << 2, +}; + +#define BPF_TC_PARENT(a, b) \ + ((((a) << 16) & 0xFFFF0000U) | ((b) & 0x0000FFFFU)) + +enum bpf_tc_flags { + BPF_TC_F_REPLACE = 1 << 0, +}; + +struct bpf_tc_hook { + size_t sz; + int ifindex; + enum bpf_tc_attach_point attach_point; + __u32 parent; + size_t :0; +}; +#define bpf_tc_hook__last_field parent + +struct bpf_tc_opts { + size_t sz; + int prog_fd; + __u32 flags; + __u32 prog_id; + __u32 handle; + __u32 priority; + size_t :0; +}; +#define bpf_tc_opts__last_field priority + +LIBBPF_API int bpf_tc_hook_create(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_hook_destroy(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_attach(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); + +/* Ring buffer APIs */ +struct ring_buffer; +struct user_ring_buffer; + +typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); + +struct ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatiblity */ +}; + +#define ring_buffer_opts__last_field sz + +LIBBPF_API struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts); +LIBBPF_API void ring_buffer__free(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx); +LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); +LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); + +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/* @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. + */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/* @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + +/* Perf buffer APIs */ +struct perf_buffer; + +typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, + void *data, __u32 size); +typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); + +/* common use perf buffer options */ +struct perf_buffer_opts { + size_t sz; +}; +#define perf_buffer_opts__last_field sz + +/** + * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified + * BPF_PERF_EVENT_ARRAY map + * @param map_fd FD of BPF_PERF_EVENT_ARRAY BPF map that will be used by BPF + * code to send data over to user-space + * @param page_cnt number of memory pages allocated for each per-CPU buffer + * @param sample_cb function called on each received data record + * @param lost_cb function called when record loss has occurred + * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb* + * @return a new instance of struct perf_buffer on success, NULL on error with + * *errno* containing an error code + */ +LIBBPF_API struct perf_buffer * +perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, + const struct perf_buffer_opts *opts); + +enum bpf_perf_event_ret { + LIBBPF_PERF_EVENT_DONE = 0, + LIBBPF_PERF_EVENT_ERROR = -1, + LIBBPF_PERF_EVENT_CONT = -2, +}; + +struct perf_event_header; + +typedef enum bpf_perf_event_ret +(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event); + +/* raw perf buffer options, giving most power and control */ +struct perf_buffer_raw_opts { + size_t sz; + long :0; + long :0; + /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of + * max_entries of given PERF_EVENT_ARRAY map) + */ + int cpu_cnt; + /* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */ + int *cpus; + /* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */ + int *map_keys; +}; +#define perf_buffer_raw_opts__last_field map_keys + +struct perf_event_attr; + +LIBBPF_API struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts); + +LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); +LIBBPF_API int perf_buffer__consume(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx); +LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); +/** + * @brief **perf_buffer__buffer()** returns the per-cpu raw mmap()'ed underlying + * memory region of the ring buffer. + * This ring buffer can be used to implement a custom events consumer. + * The ring buffer starts with the *struct perf_event_mmap_page*, which + * holds the ring buffer managment fields, when accessing the header + * structure it's important to be SMP aware. + * You can refer to *perf_event_read_simple* for a simple example. + * @param pb the perf buffer structure + * @param buf_idx the buffer index to retreive + * @param buf (out) gets the base pointer of the mmap()'ed memory + * @param buf_size (out) gets the size of the mmap()'ed region + * @return 0 on success, negative error code for failure + */ +LIBBPF_API int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, + size_t *buf_size); + +struct bpf_prog_linfo; +struct bpf_prog_info; + +LIBBPF_API void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo); +LIBBPF_API struct bpf_prog_linfo * +bpf_prog_linfo__new(const struct bpf_prog_info *info); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip); + +/* + * Probe for supported system features + * + * Note that running many of these probes in a short amount of time can cause + * the kernel to reach the maximal size of lockable memory allowed for the + * user, causing subsequent probes to fail. In this case, the caller may want + * to adjust that limit with setrlimit(). + */ + +/** + * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports + * BPF programs of a given type. + * @param prog_type BPF program type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given program type is supported; 0, if given program type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_map_type()** detects if host kernel supports + * BPF maps of a given type. + * @param map_type BPF map type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given map type is supported; 0, if given map type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_helper()** detects if host kernel supports the + * use of a given BPF helper from specified BPF program type. + * @param prog_type BPF program type used to check the support of BPF helper + * @param helper_id BPF helper ID (enum bpf_func_id) to check support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given combination of program type and helper is supported; 0, + * if the combination is not supported; negative error code if feature + * detection for provided input arguments failed or can't be performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, + enum bpf_func_id helper_id, const void *opts); + +/** + * @brief **libbpf_num_possible_cpus()** is a helper function to get the + * number of possible CPUs that the host kernel supports and expects. + * @return number of possible CPUs; or error code on failure + * + * Example usage: + * + * int ncpus = libbpf_num_possible_cpus(); + * if (ncpus < 0) { + * // error handling + * } + * long values[ncpus]; + * bpf_map_lookup_elem(per_cpu_map_fd, key, values); + */ +LIBBPF_API int libbpf_num_possible_cpus(void); + +struct bpf_map_skeleton { + const char *name; + struct bpf_map **map; + void **mmaped; +}; + +struct bpf_prog_skeleton { + const char *name; + struct bpf_program **prog; + struct bpf_link **link; +}; + +struct bpf_object_skeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const char *name; + const void *data; + size_t data_sz; + + struct bpf_object **obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; +}; + +LIBBPF_API int +bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts); +LIBBPF_API int bpf_object__load_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); + +struct bpf_var_skeleton { + const char *name; + struct bpf_map **map; + void **addr; +}; + +struct bpf_object_subskeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const struct bpf_object *obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; + + int var_cnt; + int var_skel_sz; /* sizeof(struct bpf_var_skeleton) */ + struct bpf_var_skeleton *vars; +}; + +LIBBPF_API int +bpf_object__open_subskeleton(struct bpf_object_subskeleton *s); +LIBBPF_API void +bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s); + +struct gen_loader_opts { + size_t sz; /* size of this struct, for forward/backward compatiblity */ + const char *data; + const char *insns; + __u32 data_sz; + __u32 insns_sz; +}; + +#define gen_loader_opts__last_field insns_sz +LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, + struct gen_loader_opts *opts); + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker_file_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; +}; +#define bpf_linker_file_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, + const char *filename, + const struct bpf_linker_file_opts *opts); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + +/* + * Custom handling of BPF program's SEC() definitions + */ + +struct bpf_prog_load_opts; /* defined in bpf.h */ + +/* Called during bpf_object__open() for each recognized BPF program. Callback + * can use various bpf_program__set_*() setters to adjust whatever properties + * are necessary. + */ +typedef int (*libbpf_prog_setup_fn_t)(struct bpf_program *prog, long cookie); + +/* Called right before libbpf performs bpf_prog_load() to load BPF program + * into the kernel. Callback can adjust opts as necessary. + */ +typedef int (*libbpf_prog_prepare_load_fn_t)(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie); + +/* Called during skeleton attach or through bpf_program__attach(). If + * auto-attach is not supported, callback should return 0 and set link to + * NULL (it's not considered an error during skeleton attach, but it will be + * an error for bpf_program__attach() calls). On error, error should be + * returned directly and link set to NULL. On success, return 0 and set link + * to a valid struct bpf_link. + */ +typedef int (*libbpf_prog_attach_fn_t)(const struct bpf_program *prog, long cookie, + struct bpf_link **link); + +struct libbpf_prog_handler_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* User-provided value that is passed to prog_setup_fn, + * prog_prepare_load_fn, and prog_attach_fn callbacks. Allows user to + * register one set of callbacks for multiple SEC() definitions and + * still be able to distinguish them, if necessary. For example, + * libbpf itself is using this to pass necessary flags (e.g., + * sleepable flag) to a common internal SEC() handler. + */ + long cookie; + /* BPF program initialization callback (see libbpf_prog_setup_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_setup_fn_t prog_setup_fn; + /* BPF program loading callback (see libbpf_prog_prepare_load_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + /* BPF program attach callback (see libbpf_prog_attach_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_attach_fn_t prog_attach_fn; +}; +#define libbpf_prog_handler_opts__last_field prog_attach_fn + +/** + * @brief **libbpf_register_prog_handler()** registers a custom BPF program + * SEC() handler. + * @param sec section prefix for which custom handler is registered + * @param prog_type BPF program type associated with specified section + * @param exp_attach_type Expected BPF attach type associated with specified section + * @param opts optional cookie, callbacks, and other extra options + * @return Non-negative handler ID is returned on success. This handler ID has + * to be passed to *libbpf_unregister_prog_handler()* to unregister such + * custom handler. Negative error code is returned on error. + * + * *sec* defines which SEC() definitions are handled by this custom handler + * registration. *sec* can have few different forms: + * - if *sec* is just a plain string (e.g., "abc"), it will match only + * SEC("abc"). If BPF program specifies SEC("abc/whatever") it will result + * in an error; + * - if *sec* is of the form "abc/", proper SEC() form is + * SEC("abc/something"), where acceptable "something" should be checked by + * *prog_init_fn* callback, if there are additional restrictions; + * - if *sec* is of the form "abc+", it will successfully match both + * SEC("abc") and SEC("abc/whatever") forms; + * - if *sec* is NULL, custom handler is registered for any BPF program that + * doesn't match any of the registered (custom or libbpf's own) SEC() + * handlers. There could be only one such generic custom handler registered + * at any given time. + * + * All custom handlers (except the one with *sec* == NULL) are processed + * before libbpf's own SEC() handlers. It is allowed to "override" libbpf's + * SEC() handlers by registering custom ones for the same section prefix + * (i.e., it's possible to have custom SEC("perf_event/LLC-load-misses") + * handler). + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. + */ +LIBBPF_API int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts); +/** + * @brief *libbpf_unregister_prog_handler()* unregisters previously registered + * custom BPF program SEC() handler. + * @param handler_id handler ID returned by *libbpf_register_prog_handler()* + * after successful registration + * @return 0 on success, negative error code if handler isn't found + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. + */ +LIBBPF_API int libbpf_unregister_prog_handler(int handler_id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LIBBPF_H */ diff --git a/src/libbpf.map b/src/libbpf.map new file mode 100644 index 0000000..71bf569 --- /dev/null +++ b/src/libbpf.map @@ -0,0 +1,384 @@ +LIBBPF_0.0.1 { + global: + bpf_btf_get_fd_by_id; + bpf_map__btf_key_type_id; + bpf_map__btf_value_type_id; + bpf_map__fd; + bpf_map__name; + bpf_map__pin; + bpf_map__reuse_fd; + bpf_map__set_ifindex; + bpf_map__set_inner_map_fd; + bpf_map__unpin; + bpf_map_delete_elem; + bpf_map_get_fd_by_id; + bpf_map_get_next_id; + bpf_map_get_next_key; + bpf_map_lookup_and_delete_elem; + bpf_map_lookup_elem; + bpf_map_update_elem; + bpf_obj_get; + bpf_obj_get_info_by_fd; + bpf_obj_pin; + bpf_object__btf_fd; + bpf_object__close; + bpf_object__find_map_by_name; + bpf_object__kversion; + bpf_object__load; + bpf_object__name; + bpf_object__open; + bpf_object__pin; + bpf_object__pin_maps; + bpf_object__pin_programs; + bpf_object__unpin_maps; + bpf_object__unpin_programs; + bpf_prog_attach; + bpf_prog_detach; + bpf_prog_detach2; + bpf_prog_get_fd_by_id; + bpf_prog_get_next_id; + bpf_prog_query; + bpf_program__fd; + bpf_program__pin; + bpf_program__set_expected_attach_type; + bpf_program__set_ifindex; + bpf_program__set_type; + bpf_program__unload; + bpf_program__unpin; + bpf_prog_linfo__free; + bpf_prog_linfo__new; + bpf_prog_linfo__lfind_addr_func; + bpf_prog_linfo__lfind; + bpf_raw_tracepoint_open; + bpf_task_fd_query; + btf__fd; + btf__find_by_name; + btf__free; + btf__name_by_offset; + btf__new; + btf__resolve_size; + btf__resolve_type; + btf__type_by_id; + libbpf_attach_type_by_name; + libbpf_get_error; + libbpf_prog_type_by_name; + libbpf_set_print; + libbpf_strerror; + local: + *; +}; + +LIBBPF_0.0.2 { + global: + bpf_map_lookup_elem_flags; + bpf_object__btf; + bpf_object__find_map_fd_by_name; + btf__get_raw_data; + btf_ext__free; + btf_ext__get_raw_data; + btf_ext__new; +} LIBBPF_0.0.1; + +LIBBPF_0.0.3 { + global: + bpf_map__is_internal; + bpf_map_freeze; +} LIBBPF_0.0.2; + +LIBBPF_0.0.4 { + global: + bpf_link__destroy; + bpf_program__attach_kprobe; + bpf_program__attach_perf_event; + bpf_program__attach_raw_tracepoint; + bpf_program__attach_tracepoint; + bpf_program__attach_uprobe; + btf_dump__dump_type; + btf_dump__free; + btf__parse_elf; + libbpf_num_possible_cpus; + perf_buffer__free; + perf_buffer__poll; +} LIBBPF_0.0.3; + +LIBBPF_0.0.5 { + global: + bpf_btf_get_next_id; +} LIBBPF_0.0.4; + +LIBBPF_0.0.6 { + global: + bpf_map__get_pin_path; + bpf_map__is_pinned; + bpf_map__set_pin_path; + bpf_object__open_file; + bpf_object__open_mem; + bpf_program__attach_trace; + bpf_program__get_expected_attach_type; + bpf_program__get_type; + btf__find_by_name_kind; + libbpf_find_vmlinux_btf_id; +} LIBBPF_0.0.5; + +LIBBPF_0.0.7 { + global: + btf_dump__emit_type_decl; + bpf_link__disconnect; + bpf_map__attach_struct_ops; + bpf_map_delete_batch; + bpf_map_lookup_and_delete_batch; + bpf_map_lookup_batch; + bpf_map_update_batch; + bpf_object__find_program_by_name; + bpf_object__attach_skeleton; + bpf_object__destroy_skeleton; + bpf_object__detach_skeleton; + bpf_object__load_skeleton; + bpf_object__open_skeleton; + bpf_program__attach; + bpf_program__name; + btf__align_of; + libbpf_find_kernel_btf; +} LIBBPF_0.0.6; + +LIBBPF_0.0.8 { + global: + bpf_link__fd; + bpf_link__open; + bpf_link__pin; + bpf_link__pin_path; + bpf_link__unpin; + bpf_link__update_program; + bpf_link_create; + bpf_link_update; + bpf_map__set_initial_value; + bpf_prog_attach_opts; + bpf_program__attach_cgroup; + bpf_program__attach_lsm; + bpf_program__set_attach_target; +} LIBBPF_0.0.7; + +LIBBPF_0.0.9 { + global: + bpf_enable_stats; + bpf_iter_create; + bpf_link_get_fd_by_id; + bpf_link_get_next_id; + bpf_program__attach_iter; + bpf_program__attach_netns; + perf_buffer__consume; + ring_buffer__add; + ring_buffer__consume; + ring_buffer__free; + ring_buffer__new; + ring_buffer__poll; +} LIBBPF_0.0.8; + +LIBBPF_0.1.0 { + global: + bpf_link__detach; + bpf_link_detach; + bpf_map__ifindex; + bpf_map__key_size; + bpf_map__map_flags; + bpf_map__max_entries; + bpf_map__numa_node; + bpf_map__set_key_size; + bpf_map__set_map_flags; + bpf_map__set_max_entries; + bpf_map__set_numa_node; + bpf_map__set_type; + bpf_map__set_value_size; + bpf_map__type; + bpf_map__value_size; + bpf_program__attach_xdp; + bpf_program__autoload; + bpf_program__set_autoload; + btf__parse; + btf__parse_raw; + btf__pointer_size; + btf__set_fd; + btf__set_pointer_size; +} LIBBPF_0.0.9; + +LIBBPF_0.2.0 { + global: + bpf_prog_bind_map; + bpf_prog_test_run_opts; + bpf_program__attach_freplace; + bpf_program__section_name; + btf__add_array; + btf__add_const; + btf__add_enum; + btf__add_enum_value; + btf__add_datasec; + btf__add_datasec_var_info; + btf__add_field; + btf__add_func; + btf__add_func_param; + btf__add_func_proto; + btf__add_fwd; + btf__add_int; + btf__add_ptr; + btf__add_restrict; + btf__add_str; + btf__add_struct; + btf__add_typedef; + btf__add_union; + btf__add_var; + btf__add_volatile; + btf__endianness; + btf__find_str; + btf__new_empty; + btf__set_endianness; + btf__str_by_offset; + perf_buffer__buffer_cnt; + perf_buffer__buffer_fd; + perf_buffer__epoll_fd; + perf_buffer__consume_buffer; +} LIBBPF_0.1.0; + +LIBBPF_0.3.0 { + global: + btf__base_btf; + btf__parse_elf_split; + btf__parse_raw_split; + btf__parse_split; + btf__new_empty_split; + btf__new_split; + ring_buffer__epoll_fd; +} LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; + btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; + bpf_map__inner_map; + bpf_object__set_kversion; + bpf_tc_attach; + bpf_tc_detach; + bpf_tc_hook_create; + bpf_tc_hook_destroy; + bpf_tc_query; +} LIBBPF_0.3.0; + +LIBBPF_0.5.0 { + global: + bpf_map__initial_value; + bpf_map__pin_path; + bpf_map_lookup_and_delete_elem_flags; + bpf_program__attach_kprobe_opts; + bpf_program__attach_perf_event_opts; + bpf_program__attach_tracepoint_opts; + bpf_program__attach_uprobe_opts; + bpf_object__gen_loader; + btf__load_from_kernel_by_id; + btf__load_from_kernel_by_id_split; + btf__load_into_kernel; + btf__load_module_btf; + btf__load_vmlinux_btf; + btf_dump__dump_type_data; + libbpf_set_strict_mode; +} LIBBPF_0.4.0; + +LIBBPF_0.6.0 { + global: + bpf_map__map_extra; + bpf_map__set_map_extra; + bpf_map_create; + bpf_object__next_map; + bpf_object__next_program; + bpf_object__prev_map; + bpf_object__prev_program; + bpf_prog_load; + bpf_program__flags; + bpf_program__insn_cnt; + bpf_program__insns; + bpf_program__set_flags; + btf__add_btf; + btf__add_decl_tag; + btf__add_type_tag; + btf__dedup; + btf__raw_data; + btf__type_cnt; + btf_dump__new; + libbpf_major_version; + libbpf_minor_version; + libbpf_version_string; + perf_buffer__new; + perf_buffer__new_raw; +} LIBBPF_0.5.0; + +LIBBPF_0.7.0 { + global: + bpf_btf_load; + bpf_program__expected_attach_type; + bpf_program__log_buf; + bpf_program__log_level; + bpf_program__set_log_buf; + bpf_program__set_log_level; + bpf_program__type; + bpf_xdp_attach; + bpf_xdp_detach; + bpf_xdp_query; + bpf_xdp_query_id; + btf_ext__raw_data; + libbpf_probe_bpf_helper; + libbpf_probe_bpf_map_type; + libbpf_probe_bpf_prog_type; + libbpf_set_memlock_rlim; +} LIBBPF_0.6.0; + +LIBBPF_0.8.0 { + global: + bpf_map__autocreate; + bpf_map__get_next_key; + bpf_map__delete_elem; + bpf_map__lookup_and_delete_elem; + bpf_map__lookup_elem; + bpf_map__set_autocreate; + bpf_map__update_elem; + bpf_map_delete_elem_flags; + bpf_object__destroy_subskeleton; + bpf_object__open_subskeleton; + bpf_program__attach_kprobe_multi_opts; + bpf_program__attach_trace_opts; + bpf_program__attach_usdt; + bpf_program__set_insns; + libbpf_register_prog_handler; + libbpf_unregister_prog_handler; +} LIBBPF_0.7.0; + +LIBBPF_1.0.0 { + global: + bpf_obj_get_opts; + bpf_prog_query_opts; + bpf_program__attach_ksyscall; + bpf_program__autoattach; + bpf_program__set_autoattach; + btf__add_enum64; + btf__add_enum64_value; + libbpf_bpf_attach_type_str; + libbpf_bpf_link_type_str; + libbpf_bpf_map_type_str; + libbpf_bpf_prog_type_str; + perf_buffer__buffer; +} LIBBPF_0.8.0; + +LIBBPF_1.1.0 { + global: + bpf_btf_get_fd_by_id_opts; + bpf_link_get_fd_by_id_opts; + bpf_map_get_fd_by_id_opts; + bpf_prog_get_fd_by_id_opts; + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; diff --git a/src/libbpf.pc.template b/src/libbpf.pc.template new file mode 100644 index 0000000..b45ed53 --- /dev/null +++ b/src/libbpf.pc.template @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +prefix=@PREFIX@ +libdir=@LIBDIR@ +includedir=${prefix}/include + +Name: libbpf +Description: BPF library +Version: @VERSION@ +Libs: -L${libdir} -lbpf +Requires.private: libelf zlib +Cflags: -I${includedir} diff --git a/src/libbpf_common.h b/src/libbpf_common.h new file mode 100644 index 0000000..9a7937f --- /dev/null +++ b/src/libbpf_common.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common user-facing libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_COMMON_H +#define __LIBBPF_LIBBPF_COMMON_H + +#include <string.h> +#include "libbpf_version.h" + +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + +#define LIBBPF_DEPRECATED(msg) __attribute__((deprecated(msg))) + +/* Mark a symbol as deprecated when libbpf version is >= {major}.{minor} */ +#define LIBBPF_DEPRECATED_SINCE(major, minor, msg) \ + __LIBBPF_MARK_DEPRECATED_ ## major ## _ ## minor \ + (LIBBPF_DEPRECATED("libbpf v" # major "." # minor "+: " msg)) + +#define __LIBBPF_CURRENT_VERSION_GEQ(major, minor) \ + (LIBBPF_MAJOR_VERSION > (major) || \ + (LIBBPF_MAJOR_VERSION == (major) && LIBBPF_MINOR_VERSION >= (minor))) + +/* Add checks for other versions below when planning deprecation of API symbols + * with the LIBBPF_DEPRECATED_SINCE macro. + */ +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X +#else +#define __LIBBPF_MARK_DEPRECATED_1_0(X) +#endif + +/* This set of internal macros allows to do "function overloading" based on + * number of arguments provided by used in backwards-compatible way during the + * transition to libbpf 1.0 + * It's ugly but necessary evil that will be cleaned up when we get to 1.0. + * See bpf_prog_load() overload for example. + */ +#define ___libbpf_cat(A, B) A ## B +#define ___libbpf_select(NAME, NUM) ___libbpf_cat(NAME, NUM) +#define ___libbpf_nth(_1, _2, _3, _4, _5, _6, N, ...) N +#define ___libbpf_cnt(...) ___libbpf_nth(__VA_ARGS__, 6, 5, 4, 3, 2, 1) +#define ___libbpf_overload(NAME, ...) ___libbpf_select(NAME, ___libbpf_cnt(__VA_ARGS__))(__VA_ARGS__) + +/* Helper macro to declare and initialize libbpf options struct + * + * This dance with uninitialized declaration, followed by memset to zero, + * followed by assignment using compound literal syntax is done to preserve + * ability to use a nice struct field initialization syntax and **hopefully** + * have all the padding bytes initialized to zero. It's not guaranteed though, + * when copying literal, that compiler won't copy garbage in literal's padding + * bytes, but that's the best way I've found and it seems to work in practice. + * + * Macro declares opts struct of given type and name, zero-initializes, + * including any extra padding, it with memset() and then assigns initial + * values provided by users in struct initializer-syntax as varargs. + */ +#define LIBBPF_OPTS(TYPE, NAME, ...) \ + struct TYPE NAME = ({ \ + memset(&NAME, 0, sizeof(struct TYPE)); \ + (struct TYPE) { \ + .sz = sizeof(struct TYPE), \ + __VA_ARGS__ \ + }; \ + }) + +#endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/src/libbpf_errno.c b/src/libbpf_errno.c new file mode 100644 index 0000000..6b18017 --- /dev/null +++ b/src/libbpf_errno.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. + */ + +#undef _GNU_SOURCE +#include <stdio.h> +#include <string.h> + +#include "libbpf.h" +#include "libbpf_internal.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START) +#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c) +#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START) + +static const char *libbpf_strerror_table[NR_ERRNO] = { + [ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf", + [ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid", + [ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost", + [ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch", + [ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf", + [ERRCODE_OFFSET(RELOC)] = "Relocation failed", + [ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading", + [ERRCODE_OFFSET(PROG2BIG)] = "Program too big", + [ERRCODE_OFFSET(KVER)] = "Incorrect kernel version", + [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", + [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", + [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", +}; + +int libbpf_strerror(int err, char *buf, size_t size) +{ + int ret; + + if (!buf || !size) + return libbpf_err(-EINVAL); + + err = err > 0 ? err : -err; + + if (err < __LIBBPF_ERRNO__START) { + ret = strerror_r(err, buf, size); + buf[size - 1] = '\0'; + return libbpf_err_errno(ret); + } + + if (err < __LIBBPF_ERRNO__END) { + const char *msg; + + msg = libbpf_strerror_table[ERRNO_OFFSET(err)]; + ret = snprintf(buf, size, "%s", msg); + buf[size - 1] = '\0'; + /* The length of the buf and msg is positive. + * A negative number may be returned only when the + * size exceeds INT_MAX. Not likely to appear. + */ + if (ret >= size) + return libbpf_err(-ERANGE); + return 0; + } + + ret = snprintf(buf, size, "Unknown libbpf error %d", err); + buf[size - 1] = '\0'; + if (ret >= size) + return libbpf_err(-ERANGE); + return libbpf_err(-ENOENT); +} diff --git a/src/libbpf_internal.h b/src/libbpf_internal.h new file mode 100644 index 0000000..377642f --- /dev/null +++ b/src/libbpf_internal.h @@ -0,0 +1,579 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Internal libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_INTERNAL_H +#define __LIBBPF_LIBBPF_INTERNAL_H + +#include <stdlib.h> +#include <limits.h> +#include <errno.h> +#include <linux/err.h> +#include <fcntl.h> +#include <unistd.h> +#include "relo_core.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +#include "libbpf.h" +#include "btf.h" + +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_ABS64 +#define R_BPF_64_ABS64 2 +#endif +#ifndef R_BPF_64_ABS32 +#define R_BPF_64_ABS32 3 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif + +/* Older libelf all end up in this expression, for both 32 and 64 bit */ +#ifndef ELF64_ST_VISIBILITY +#define ELF64_ST_VISIBILITY(o) ((o) & 0x03) +#endif + +#define BTF_INFO_ENC(kind, kind_flag, vlen) \ + ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) +#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) +#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \ + ((encoding) << 24 | (bits_offset) << 16 | (nr_bits)) +#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ + BTF_INT_ENC(encoding, bits_offset, bits) +#define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) +#define BTF_PARAM_ENC(name, type) (name), (type) +#define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) +#define BTF_TYPE_DECL_TAG_ENC(value, type, component_idx) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_DECL_TAG, 0, 0), type), (component_idx) +#define BTF_TYPE_TYPE_TAG_ENC(value, type) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_TYPE_TAG, 0, 0), type) + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef min +# define min(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef max +# define max(x, y) ((x) < (y) ? (y) : (x)) +#endif +#ifndef offsetofend +# define offsetofend(TYPE, FIELD) \ + (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) +#endif +#ifndef __alias +#define __alias(symbol) __attribute__((alias(#symbol))) +#endif + +/* Check whether a string `str` has prefix `pfx`, regardless if `pfx` is + * a string literal known at compilation time or char * pointer known only at + * runtime. + */ +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) + +/* suffix check */ +static inline bool str_has_sfx(const char *str, const char *sfx) +{ + size_t str_len = strlen(str); + size_t sfx_len = strlen(sfx); + + if (sfx_len > str_len) + return false; + return strcmp(str + str_len - sfx_len, sfx) == 0; +} + +/* Symbol versioning is different between static and shared library. + * Properly versioned symbols are needed for shared library, but + * only the symbol of the new version is needed for static library. + * Starting with GNU C 10, use symver attribute instead of .symver assembler + * directive, which works better with GCC LTO builds. + */ +#if defined(SHARED) && defined(__GNUC__) && __GNUC__ >= 10 + +#define DEFAULT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@@" #version))) +#define COMPAT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@" #version))) + +#elif defined(SHARED) + +#define COMPAT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@" #version); +#define DEFAULT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@@" #version); + +#else /* !SHARED */ + +#define COMPAT_VERSION(internal_name, api_name, version) +#define DEFAULT_VERSION(internal_name, api_name, version) \ + extern typeof(internal_name) api_name \ + __attribute__((alias(#internal_name))); + +#endif + +extern void libbpf_print(enum libbpf_print_level level, + const char *format, ...) + __attribute__((format(printf, 2, 3))); + +#define __pr(level, fmt, ...) \ +do { \ + libbpf_print(level, "libbpf: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_warn(fmt, ...) __pr(LIBBPF_WARN, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +struct bpf_link { + int (*detach)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); + char *pin_path; /* NULL, if not pinned */ + int fd; /* hook FD, -1 if not applicable */ + bool disconnected; +}; + +/* + * Re-implement glibc's reallocarray() for libbpf internal-only use. + * reallocarray(), unfortunately, is not available in all versions of glibc, + * so requires extra feature detection and using reallocarray() stub from + * <tools/libc_compat.h> and COMPAT_NEED_REALLOCARRAY. All this complicates + * build of libbpf unnecessarily and is just a maintenance burden. Instead, + * it's trivial to implement libbpf-specific internal version and use it + * throughout libbpf. + */ +static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) +{ + size_t total; + +#if __has_builtin(__builtin_mul_overflow) + if (unlikely(__builtin_mul_overflow(nmemb, size, &total))) + return NULL; +#else + if (size == 0 || nmemb > ULONG_MAX / size) + return NULL; + total = nmemb * size; +#endif + return realloc(ptr, total); +} + +/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst + * is zero-terminated string no matter what (unless sz == 0, in which case + * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs + * in what is returned. Given this is internal helper, it's trivial to extend + * this, when necessary. Use this instead of strncpy inside libbpf source code. + */ +static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) +{ + size_t i; + + if (sz == 0) + return; + + sz--; + for (i = 0; i < sz && src[i]; i++) + dst[i] = src[i]; + dst[i] = '\0'; +} + +__u32 get_kernel_version(void); + +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + MAP_DEF_MAP_EXTRA = 0x400, + + MAP_DEF_ALL = 0x7ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; + __u64 map_extra; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); + +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); + +static inline bool libbpf_is_mem_zeroed(const char *p, ssize_t len) +{ + while (len > 0) { + if (*p) + return false; + p++; + len--; + } + return true; +} + +static inline bool libbpf_validate_opts(const char *opts, + size_t opts_sz, size_t user_sz, + const char *type_name) +{ + if (user_sz < sizeof(size_t)) { + pr_warn("%s size (%zu) is too small\n", type_name, user_sz); + return false; + } + if (!libbpf_is_mem_zeroed(opts + opts_sz, (ssize_t)user_sz - opts_sz)) { + pr_warn("%s has non-zero extra bytes\n", type_name); + return false; + } + return true; +} + +#define OPTS_VALID(opts, type) \ + (!(opts) || libbpf_validate_opts((const char *)opts, \ + offsetofend(struct type, \ + type##__last_field), \ + (opts)->sz, #type)) +#define OPTS_HAS(opts, field) \ + ((opts) && opts->sz >= offsetofend(typeof(*(opts)), field)) +#define OPTS_GET(opts, field, fallback_value) \ + (OPTS_HAS(opts, field) ? (opts)->field : fallback_value) +#define OPTS_SET(opts, field, value) \ + do { \ + if (OPTS_HAS(opts, field)) \ + (opts)->field = value; \ + } while (0) + +#define OPTS_ZEROED(opts, last_nonzero_field) \ +({ \ + ssize_t __off = offsetofend(typeof(*(opts)), last_nonzero_field); \ + !(opts) || libbpf_is_mem_zeroed((const void *)opts + __off, \ + (opts)->sz - __off); \ +}) + +enum kern_feature_id { + /* v4.14: kernel support for program & map names. */ + FEAT_PROG_NAME, + /* v5.2: kernel support for global data sections. */ + FEAT_GLOBAL_DATA, + /* BTF support */ + FEAT_BTF, + /* BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO support */ + FEAT_BTF_FUNC, + /* BTF_KIND_VAR and BTF_KIND_DATASEC support */ + FEAT_BTF_DATASEC, + /* BTF_FUNC_GLOBAL is supported */ + FEAT_BTF_GLOBAL_FUNC, + /* BPF_F_MMAPABLE is supported for arrays */ + FEAT_ARRAY_MMAP, + /* kernel support for expected_attach_type in BPF_PROG_LOAD */ + FEAT_EXP_ATTACH_TYPE, + /* bpf_probe_read_{kernel,user}[_str] helpers */ + FEAT_PROBE_READ_KERN, + /* BPF_PROG_BIND_MAP is supported */ + FEAT_PROG_BIND_MAP, + /* Kernel support for module BTFs */ + FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, + /* BPF perf link support */ + FEAT_PERF_LINK, + /* BTF_KIND_DECL_TAG support */ + FEAT_BTF_DECL_TAG, + /* BTF_KIND_TYPE_TAG support */ + FEAT_BTF_TYPE_TAG, + /* memcg-based accounting for BPF maps and progs */ + FEAT_MEMCG_ACCOUNT, + /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ + FEAT_BPF_COOKIE, + /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ + FEAT_BTF_ENUM64, + /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ + FEAT_SYSCALL_WRAPPER, + __FEAT_CNT, +}; + +int probe_memcg_account(void); +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id); +int bump_rlimit_memlock(void); + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len); +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level); + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind); + +struct btf_ext_info { + /* + * info points to the individual info section (e.g. func_info and + * line_info) from the .BTF.ext. It does not include the __u32 rec_size. + */ + void *info; + __u32 rec_size; + __u32 len; + /* optional (maintained internally by libbpf) mapping between .BTF.ext + * section and corresponding ELF section. This is used to join + * information like CO-RE relocation records with corresponding BPF + * programs defined in ELF sections + */ + __u32 *sec_idxs; + int sec_cnt; +}; + +#define for_each_btf_ext_sec(seg, sec) \ + for (sec = (seg)->info; \ + (void *)sec < (seg)->info + (seg)->len; \ + sec = (void *)sec + sizeof(struct btf_ext_info_sec) + \ + (seg)->rec_size * sec->num_info) + +#define for_each_btf_ext_rec(seg, sec, i, rec) \ + for (i = 0, rec = (void *)&(sec)->data; \ + i < (sec)->num_info; \ + i++, rec = (void *)rec + (seg)->rec_size) + +/* + * The .BTF.ext ELF section layout defined as + * struct btf_ext_header + * func_info subsection + * + * The func_info subsection layout: + * record size for struct bpf_func_info in the func_info subsection + * struct btf_sec_func_info for section #1 + * a list of bpf_func_info records for section #1 + * where struct bpf_func_info mimics one in include/uapi/linux/bpf.h + * but may not be identical + * struct btf_sec_func_info for section #2 + * a list of bpf_func_info records for section #2 + * ...... + * + * Note that the bpf_func_info record size in .BTF.ext may not + * be the same as the one defined in include/uapi/linux/bpf.h. + * The loader should ensure that record_size meets minimum + * requirement and pass the record as is to the kernel. The + * kernel will handle the func_info properly based on its contents. + */ +struct btf_ext_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; + + /* optional part of .BTF.ext header */ + __u32 core_relo_off; + __u32 core_relo_len; +}; + +struct btf_ext { + union { + struct btf_ext_header *hdr; + void *data; + }; + struct btf_ext_info func_info; + struct btf_ext_info line_info; + struct btf_ext_info core_relo_info; + __u32 data_size; +}; + +struct btf_ext_info_sec { + __u32 sec_name_off; + __u32 num_info; + /* Followed by num_info * record_size number of bytes */ + __u8 data[]; +}; + +/* The minimum bpf_func_info checked by the loader */ +struct bpf_func_info_min { + __u32 insn_off; + __u32 type_id; +}; + +/* The minimum bpf_line_info checked by the loader */ +struct bpf_line_info_min { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + + +typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind); + +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); + +/* handle direct returned errors */ +static inline int libbpf_err(int ret) +{ + if (ret < 0) + errno = -ret; + return ret; +} + +/* handle errno-based (e.g., syscall or libc) errors according to libbpf's + * strict mode settings + */ +static inline int libbpf_err_errno(int ret) +{ + /* errno is already assumed to be set on error */ + return ret < 0 ? -errno : ret; +} + +/* handle error for pointer-returning APIs, err is assumed to be < 0 always */ +static inline void *libbpf_err_ptr(int err) +{ + /* set errno on error, this doesn't break anything */ + errno = -err; + return NULL; +} + +/* handle pointer-returning APIs' error handling */ +static inline void *libbpf_ptr(void *ret) +{ + /* set errno on error, this doesn't break anything */ + if (IS_ERR(ret)) + errno = -PTR_ERR(ret); + + return IS_ERR(ret) ? NULL : ret; +} + +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +/* if fd is stdin, stdout, or stderr, dup to a fd greater than 2 + * Takes ownership of the fd passed in, and closes it if calling + * fcntl(fd, F_DUPFD_CLOEXEC, 3). + */ +static inline int ensure_good_fd(int fd) +{ + int old_fd = fd, saved_errno; + + if (fd < 0) + return fd; + if (fd < 3) { + fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + saved_errno = errno; + close(old_fd); + if (fd < 0) { + pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); + errno = saved_errno; + } + } + return fd; +} + +/* The following two functions are exposed to bpftool */ +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands); +void bpf_core_free_cands(struct bpf_core_cand_list *cands); + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj); +void usdt_manager_free(struct usdt_manager *man); +struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, + const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie); + +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +#define PROG_LOAD_ATTEMPTS 5 +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); + +#endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/src/libbpf_legacy.h b/src/libbpf_legacy.h new file mode 100644 index 0000000..1e1be46 --- /dev/null +++ b/src/libbpf_legacy.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Libbpf legacy APIs (either discouraged or deprecated, as mentioned in [0]) + * + * [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY + * + * Copyright (C) 2021 Facebook + */ +#ifndef __LIBBPF_LEGACY_BPF_H +#define __LIBBPF_LEGACY_BPF_H + +#include <linux/bpf.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_struct_mode have + * no effect. But they are left in libbpf_legacy.h so that applications that + * prepared for libbpf 1.0 before final release by using + * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes. + */ +enum libbpf_strict_mode { + /* Turn on all supported strict features of libbpf to simulate libbpf + * v1.0 behavior. + * This will be the default behavior in libbpf v1.0. + */ + LIBBPF_STRICT_ALL = 0xffffffff, + + /* + * Disable any libbpf 1.0 behaviors. This is the default before libbpf + * v1.0. It won't be supported anymore in v1.0, please update your + * code so that it handles LIBBPF_STRICT_ALL mode before libbpf v1.0. + */ + LIBBPF_STRICT_NONE = 0x00, + /* + * Return NULL pointers on error, not ERR_PTR(err). + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_CLEAN_PTRS = 0x01, + /* + * Return actual error codes from low-level APIs directly, not just -1. + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_DIRECT_ERRS = 0x02, + /* + * Enforce strict BPF program section (SEC()) names. + * E.g., while prefiously SEC("xdp_whatever") or SEC("perf_event_blah") were + * allowed, with LIBBPF_STRICT_SEC_PREFIX this will become + * unrecognized by libbpf and would have to be just SEC("xdp") and + * SEC("xdp") and SEC("perf_event"). + * + * Note, in this mode the program pin path will be based on the + * function name instead of section name. + * + * Additionally, routines in the .text section are always considered + * sub-programs. Legacy behavior allows for a single routine in .text + * to be a program. + */ + LIBBPF_STRICT_SEC_NAME = 0x04, + /* + * Disable the global 'bpf_objects_list'. Maintaining this list adds + * a race condition to bpf_object__open() and bpf_object__close(). + * Clients can maintain it on their own if it is valuable for them. + */ + LIBBPF_STRICT_NO_OBJECT_LIST = 0x08, + /* + * Automatically bump RLIMIT_MEMLOCK using setrlimit() before the + * first BPF program or map creation operation. This is done only if + * kernel is too old to support memcg-based memory accounting for BPF + * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, + * but it can be overriden with libbpf_set_memlock_rlim() API. + * Note that libbpf_set_memlock_rlim() needs to be called before + * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() + * operation. + */ + LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK = 0x10, + /* + * Error out on any SEC("maps") map definition, which are deprecated + * in favor of BTF-defined map definitions in SEC(".maps"). + */ + LIBBPF_STRICT_MAP_DEFINITIONS = 0x20, + + __LIBBPF_STRICT_LAST, +}; + +LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); + +/** + * @brief **libbpf_get_error()** extracts the error code from the passed + * pointer + * @param ptr pointer returned from libbpf API function + * @return error code; or 0 if no error occured + * + * Note, as of libbpf 1.0 this function is not necessary and not recommended + * to be used. Libbpf doesn't return error code embedded into the pointer + * itself. Instead, NULL is returned on error and error code is passed through + * thread-local errno variable. **libbpf_get_error()** is just returning -errno + * value if it receives NULL, which is correct only if errno hasn't been + * modified between libbpf API call and corresponding **libbpf_get_error()** + * call. Prefer to check return for NULL and use errno directly. + * + * This API is left in libbpf 1.0 to allow applications that were 1.0-ready + * before final libbpf 1.0 without needing to change them. + */ +LIBBPF_API long libbpf_get_error(const void *ptr); + +#define DECLARE_LIBBPF_OPTS LIBBPF_OPTS + +/* "Discouraged" APIs which don't follow consistent libbpf naming patterns. + * They are normally a trivial aliases or wrappers for proper APIs and are + * left to minimize unnecessary disruption for users of libbpf. But they + * shouldn't be used going forward. + */ + +struct bpf_program; +struct bpf_map; +struct btf; +struct btf_ext; + +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); +LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); +LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); +LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LEGACY_BPF_H */ diff --git a/src/libbpf_probes.c b/src/libbpf_probes.c new file mode 100644 index 0000000..b44fcbb --- /dev/null +++ b/src/libbpf_probes.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. */ + +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <net/if.h> +#include <sys/utsname.h> + +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/kernel.h> + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +static int probe_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, size_t insns_cnt, + char *log_buf, size_t log_buf_sz) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = log_buf_sz, + .log_level = log_buf ? 1 : 0, + ); + int fd, err, exp_err = 0; + const char *exp_msg = NULL; + char buf[4096]; + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + opts.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + break; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + opts.expected_attach_type = BPF_CGROUP_GETSOCKOPT; + break; + case BPF_PROG_TYPE_SK_LOOKUP: + opts.expected_attach_type = BPF_SK_LOOKUP; + break; + case BPF_PROG_TYPE_KPROBE: + opts.kern_version = get_kernel_version(); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + opts.expected_attach_type = BPF_LIRC_MODE2; + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + if (prog_type == BPF_PROG_TYPE_TRACING) + opts.expected_attach_type = BPF_TRACE_FENTRY; + else + opts.expected_attach_type = BPF_MODIFY_RETURN; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "attach_btf_id 1 is not a function"; + break; + case BPF_PROG_TYPE_EXT: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "Cannot replace kernel functions"; + break; + case BPF_PROG_TYPE_SYSCALL: + opts.prog_flags = BPF_F_SLEEPABLE; + break; + case BPF_PROG_TYPE_STRUCT_OPS: + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_PROG_TYPE_UNSPEC: + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + break; + default: + return -EOPNOTSUPP; + } + + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); + err = -errno; + if (fd >= 0) + close(fd); + if (exp_err) { + if (fd >= 0 || err != exp_err) + return 0; + if (exp_msg && !strstr(buf, exp_msg)) + return 0; + return 1; + } + return fd >= 0 ? 1 : 0; +} + +int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); + return libbpf_err(ret); +} + +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len) +{ + struct btf_header hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_len = types_len, + .str_off = types_len, + .str_len = str_len, + }; + int btf_fd, btf_len; + __u8 *raw_btf; + + btf_len = hdr.hdr_len + hdr.type_len + hdr.str_len; + raw_btf = malloc(btf_len); + if (!raw_btf) + return -ENOMEM; + + memcpy(raw_btf, &hdr, sizeof(hdr)); + memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); + memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); + + btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); + + free(raw_btf); + return btf_fd; +} + +static int load_local_storage_btf(void) +{ + const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l"; + /* struct bpf_spin_lock { + * int val; + * }; + * struct val { + * int cnt; + * struct bpf_spin_lock l; + * }; + */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* struct bpf_spin_lock */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), + BTF_MEMBER_ENC(15, 1, 0), /* int val; */ + /* struct val */ /* [3] */ + BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8), + BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */ + BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */ + }; + + return libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs)); +} + +static int probe_map_create(enum bpf_map_type map_type) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int key_size, value_size, max_entries; + __u32 btf_key_type_id = 0, btf_value_type_id = 0; + int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err = 0; + + key_size = sizeof(__u32); + value_size = sizeof(__u32); + max_entries = 1; + + switch (map_type) { + case BPF_MAP_TYPE_STACK_TRACE: + value_size = sizeof(__u64); + break; + case BPF_MAP_TYPE_LPM_TRIE: + key_size = sizeof(__u64); + value_size = sizeof(__u64); + opts.map_flags = BPF_F_NO_PREALLOC; + break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + key_size = sizeof(struct bpf_cgroup_storage_key); + value_size = sizeof(__u64); + max_entries = 0; + break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + key_size = 0; + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + btf_key_type_id = 1; + btf_value_type_id = 3; + value_size = 8; + max_entries = 0; + opts.map_flags = BPF_F_NO_PREALLOC; + btf_fd = load_local_storage_btf(); + if (btf_fd < 0) + return btf_fd; + break; + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + key_size = 0; + value_size = 0; + max_entries = sysconf(_SC_PAGE_SIZE); + break; + case BPF_MAP_TYPE_STRUCT_OPS: + /* we'll get -ENOTSUPP for invalid BTF type ID for struct_ops */ + opts.btf_vmlinux_value_type_id = 1; + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_MAP_TYPE_BLOOM_FILTER: + key_size = 0; + max_entries = 1; + break; + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + break; + case BPF_MAP_TYPE_UNSPEC: + default: + return -EOPNOTSUPP; + } + + if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, + sizeof(__u32), sizeof(__u32), 1, NULL); + if (fd_inner < 0) + goto cleanup; + + opts.inner_map_fd = fd_inner; + } + + if (btf_fd >= 0) { + opts.btf_fd = btf_fd; + opts.btf_key_type_id = btf_key_type_id; + opts.btf_value_type_id = btf_value_type_id; + } + + fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); + err = -errno; + +cleanup: + if (fd >= 0) + close(fd); + if (fd_inner >= 0) + close(fd_inner); + if (btf_fd >= 0) + close(btf_fd); + + if (exp_err) + return fd < 0 && err == exp_err ? 1 : 0; + else + return fd >= 0 ? 1 : 0; +} + +int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) +{ + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_map_create(map_type); + return libbpf_err(ret); +} + +int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, + const void *opts) +{ + struct bpf_insn insns[] = { + BPF_EMIT_CALL((__u32)helper_id), + BPF_EXIT_INSN(), + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + char buf[4096]; + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + /* we can't successfully load all prog types to check for BPF helper + * support, so bail out with -EOPNOTSUPP error + */ + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + return -EOPNOTSUPP; + default: + break; + } + + buf[0] = '\0'; + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); + if (ret < 0) + return libbpf_err(ret); + + /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) + * at all, it will emit something like "invalid func unknown#181". + * If BPF verifier recognizes BPF helper but it's not supported for + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * In both cases, provided combination of BPF program type and BPF + * helper is not supported by the kernel. + * In all other cases, probe_prog_load() above will either succeed (e.g., + * because BPF helper happens to accept no input arguments or it + * accepts one input argument and initial PTR_TO_CTX is fine for + * that), or we'll get some more specific BPF verifier error about + * some unsatisfied conditions. + */ + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + return 0; + return 1; /* assume supported */ +} diff --git a/src/libbpf_version.h b/src/libbpf_version.h new file mode 100644 index 0000000..e944f5b --- /dev/null +++ b/src/libbpf_version.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2021 Facebook */ +#ifndef __LIBBPF_VERSION_H +#define __LIBBPF_VERSION_H + +#define LIBBPF_MAJOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 1 + +#endif /* __LIBBPF_VERSION_H */ diff --git a/src/linker.c b/src/linker.c new file mode 100644 index 0000000..4ac02c2 --- /dev/null +++ b/src/linker.c @@ -0,0 +1,2900 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <linux/err.h> +#include <linux/btf.h> +#include <elf.h> +#include <libelf.h> +#include <fcntl.h> +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +#define BTF_EXTERN_SEC ".extern" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is omitted from the final ELF file */ + bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. */ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; +}; + +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; + + /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; +}; + +#define pr_warn_elf(fmt, ...) \ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); + } + free(linker->secs); + + free(linker->glob_syms); + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return errno = EINVAL, NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return errno = EINVAL, NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return errno = ENOMEM, NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return errno = -err, NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr) { + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts) +{ + struct src_obj obj = {}; + int err = 0; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = err ?: linker_load_obj_file(linker, filename, opts, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); + + /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return libbpf_err(err); +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY | O_CLOEXEC); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); + + return err; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec; + int i, err; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; + break; + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case SHT_NOBITS: + break; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); + return -EINVAL; + } + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); + + return -EINVAL; + } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } + + return 0; +} + +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel<secname> -> <secname> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32 && + sym_type != R_BPF_64_ABS64 && sym_type != R_BPF_64_ABS32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + + return 0; +} + +static int check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id >= btf__type_cnt(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) + return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; + + /* ephemeral sections are just thin section shells lacking most parts */ + if (src_sec->ephemeral) + return 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -ENOMEM; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->ephemeral || src->ephemeral) + return true; + + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align, src_align; + size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + if (!tmp) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped || sec->ephemeral) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, "license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; + int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; + } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? "union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } + + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n, m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) + continue; + + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) + continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + return -EINVAL; + } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; + } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } + + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} + +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for extern in one file has less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different. + */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= ~0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. + */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } + + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); + + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; + } + + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. .extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + struct dst_sec *dst_symtab; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("sections %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* add_dst_sec() above could have invalidated linker->secs */ + dst_symtab = &linker->secs[linker->symtab_sec_idx]; + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); + size_t sym_type = ELF64_R_TYPE(src_rel->r_info); + Elf64_Sym *src_sym, *dst_sym; + size_t dst_sym_idx; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec *sec; + int i, j, n, m; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + const char *name; + + if (!obj->btf) + return 0; + + start_id = btf__type_cnt(linker->btf); + n = btf__type_cnt(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i < n; i++) { + struct glob_sym *glob_sym = NULL; + + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } + } + + /* remap all the types except DATASECs */ + n = btf__type_cnt(linker->btf); + for (i = start_id; i < n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. + */ + dst_sec->has_btf = true; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. + */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine */ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = finalize_btf(linker); + if (err) + return libbpf_err(err); + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return libbpf_err(err); + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return libbpf_err(err); + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + LIBBPF_OPTS(btf_dedup_opts, opts); + struct btf *btf = linker->btf; + const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__type_cnt(linker->btf) == 1) + return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->has_btf) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + opts.btf_ext = linker->btf_ext; + err = btf__dedup(linker->btf, &opts); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + raw_data = btf__raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + goto out; + } + +out: + free(data); + return err; +} diff --git a/src/netlink.c b/src/netlink.c new file mode 100644 index 0000000..3510458 --- /dev/null +++ b/src/netlink.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include <stdlib.h> +#include <memory.h> +#include <unistd.h> +#include <arpa/inet.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/pkt_cls.h> +#include <linux/rtnetlink.h> +#include <sys/socket.h> +#include <errno.h> +#include <time.h> + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); + +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, + void *cookie); + +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + +struct xdp_id_md { + int ifindex; + __u32 flags; + struct xdp_link_info info; +}; + +static int libbpf_netlink_open(__u32 *nl_pid) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + pr_warn("Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static void libbpf_netlink_close(int sock) +{ + close(sock); +} + +enum { + NL_CONT, + NL_NEXT, + NL_DONE, +}; + +static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags) +{ + int len; + + do { + len = recvmsg(sock, mhdr, flags); + } while (len < 0 && (errno == EINTR || errno == EAGAIN)); + + if (len < 0) + return -errno; + return len; +} + +static int alloc_iov(struct iovec *iov, int len) +{ + void *nbuf; + + nbuf = realloc(iov->iov_base, len); + if (!nbuf) + return -ENOMEM; + + iov->iov_base = nbuf; + iov->iov_len = len; + return 0; +} + +static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct iovec iov = {}; + struct msghdr mhdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + int len, ret; + + ret = alloc_iov(&iov, 4096); + if (ret) + goto done; + + while (multipart) { +start: + multipart = false; + len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC); + if (len < 0) { + ret = len; + goto done; + } + + if (len > iov.iov_len) { + ret = alloc_iov(&iov, len); + if (ret) + goto done; + } + + len = netlink_recvmsg(sock, &mhdr, 0); + if (len < 0) { + ret = len; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + libbpf_nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + ret = 0; + goto done; + default: + break; + } + if (_fn) { + ret = _fn(nh, fn, cookie); + switch (ret) { + case NL_CONT: + break; + case NL_NEXT: + goto start; + case NL_DONE: + ret = 0; + goto done; + default: + goto done; + } + } + } + } + ret = 0; +done: + free(iov.iov_base); + return ret; +} + +static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, + __dump_nlmsg_t parse_msg, + libbpf_dump_nlmsg_t parse_attr, + void *cookie) +{ + __u32 nl_pid = 0; + int sock, ret; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + req->nh.nlmsg_pid = 0; + req->nh.nlmsg_seq = time(NULL); + + if (send(sock, req, req->nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto out; + } + + ret = libbpf_netlink_recv(sock, nl_pid, req->nh.nlmsg_seq, + parse_msg, parse_attr, cookie); +out: + libbpf_netlink_close(sock); + return ret; +} + +static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, + __u32 flags) +{ + struct nlattr *nla; + int ret; + struct libbpf_nla_req req; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + nla = nlattr_begin_nested(&req, IFLA_XDP); + if (!nla) + return -EMSGSIZE; + ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + if (flags) { + ret = nlattr_add(&req, IFLA_XDP_FLAGS, &flags, sizeof(flags)); + if (ret < 0) + return ret; + } + if (flags & XDP_FLAGS_REPLACE) { + ret = nlattr_add(&req, IFLA_XDP_EXPECTED_FD, &old_fd, + sizeof(old_fd)); + if (ret < 0) + return ret; + } + nlattr_end_nested(&req, nla); + + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); +} + +int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + int old_prog_fd, err; + + if (!OPTS_VALID(opts, bpf_xdp_attach_opts)) + return libbpf_err(-EINVAL); + + old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + if (old_prog_fd) + flags |= XDP_FLAGS_REPLACE; + else + old_prog_fd = -1; + + err = __bpf_set_link_xdp_fd_replace(ifindex, prog_fd, old_prog_fd, flags); + return libbpf_err(err); +} + +int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + return bpf_xdp_attach(ifindex, -1, flags, opts); +} + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + + if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb) +{ + struct nlattr *xdp_tb[IFLA_XDP_MAX + 1]; + struct xdp_id_md *xdp_id = cookie; + struct ifinfomsg *ifinfo = msg; + int ret; + + if (xdp_id->ifindex && xdp_id->ifindex != ifinfo->ifi_index) + return 0; + + if (!tb[IFLA_XDP]) + return 0; + + ret = libbpf_nla_parse_nested(xdp_tb, IFLA_XDP_MAX, tb[IFLA_XDP], NULL); + if (ret) + return ret; + + if (!xdp_tb[IFLA_XDP_ATTACHED]) + return 0; + + xdp_id->info.attach_mode = libbpf_nla_getattr_u8( + xdp_tb[IFLA_XDP_ATTACHED]); + + if (xdp_id->info.attach_mode == XDP_ATTACHED_NONE) + return 0; + + if (xdp_tb[IFLA_XDP_PROG_ID]) + xdp_id->info.prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_PROG_ID]); + + if (xdp_tb[IFLA_XDP_SKB_PROG_ID]) + xdp_id->info.skb_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_SKB_PROG_ID]); + + if (xdp_tb[IFLA_XDP_DRV_PROG_ID]) + xdp_id->info.drv_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_DRV_PROG_ID]); + + if (xdp_tb[IFLA_XDP_HW_PROG_ID]) + xdp_id->info.hw_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_HW_PROG_ID]); + + return 0; +} + +int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifinfo.ifi_family = AF_PACKET, + }; + struct xdp_id_md xdp_id = {}; + int err; + + if (!OPTS_VALID(opts, bpf_xdp_query_opts)) + return libbpf_err(-EINVAL); + + if (xdp_flags & ~XDP_FLAGS_MASK) + return libbpf_err(-EINVAL); + + /* Check whether the single {HW,DRV,SKB} mode is set */ + xdp_flags &= XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE; + if (xdp_flags & (xdp_flags - 1)) + return libbpf_err(-EINVAL); + + xdp_id.ifindex = ifindex; + xdp_id.flags = xdp_flags; + + err = libbpf_netlink_send_recv(&req, __dump_link_nlmsg, + get_xdp_info, &xdp_id); + if (err) + return libbpf_err(err); + + OPTS_SET(opts, prog_id, xdp_id.info.prog_id); + OPTS_SET(opts, drv_prog_id, xdp_id.info.drv_prog_id); + OPTS_SET(opts, hw_prog_id, xdp_id.info.hw_prog_id); + OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id); + OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode); + + return 0; +} + +int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int ret; + + ret = bpf_xdp_query(ifindex, flags, &opts); + if (ret) + return libbpf_err(ret); + + flags &= XDP_FLAGS_MODES; + + if (opts.attach_mode != XDP_ATTACHED_MULTI && !flags) + *prog_id = opts.prog_id; + else if (flags & XDP_FLAGS_DRV_MODE) + *prog_id = opts.drv_prog_id; + else if (flags & XDP_FLAGS_HW_MODE) + *prog_id = opts.hw_prog_id; + else if (flags & XDP_FLAGS_SKB_MODE) + *prog_id = opts.skb_prog_id; + else + *prog_id = 0; + + return 0; +} + + +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); + +static int clsact_config(struct libbpf_nla_req *req) +{ + req->tc.tcm_parent = TC_H_CLSACT; + req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + + return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); +} + +static int attach_point_to_config(struct bpf_tc_hook *hook, + qdisc_config_t *config) +{ + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + case BPF_TC_INGRESS | BPF_TC_EGRESS: + if (OPTS_GET(hook, parent, 0)) + return -EINVAL; + *config = &clsact_config; + return 0; + case BPF_TC_CUSTOM: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int tc_get_tcm_parent(enum bpf_tc_attach_point attach_point, + __u32 *parent) +{ + switch (attach_point) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + if (*parent) + return -EINVAL; + *parent = TC_H_MAKE(TC_H_CLSACT, + attach_point == BPF_TC_INGRESS ? + TC_H_MIN_INGRESS : TC_H_MIN_EGRESS); + break; + case BPF_TC_CUSTOM: + if (!*parent) + return -EINVAL; + break; + default: + return -EINVAL; + } + return 0; +} + +static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) +{ + qdisc_config_t config; + int ret; + struct libbpf_nla_req req; + + ret = attach_point_to_config(hook, &config); + if (ret < 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; + req.nh.nlmsg_type = cmd; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); + + ret = config(&req); + if (ret < 0) + return ret; + + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); +} + +static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_NEWQDISC, NLM_F_CREATE | NLM_F_EXCL); +} + +static int tc_qdisc_delete(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_DELQDISC, 0); +} + +int bpf_tc_hook_create(struct bpf_tc_hook *hook) +{ + int ret; + + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + ret = tc_qdisc_create_excl(hook); + return libbpf_err(ret); +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush); + +int bpf_tc_hook_destroy(struct bpf_tc_hook *hook) +{ + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + return libbpf_err(__bpf_tc_detach(hook, NULL, true)); + case BPF_TC_INGRESS | BPF_TC_EGRESS: + return libbpf_err(tc_qdisc_delete(hook)); + case BPF_TC_CUSTOM: + return libbpf_err(-EOPNOTSUPP); + default: + return libbpf_err(-EINVAL); + } +} + +struct bpf_cb_ctx { + struct bpf_tc_opts *opts; + bool processed; +}; + +static int __get_tc_info(void *cookie, struct tcmsg *tc, struct nlattr **tb, + bool unicast) +{ + struct nlattr *tbb[TCA_BPF_MAX + 1]; + struct bpf_cb_ctx *info = cookie; + + if (!info || !info->opts) + return -EINVAL; + if (unicast && info->processed) + return -EINVAL; + if (!tb[TCA_OPTIONS]) + return NL_CONT; + + libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL); + if (!tbb[TCA_BPF_ID]) + return -EINVAL; + + OPTS_SET(info->opts, prog_id, libbpf_nla_getattr_u32(tbb[TCA_BPF_ID])); + OPTS_SET(info->opts, handle, tc->tcm_handle); + OPTS_SET(info->opts, priority, TC_H_MAJ(tc->tcm_info) >> 16); + + info->processed = true; + return unicast ? NL_NEXT : NL_DONE; +} + +static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct tcmsg *tc = NLMSG_DATA(nh); + struct nlattr *tb[TCA_MAX + 1]; + + libbpf_nla_parse(tb, TCA_MAX, + (struct nlattr *)((void *)tc + NLMSG_ALIGN(sizeof(*tc))), + NLMSG_PAYLOAD(nh, sizeof(*tc)), NULL); + if (!tb[TCA_KIND]) + return NL_CONT; + return __get_tc_info(cookie, tc, tb, nh->nlmsg_flags & NLM_F_ECHO); +} + +static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + char name[256]; + int len, ret; + + memset(&info, 0, info_len); + ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); + if (ret < 0) + return ret; + + ret = nlattr_add(req, TCA_BPF_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + len = snprintf(name, sizeof(name), "%s:[%u]", info.name, info.id); + if (len < 0) + return -errno; + if (len >= sizeof(name)) + return -ENAMETOOLONG; + return nlattr_add(req, TCA_BPF_NAME, name, len + 1); +} + +int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, bpf_flags, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + struct nlattr *nla; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || !prog_fd || prog_id) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + if (flags & ~BPF_TC_F_REPLACE) + return libbpf_err(-EINVAL); + + flags = (flags & BPF_TC_F_REPLACE) ? NLM_F_REPLACE : NLM_F_EXCL; + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | + NLM_F_ECHO | flags; + req.nh.nlmsg_type = RTM_NEWTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + nla = nlattr_begin_nested(&req, TCA_OPTIONS); + if (!nla) + return libbpf_err(-EMSGSIZE); + ret = tc_add_fd_and_name(&req, prog_fd); + if (ret < 0) + return libbpf_err(ret); + bpf_flags = TCA_BPF_FLAG_ACT_DIRECT; + ret = nlattr_add(&req, TCA_BPF_FLAGS, &bpf_flags, sizeof(bpf_flags)); + if (ret < 0) + return libbpf_err(ret); + nlattr_end_nested(&req, nla); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush) +{ + __u32 protocol = 0, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct libbpf_nla_req req; + + if (!hook || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return -EINVAL; + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id) + return -EINVAL; + if (priority > UINT16_MAX) + return -EINVAL; + if (!flush) { + if (!handle || !priority) + return -EINVAL; + protocol = ETH_P_ALL; + } else { + if (handle || priority) + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_DELTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + if (!flush) { + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + } + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return ret; + req.tc.tcm_parent = parent; + + if (!flush) { + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return ret; + } + + return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); +} + +int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts) +{ + int ret; + + if (!opts) + return libbpf_err(-EINVAL); + + ret = __bpf_tc_detach(hook, opts, false); + return libbpf_err(ret); +} + +int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id || + !handle || !priority) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = RTM_GETTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} diff --git a/src/nlattr.c b/src/nlattr.c new file mode 100644 index 0000000..3900d05 --- /dev/null +++ b/src/nlattr.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch> + */ + +#include <errno.h> +#include <string.h> +#include <stdio.h> +#include <linux/rtnetlink.h> +#include "nlattr.h" +#include "libbpf_internal.h" + +static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = { + [LIBBPF_NLA_U8] = sizeof(uint8_t), + [LIBBPF_NLA_U16] = sizeof(uint16_t), + [LIBBPF_NLA_U32] = sizeof(uint32_t), + [LIBBPF_NLA_U64] = sizeof(uint64_t), + [LIBBPF_NLA_STRING] = 1, + [LIBBPF_NLA_FLAG] = 0, +}; + +static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) +{ + int totlen = NLA_ALIGN(nla->nla_len); + + *remaining -= totlen; + return (struct nlattr *)((void *)nla + totlen); +} + +static int nla_ok(const struct nlattr *nla, int remaining) +{ + return remaining >= (int)sizeof(*nla) && + nla->nla_len >= sizeof(*nla) && + nla->nla_len <= remaining; +} + +static int nla_type(const struct nlattr *nla) +{ + return nla->nla_type & NLA_TYPE_MASK; +} + +static int validate_nla(struct nlattr *nla, int maxtype, + struct libbpf_nla_policy *policy) +{ + struct libbpf_nla_policy *pt; + unsigned int minlen = 0; + int type = nla_type(nla); + + if (type < 0 || type > maxtype) + return 0; + + pt = &policy[type]; + + if (pt->type > LIBBPF_NLA_TYPE_MAX) + return 0; + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != LIBBPF_NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (libbpf_nla_len(nla) < minlen) + return -1; + + if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen) + return -1; + + if (pt->type == LIBBPF_NLA_STRING) { + char *data = libbpf_nla_data(nla); + + if (data[libbpf_nla_len(nla) - 1] != '\0') + return -1; + } + + return 0; +} + +static inline int nlmsg_len(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_len - NLMSG_HDRLEN; +} + +/** + * Create attribute index based on a stream of attributes. + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg head Head of attribute stream. + * @arg len Length of attribute stream. + * @arg policy Attribute validation policy. + * + * Iterates over the stream of attributes and stores a pointer to each + * attribute in the index array using the attribute type as index to + * the array. Attribute with a type greater than the maximum type + * specified will be silently ignored in order to maintain backwards + * compatibility. If \a policy is not NULL, the attribute will be + * validated using the specified policy. + * + * @see nla_validate + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + libbpf_nla_for_each_attr(nla, head, len, rem) { + int type = nla_type(nla); + + if (type > maxtype) + continue; + + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + if (tb[type]) + pr_warn("Attribute of type %#x found multiple times in message, " + "previous attribute is being ignored.\n", type); + + tb[type] = nla; + } + + err = 0; +errout: + return err; +} + +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to libbpf_nla_parse(). + * + * @see libbpf_nla_parse + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy) +{ + return libbpf_nla_parse(tb, maxtype, libbpf_nla_data(nla), + libbpf_nla_len(nla), policy); +} + +/* dump netlink extended ack error message */ +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) +{ + struct libbpf_nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { + [NLMSGERR_ATTR_MSG] = { .type = LIBBPF_NLA_STRING }, + [NLMSGERR_ATTR_OFFS] = { .type = LIBBPF_NLA_U32 }, + }; + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr; + struct nlmsgerr *err; + char *errmsg = NULL; + int hlen, alen; + + /* no TLVs, nothing to do here */ + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return 0; + + err = (struct nlmsgerr *)NLMSG_DATA(nlh); + hlen = sizeof(*err); + + /* if NLM_F_CAPPED is set then the inner err msg was capped */ + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + hlen += nlmsg_len(&err->msg); + + attr = (struct nlattr *) ((void *) err + hlen); + alen = nlh->nlmsg_len - hlen; + + if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, + extack_policy) != 0) { + pr_warn("Failed to parse extended error attributes\n"); + return 0; + } + + if (tb[NLMSGERR_ATTR_MSG]) + errmsg = (char *) libbpf_nla_data(tb[NLMSGERR_ATTR_MSG]); + + pr_warn("Kernel error message: %s\n", errmsg); + + return 0; +} diff --git a/src/nlattr.h b/src/nlattr.h new file mode 100644 index 0000000..4d15ae2 --- /dev/null +++ b/src/nlattr.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch> + */ + +#ifndef __LIBBPF_NLATTR_H +#define __LIBBPF_NLATTR_H + +#include <stdint.h> +#include <string.h> +#include <errno.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +/* avoid multiple definition of netlink features */ +#define __LINUX_NETLINK_H + +/** + * Standard attribute types to specify validation policy + */ +enum { + LIBBPF_NLA_UNSPEC, /**< Unspecified type, binary data chunk */ + LIBBPF_NLA_U8, /**< 8 bit integer */ + LIBBPF_NLA_U16, /**< 16 bit integer */ + LIBBPF_NLA_U32, /**< 32 bit integer */ + LIBBPF_NLA_U64, /**< 64 bit integer */ + LIBBPF_NLA_STRING, /**< NUL terminated character string */ + LIBBPF_NLA_FLAG, /**< Flag */ + LIBBPF_NLA_MSECS, /**< Micro seconds (64bit) */ + LIBBPF_NLA_NESTED, /**< Nested attributes */ + __LIBBPF_NLA_TYPE_MAX, +}; + +#define LIBBPF_NLA_TYPE_MAX (__LIBBPF_NLA_TYPE_MAX - 1) + +/** + * @ingroup attr + * Attribute validation policy. + * + * See section @core_doc{core_attr_parse,Attribute Parsing} for more details. + */ +struct libbpf_nla_policy { + /** Type of attribute or LIBBPF_NLA_UNSPEC */ + uint16_t type; + + /** Minimal length of payload required */ + uint16_t minlen; + + /** Maximal length of payload allowed */ + uint16_t maxlen; +}; + +struct libbpf_nla_req { + struct nlmsghdr nh; + union { + struct ifinfomsg ifinfo; + struct tcmsg tc; + }; + char buf[128]; +}; + +/** + * @ingroup attr + * Iterate over a stream of attributes + * @arg pos loop counter, set to current attribute + * @arg head head of attribute stream + * @arg len length of attribute stream + * @arg rem initialized to len, holds bytes currently remaining in stream + */ +#define libbpf_nla_for_each_attr(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nla_ok(pos, rem); \ + pos = nla_next(pos, &(rem))) + +/** + * libbpf_nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *libbpf_nla_data(const struct nlattr *nla) +{ + return (void *)nla + NLA_HDRLEN; +} + +static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)libbpf_nla_data(nla); +} + +static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)libbpf_nla_data(nla); +} + +static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)libbpf_nla_data(nla); +} + +/** + * libbpf_nla_len - length of payload + * @nla: netlink attribute + */ +static inline int libbpf_nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy); +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy); + +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); + +static inline struct nlattr *nla_data(struct nlattr *nla) +{ + return (struct nlattr *)((void *)nla + NLA_HDRLEN); +} + +static inline struct nlattr *req_tail(struct libbpf_nla_req *req) +{ + return (struct nlattr *)((void *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); +} + +static inline int nlattr_add(struct libbpf_nla_req *req, int type, + const void *data, int len) +{ + struct nlattr *nla; + + if (NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > sizeof(*req)) + return -EMSGSIZE; + if (!!data != !!len) + return -EINVAL; + + nla = req_tail(req); + nla->nla_type = type; + nla->nla_len = NLA_HDRLEN + len; + if (data) + memcpy(nla_data(nla), data, len); + req->nh.nlmsg_len = NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(nla->nla_len); + return 0; +} + +static inline struct nlattr *nlattr_begin_nested(struct libbpf_nla_req *req, int type) +{ + struct nlattr *tail; + + tail = req_tail(req); + if (nlattr_add(req, type | NLA_F_NESTED, NULL, 0)) + return NULL; + return tail; +} + +static inline void nlattr_end_nested(struct libbpf_nla_req *req, + struct nlattr *tail) +{ + tail->nla_len = (void *)req_tail(req) - (void *)tail; +} + +#endif /* __LIBBPF_NLATTR_H */ diff --git a/src/relo_core.c b/src/relo_core.c new file mode 100644 index 0000000..c4b0e81 --- /dev/null +++ b/src/relo_core.c @@ -0,0 +1,1690 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Facebook */ + +#ifdef __KERNEL__ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/string.h> +#include <linux/bpf_verifier.h> +#include "relo_core.h" + +static const char *btf_kind_str(const struct btf_type *t) +{ + return btf_type_str(t); +} + +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) +{ + return btf_type_skip_modifiers(btf, id, res_id); +} + +static const char *btf__name_by_offset(const struct btf *btf, u32 offset) +{ + return btf_name_by_offset(btf, offset); +} + +static s64 btf__resolve_size(const struct btf *btf, u32 type_id) +{ + const struct btf_type *t; + int size; + + t = btf_type_by_id(btf, type_id); + t = btf_resolve_size(btf, t, &size); + if (IS_ERR(t)) + return PTR_ERR(t); + return size; +} + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +#undef pr_warn +#undef pr_info +#undef pr_debug +#define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_debug(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define libbpf_print(level, fmt, ...) bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) +#else +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> +#include <linux/err.h> + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#endif + +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf_type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + +static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_CORE_FIELD_EXISTS: return "field_exists"; + case BPF_CORE_FIELD_SIGNED: return "signed"; + case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; + case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; + case BPF_CORE_TYPE_SIZE: return "type_size"; + case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; + default: return "unknown"; + } +} + +static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_EXISTS: + case BPF_CORE_FIELD_SIGNED: + case BPF_CORE_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: + return true; + default: + return false; + } +} + +static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_TYPE_ID_LOCAL: + case BPF_CORE_TYPE_ID_TARGET: + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + case BPF_CORE_TYPE_SIZE: + return true; + default: + return false; + } +} + +static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_VALUE: + return true; + default: + return false; + } +} + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + +/* + * Turn bpf_core_relo into a low- and high-level spec representation, + * validating correctness along the way, as well as calculating resulting + * field bit offset, specified by accessor string. Low-level spec captures + * every single level of nestedness, including traversing anonymous + * struct/union members. High-level one only captures semantically meaningful + * "turning points": named fields and array indicies. + * E.g., for this case: + * + * struct sample { + * int __unimportant; + * struct { + * int __1; + * int __2; + * int a[7]; + * }; + * }; + * + * struct sample *s = ...; + * + * int x = &s->a[3]; // access string = '0:1:2:3' + * + * Low-level spec has 1:1 mapping with each element of access string (it's + * just a parsed access string representation): [0, 1, 2, 3]. + * + * High-level spec will capture only 3 points: + * - initial zero-index access by pointer (&s->... is the same as &s[0]...); + * - field 'a' access (corresponds to '2' in low-level spec); + * - array element #3 access (corresponds to '3' in low-level spec). + * + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, + * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their + * spec and raw_spec are kept empty. + * + * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access + * string to specify enumerator's value index that need to be relocated. + */ +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec) +{ + int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; + const struct btf_type *t; + const char *name, *spec_str; + __u32 id, name_off; + __s64 sz; + + spec_str = btf__name_by_offset(btf, relo->access_str_off); + if (str_is_empty(spec_str) || *spec_str == ':') + return -EINVAL; + + memset(spec, 0, sizeof(*spec)); + spec->btf = btf; + spec->root_type_id = relo->type_id; + spec->relo_kind = relo->kind; + + /* type-based relocations don't have a field access string */ + if (core_relo_is_type_based(relo->kind)) { + if (strcmp(spec_str, "0")) + return -EINVAL; + return 0; + } + + /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ + while (*spec_str) { + if (*spec_str == ':') + ++spec_str; + if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) + return -EINVAL; + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + spec_str += parsed_len; + spec->raw_spec[spec->raw_len++] = access_idx; + } + + if (spec->raw_len == 0) + return -EINVAL; + + t = skip_mods_and_typedefs(btf, relo->type_id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[0]; + acc = &spec->spec[0]; + acc->type_id = id; + acc->idx = access_idx; + spec->len++; + + if (core_relo_is_enumval_based(relo->kind)) { + if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + return -EINVAL; + + /* record enumerator name in a first accessor */ + name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off + : btf_enum64(t)[access_idx].name_off; + acc->name = btf__name_by_offset(btf, name_off); + return 0; + } + + if (!core_relo_is_field_based(relo->kind)) + return -EINVAL; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset = access_idx * sz * 8; + + for (i = 1; i < spec->raw_len; i++) { + t = skip_mods_and_typedefs(btf, id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; + + if (btf_is_composite(t)) { + const struct btf_member *m; + __u32 bit_offset; + + if (access_idx >= btf_vlen(t)) + return -EINVAL; + + bit_offset = btf_member_bit_offset(t, access_idx); + spec->bit_offset += bit_offset; + + m = btf_members(t) + access_idx; + if (m->name_off) { + name = btf__name_by_offset(btf, m->name_off); + if (str_is_empty(name)) + return -EINVAL; + + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; + spec->len++; + } + + id = m->type; + } else if (btf_is_array(t)) { + const struct btf_array *a = btf_array(t); + bool flex; + + t = skip_mods_and_typedefs(btf, a->type, &id); + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) + return -EINVAL; + + spec->spec[spec->len].type_id = id; + spec->spec[spec->len].idx = access_idx; + spec->len++; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset += access_idx * sz * 8; + } else { + pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + prog_name, relo->type_id, spec_str, i, id, btf_kind_str(t)); + return -EINVAL; + } + } + + return 0; +} + +/* Check two types for compatibility for the purpose of field access + * relocation. const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +static int bpf_core_fields_are_compat(const struct btf *local_btf, + __u32 local_id, + const struct btf *targ_btf, + __u32 targ_id) +{ + const struct btf_type *local_type, *targ_type; + +recur: + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_is_composite(local_type) && btf_is_composite(targ_type)) + return 1; + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_PTR: + case BTF_KIND_FLOAT: + return 1; + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + case BTF_KIND_ENUM: { + const char *local_name, *targ_name; + size_t local_len, targ_len; + + local_name = btf__name_by_offset(local_btf, + local_type->name_off); + targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); + local_len = bpf_core_essential_name_len(local_name); + targ_len = bpf_core_essential_name_len(targ_name); + /* one of them is anonymous or both w/ same flavor-less names */ + return local_len == 0 || targ_len == 0 || + (local_len == targ_len && + strncmp(local_name, targ_name, local_len) == 0); + } + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && + btf_int_offset(targ_type) == 0; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + default: + return 0; + } +} + +/* + * Given single high-level named field accessor in local type, find + * corresponding high-level accessor for a target type. Along the way, + * maintain low-level spec for target as well. Also keep updating target + * bit offset. + * + * Searching is performed through recursive exhaustive enumeration of all + * fields of a struct/union. If there are any anonymous (embedded) + * structs/unions, they are recursively searched as well. If field with + * desired name is found, check compatibility between local and target types, + * before returning result. + * + * 1 is returned, if field is found. + * 0 is returned if no compatible field is found. + * <0 is returned on error. + */ +static int bpf_core_match_member(const struct btf *local_btf, + const struct bpf_core_accessor *local_acc, + const struct btf *targ_btf, + __u32 targ_id, + struct bpf_core_spec *spec, + __u32 *next_targ_id) +{ + const struct btf_type *local_type, *targ_type; + const struct btf_member *local_member, *m; + const char *local_name, *targ_name; + __u32 local_id; + int i, n, found; + + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!targ_type) + return -EINVAL; + if (!btf_is_composite(targ_type)) + return 0; + + local_id = local_acc->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_member = btf_members(local_type) + local_acc->idx; + local_name = btf__name_by_offset(local_btf, local_member->name_off); + + n = btf_vlen(targ_type); + m = btf_members(targ_type); + for (i = 0; i < n; i++, m++) { + __u32 bit_offset; + + bit_offset = btf_member_bit_offset(targ_type, i); + + /* too deep struct/union/array nesting */ + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + /* speculate this member will be the good one */ + spec->bit_offset += bit_offset; + spec->raw_spec[spec->raw_len++] = i; + + targ_name = btf__name_by_offset(targ_btf, m->name_off); + if (str_is_empty(targ_name)) { + /* embedded struct/union, we need to go deeper */ + found = bpf_core_match_member(local_btf, local_acc, + targ_btf, m->type, + spec, next_targ_id); + if (found) /* either found or error */ + return found; + } else if (strcmp(local_name, targ_name) == 0) { + /* matching named field */ + struct bpf_core_accessor *targ_acc; + + targ_acc = &spec->spec[spec->len++]; + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + + *next_targ_id = m->type; + found = bpf_core_fields_are_compat(local_btf, + local_member->type, + targ_btf, m->type); + if (!found) + spec->len--; /* pop accessor */ + return found; + } + /* member turned out not to be what we looked for */ + spec->bit_offset -= bit_offset; + spec->raw_len--; + } + + return 0; +} + +/* + * Try to match local spec to a target type and, if successful, produce full + * target spec (high-level, low-level + bit offset). + */ +static int bpf_core_spec_match(struct bpf_core_spec *local_spec, + const struct btf *targ_btf, __u32 targ_id, + struct bpf_core_spec *targ_spec) +{ + const struct btf_type *targ_type; + const struct bpf_core_accessor *local_acc; + struct bpf_core_accessor *targ_acc; + int i, sz, matched; + __u32 name_off; + + memset(targ_spec, 0, sizeof(*targ_spec)); + targ_spec->btf = targ_btf; + targ_spec->root_type_id = targ_id; + targ_spec->relo_kind = local_spec->relo_kind; + + if (core_relo_is_type_based(local_spec->relo_kind)) { + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + } + + local_acc = &local_spec->spec[0]; + targ_acc = &targ_spec->spec[0]; + + if (core_relo_is_enumval_based(local_spec->relo_kind)) { + size_t local_essent_len, targ_essent_len; + const char *targ_name; + + /* has to resolve to an enum */ + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); + if (!btf_is_any_enum(targ_type)) + return 0; + + local_essent_len = bpf_core_essential_name_len(local_acc->name); + + for (i = 0; i < btf_vlen(targ_type); i++) { + if (btf_is_enum(targ_type)) + name_off = btf_enum(targ_type)[i].name_off; + else + name_off = btf_enum64(targ_type)[i].name_off; + + targ_name = btf__name_by_offset(targ_spec->btf, name_off); + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + return 1; + } + } + return 0; + } + + if (!core_relo_is_field_based(local_spec->relo_kind)) + return -EINVAL; + + for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, + &targ_id); + if (!targ_type) + return -EINVAL; + + if (local_acc->name) { + matched = bpf_core_match_member(local_spec->btf, + local_acc, + targ_btf, targ_id, + targ_spec, &targ_id); + if (matched <= 0) + return matched; + } else { + /* for i=0, targ_id is already treated as array element + * type (because it's the original struct), for others + * we should find array element type first + */ + if (i > 0) { + const struct btf_array *a; + bool flex; + + if (!btf_is_array(targ_type)) + return 0; + + a = btf_array(targ_type); + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) + return 0; + if (!skip_mods_and_typedefs(targ_btf, a->type, + &targ_id)) + return -EINVAL; + } + + /* too deep struct/union/array nesting */ + if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + targ_acc->type_id = targ_id; + targ_acc->idx = local_acc->idx; + targ_acc->name = NULL; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + + sz = btf__resolve_size(targ_btf, targ_id); + if (sz < 0) + return sz; + targ_spec->bit_offset += local_acc->idx * sz * 8; + } + } + + return 1; +} + +static int bpf_core_calc_field_relo(const char *prog_name, + const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) +{ + const struct bpf_core_accessor *acc; + const struct btf_type *t; + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; + const struct btf_member *m; + const struct btf_type *mt; + bool bitfield; + __s64 sz; + + *field_sz = 0; + + if (relo->kind == BPF_CORE_FIELD_EXISTS) { + *val = spec ? 1 : 0; + return 0; + } + + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + + acc = &spec->spec[spec->len - 1]; + t = btf_type_by_id(spec->btf, acc->type_id); + + /* a[n] accessor needs special handling */ + if (!acc->name) { + if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) { + *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; + } else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) { + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + } else { + pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", + prog_name, relo->kind, relo->insn_off / 8); + return -EINVAL; + } + if (validate) + *validate = true; + return 0; + } + + m = btf_members(t) + acc->idx; + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); + bit_off = spec->bit_offset; + bit_sz = btf_member_bitfield_size(t, acc->idx); + + bitfield = bit_sz > 0; + if (bitfield) { + byte_sz = mt->size; + byte_off = bit_off / 8 / byte_sz * byte_sz; + /* figure out smallest int size necessary for bitfield load */ + while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { + if (byte_sz >= 8) { + /* bitfield can't be read with 64-bit read */ + pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", + prog_name, relo->kind, relo->insn_off / 8); + return -E2BIG; + } + byte_sz *= 2; + byte_off = bit_off / 8 / byte_sz * byte_sz; + } + } else { + sz = btf__resolve_size(spec->btf, field_type_id); + if (sz < 0) + return -EINVAL; + byte_sz = sz; + byte_off = spec->bit_offset / 8; + bit_sz = byte_sz * 8; + } + + /* for bitfields, all the relocatable aspects are ambiguous and we + * might disagree with compiler, so turn off validation of expected + * value, except for signedness + */ + if (validate) + *validate = !bitfield; + + switch (relo->kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } + break; + case BPF_CORE_FIELD_BYTE_SIZE: + *val = byte_sz; + break; + case BPF_CORE_FIELD_SIGNED: + *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || + (btf_int_encoding(mt) & BTF_INT_SIGNED); + if (validate) + *validate = true; /* signedness is never ambiguous */ + break; + case BPF_CORE_FIELD_LSHIFT_U64: +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *val = 64 - (bit_off + bit_sz - byte_off * 8); +#else + *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); +#endif + break; + case BPF_CORE_FIELD_RSHIFT_U64: + *val = 64 - bit_sz; + if (validate) + *validate = true; /* right shift is never ambiguous */ + break; + case BPF_CORE_FIELD_EXISTS: + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, bool *validate) +{ + __s64 sz; + + /* by default, always check expected value in bpf_insn */ + if (validate) + *validate = true; + + /* type-based relos return zero when target type is not found */ + if (!spec) { + *val = 0; + return 0; + } + + switch (relo->kind) { + case BPF_CORE_TYPE_ID_TARGET: + *val = spec->root_type_id; + /* type ID, embedded in bpf_insn, might change during linking, + * so enforcing it is pointless + */ + if (validate) + *validate = false; + break; + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + *val = 1; + break; + case BPF_CORE_TYPE_SIZE: + sz = btf__resolve_size(spec->btf, spec->root_type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + break; + case BPF_CORE_TYPE_ID_LOCAL: + /* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val) +{ + const struct btf_type *t; + + switch (relo->kind) { + case BPF_CORE_ENUMVAL_EXISTS: + *val = spec ? 1 : 0; + break; + case BPF_CORE_ENUMVAL_VALUE: + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + t = btf_type_by_id(spec->btf, spec->spec[0].type_id); + if (btf_is_enum(t)) + *val = btf_enum(t)[spec->spec[0].idx].val; + else + *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* Calculate original and target relocation values, given local and target + * specs and relocation kind. These values are calculated for each candidate. + * If there are multiple candidates, resulting values should all be consistent + * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. + * If instruction has to be poisoned, *poison will be set to true. + */ +static int bpf_core_calc_relo(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct bpf_core_spec *local_spec, + const struct bpf_core_spec *targ_spec, + struct bpf_core_relo_res *res) +{ + int err = -EOPNOTSUPP; + + res->orig_val = 0; + res->new_val = 0; + res->poison = false; + res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; + + if (core_relo_is_field_based(relo->kind)) { + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. + */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf_type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. + */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } + } else if (core_relo_is_type_based(relo->kind)) { + err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val, &res->validate); + err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val, NULL); + } else if (core_relo_is_enumval_based(relo->kind)) { + err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); + } + +done: + if (err == -EUCLEAN) { + /* EUCLEAN is used to signal instruction poisoning request */ + res->poison = true; + err = 0; + } else if (err == -EOPNOTSUPP) { + /* EOPNOTSUPP means unknown/unsupported relocation */ + pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); + } + + return err; +} + +/* + * Turn instruction for which CO_RE relocation failed into invalid one with + * distinct signature. + */ +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, + int insn_idx, struct bpf_insn *insn) +{ + pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", + prog_name, relo_idx, insn_idx); + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with the following message: + * invalid func unknown#195896080 + */ + insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ +} + +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* + * Patch relocatable BPF instruction. + * + * Patched value is determined by relocation kind and target specification. + * For existence relocations target spec will be NULL if field/type is not found. + * Expected insn->imm value is determined using relocation kind and local + * spec, and is checked before patching instruction. If actual insn->imm value + * is wrong, bail out with error. + * + * Currently supported classes of BPF instruction are: + * 1. rX = <imm> (assignment with immediate operand); + * 2. rX += <imm> (arithmetic operations with immediate operand); + * 3. rX = <imm64> (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}. + */ +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) +{ + __u64 orig_val, new_val; + __u8 class; + + class = BPF_CLASS(insn->code); + + if (res->poison) { +poison: + /* poison second part of ldimm64 to avoid confusing error from + * verifier about "unknown opcode 00" + */ + if (is_ldimm64_insn(insn)) + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); + return 0; + } + + orig_val = res->orig_val; + new_val = res->new_val; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + if (res->validate && insn->imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, insn->imm, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + orig_val = insn->imm; + insn->imm = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)orig_val, (unsigned long long)new_val); + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (res->validate && insn->off != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)new_val); + return -ERANGE; + } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog_name, relo_idx, insn_idx); + goto poison; + } + + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, + (unsigned long long)new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog_name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } + break; + case BPF_LD: { + __u64 imm; + + if (!is_ldimm64_insn(insn) || + insn[0].src_reg != 0 || insn[0].off != 0 || + insn[1].code != 0 || insn[1].dst_reg != 0 || + insn[1].src_reg != 0 || insn[1].off != 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", + prog_name, relo_idx, insn_idx); + return -EINVAL; + } + + imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); + if (res->validate && imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, (unsigned long long)imm, + (unsigned long long)orig_val, (unsigned long long)new_val); + return -EINVAL; + } + + insn[0].imm = new_val; + insn[1].imm = new_val >> 32; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)imm, (unsigned long long)new_val); + break; + } + default: + pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", + prog_name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); + return -EINVAL; + } + + return 0; +} + +/* Output spec definition in the format: + * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>, + * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b + */ +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) +{ + const struct btf_type *t; + const char *s; + __u32 type_id; + int i, len = 0; + +#define append_buf(fmt, args...) \ + ({ \ + int r; \ + r = snprintf(buf, buf_sz, fmt, ##args); \ + len += r; \ + if (r >= buf_sz) \ + r = buf_sz; \ + buf += r; \ + buf_sz -= r; \ + }) + + type_id = spec->root_type_id; + t = btf_type_by_id(spec->btf, type_id); + s = btf__name_by_offset(spec->btf, t->name_off); + + append_buf("<%s> [%u] %s %s", + core_relo_kind_str(spec->relo_kind), + type_id, btf_kind_str(t), str_is_empty(s) ? "<anon>" : s); + + if (core_relo_is_type_based(spec->relo_kind)) + return len; + + if (core_relo_is_enumval_based(spec->relo_kind)) { + t = skip_mods_and_typedefs(spec->btf, type_id, NULL); + if (btf_is_enum(t)) { + const struct btf_enum *e; + const char *fmt_str; + + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %d" : "::%s = %u"; + append_buf(fmt_str, s, e->val); + } else { + const struct btf_enum64 *e; + const char *fmt_str; + + e = btf_enum64(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; + append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); + } + return len; + } + + if (core_relo_is_field_based(spec->relo_kind)) { + for (i = 0; i < spec->len; i++) { + if (spec->spec[i].name) + append_buf(".%s", spec->spec[i].name); + else if (i > 0 || spec->spec[i].idx > 0) + append_buf("[%u]", spec->spec[i].idx); + } + + append_buf(" ("); + for (i = 0; i < spec->raw_len; i++) + append_buf("%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); + + if (spec->bit_offset % 8) + append_buf(" @ offset %u.%u)", spec->bit_offset / 8, spec->bit_offset % 8); + else + append_buf(" @ offset %u)", spec->bit_offset / 8); + return len; + } + + return len; +#undef append_buf +} + +/* + * Calculate CO-RE relocation target result. + * + * The outline and important points of the algorithm: + * 1. For given local type, find corresponding candidate target types. + * Candidate type is a type with the same "essential" name, ignoring + * everything after last triple underscore (___). E.g., `sample`, + * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates + * for each other. Names with triple underscore are referred to as + * "flavors" and are useful, among other things, to allow to + * specify/support incompatible variations of the same kernel struct, which + * might differ between different kernel versions and/or build + * configurations. + * + * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C + * converter, when deduplicated BTF of a kernel still contains more than + * one different types with the same name. In that case, ___2, ___3, etc + * are appended starting from second name conflict. But start flavors are + * also useful to be defined "locally", in BPF program, to extract same + * data from incompatible changes between different kernel + * versions/configurations. For instance, to handle field renames between + * kernel versions, one can use two flavors of the struct name with the + * same common name and use conditional relocations to extract that field, + * depending on target kernel version. + * 2. For each candidate type, try to match local specification to this + * candidate target type. Matching involves finding corresponding + * high-level spec accessors, meaning that all named fields should match, + * as well as all array accesses should be within the actual bounds. Also, + * types should be compatible (see bpf_core_fields_are_compat for details). + * 3. It is supported and expected that there might be multiple flavors + * matching the spec. As long as all the specs resolve to the same set of + * offsets across all candidates, there is no error. If there is any + * ambiguity, CO-RE relocation will fail. This is necessary to accommodate + * imperfection of BTF deduplication, which can cause slight duplication of + * the same BTF type, if some directly or indirectly referenced (by + * pointer) type gets resolved to different actual types in different + * object files. If such a situation occurs, deduplicated BTF will end up + * with two (or more) structurally identical types, which differ only in + * types they refer to through pointer. This should be OK in most cases and + * is not an error. + * 4. Candidate types search is performed by linearly scanning through all + * types in target BTF. It is anticipated that this is overall more + * efficient memory-wise and not significantly worse (if not better) + * CPU-wise compared to prebuilding a map from all local type names to + * a list of candidate type names. It's also sped up by caching resolved + * list of matching candidates per each local "root" type ID, that has at + * least one bpf_core_relo associated with it. This list is shared + * between multiple relocations for the same type ID and is updated as some + * of the candidates are pruned due to structural incompatibility. + */ +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec *local_spec = &specs_scratch[0]; + struct bpf_core_spec *cand_spec = &specs_scratch[1]; + struct bpf_core_spec *targ_spec = &specs_scratch[2]; + struct bpf_core_relo_res cand_res; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id; + char spec_buf[256]; + int i, j, err; + + local_id = relo->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + err = bpf_core_parse_spec(prog_name, local_btf, relo, local_spec); + if (err) { + const char *spec_str; + + spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + str_is_empty(local_name) ? "<anon>" : local_name, + spec_str ?: "<?>", err); + return -EINVAL; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), local_spec); + pr_debug("prog '%s': relo #%d: %s\n", prog_name, relo_idx, spec_buf); + + /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ + if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { + /* bpf_insn's imm value could get out of sync during linking */ + memset(targ_res, 0, sizeof(*targ_res)); + targ_res->validate = false; + targ_res->poison = false; + targ_res->orig_val = local_spec->root_type_id; + targ_res->new_val = local_spec->root_type_id; + return 0; + } + + /* libbpf doesn't support candidate search for anonymous types */ + if (str_is_empty(local_name)) { + pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + return -EOPNOTSUPP; + } + + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(local_spec, cands->cands[i].btf, + cands->cands[i].id, cand_spec); + if (err < 0) { + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n ", + prog_name, relo_idx, i, spec_buf, err); + return err; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_debug("prog '%s': relo #%d: %s candidate #%d %s\n", prog_name, + relo_idx, err == 0 ? "non-matching" : "matching", i, spec_buf); + + if (err == 0) + continue; + + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); + if (err) + return err; + + if (j == 0) { + *targ_res = cand_res; + *targ_spec = *cand_spec; + } else if (cand_spec->bit_offset != targ_spec->bit_offset) { + /* if there are many field relo candidates, they + * should all resolve to the same bit offset + */ + pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", + prog_name, relo_idx, cand_spec->bit_offset, + targ_spec->bit_offset); + return -EINVAL; + } else if (cand_res.poison != targ_res->poison || + cand_res.new_val != targ_res->new_val) { + /* all candidates should result in the same relocation + * decision and value, otherwise it's dangerous to + * proceed due to ambiguity + */ + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", + prog_name, relo_idx, + cand_res.poison ? "failure" : "success", + (unsigned long long)cand_res.new_val, + targ_res->poison ? "failure" : "success", + (unsigned long long)targ_res->new_val); + return -EINVAL; + } + + cands->cands[j++] = cands->cands[i]; + } + + /* + * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field + * existence checks or kernel version/config checks, it's expected + * that we might not find any candidates. In this case, if field + * wasn't found in any candidate, the list of candidates shouldn't + * change at all, we'll just handle relocating appropriately, + * depending on relo's kind. + */ + if (j > 0) + cands->len = j; + + /* + * If no candidates were found, it might be both a programmer error, + * as well as expected case, depending whether instruction w/ + * relocation is guarded in some way that makes it unreachable (dead + * code) if relocation can't be resolved. This is handled in + * bpf_core_patch_insn() uniformly by replacing that instruction with + * BPF helper call insn (using invalid helper ID). If that instruction + * is indeed unreachable, then it will be ignored and eliminated by + * verifier. If it was an error, then verifier will complain and point + * to a specific instruction number in its log. + */ + if (j == 0) { + pr_debug("prog '%s': relo #%d: no matching targets found\n", + prog_name, relo_idx); + + /* calculate single target relo result explicitly */ + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, targ_res); + if (err) + return err; + } + + return 0; +} + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". This function assumes that root types were + * already checked for name match. + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. + * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + + local_t = btf_type_by_id(local_btf, local_id); + targ_t = btf_type_by_id(targ_btf, targ_id); + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/src/relo_core.h b/src/relo_core.h new file mode 100644 index 0000000..1c0566d --- /dev/null +++ b/src/relo_core.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __RELO_CORE_H +#define __RELO_CORE_H + +#include <linux/bpf.h> + +struct bpf_core_cand { + const struct btf *btf; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct bpf_core_cand_list { + struct bpf_core_cand *cands; + int len; +}; + +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + +struct bpf_core_relo_res { + /* expected value in the instruction, unless validate == false */ + __u64 orig_val; + /* new value that needs to be patched up to */ + __u64 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + <off>) = rY + * rX = *(T *)(rY + <off>), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); + +size_t bpf_core_essential_name_len(const char *name); + +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res); + +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res); + +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec); + +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec); + +#endif diff --git a/src/ringbuf.c b/src/ringbuf.c new file mode 100644 index 0000000..47855af --- /dev/null +++ b/src/ringbuf.c @@ -0,0 +1,587 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Ring buffer operations. + * + * Copyright (C) 2020 Facebook, Inc. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <linux/err.h> +#include <linux/bpf.h> +#include <asm/barrier.h> +#include <sys/mman.h> +#include <sys/epoll.h> +#include <time.h> + +#include "libbpf.h" +#include "libbpf_internal.h" +#include "bpf.h" + +struct ring { + ring_buffer_sample_fn sample_cb; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + +static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +{ + if (r->consumer_pos) { + munmap(r->consumer_pos, rb->page_size); + r->consumer_pos = NULL; + } + if (r->producer_pos) { + munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); + r->producer_pos = NULL; + } +} + +/* Add extra RINGBUF maps to this ring buffer manager */ +int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + struct epoll_event *e; + struct ring *r; + __u64 mmap_sz; + void *tmp; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + if (info.type != BPF_MAP_TYPE_RINGBUF) { + pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", + map_fd); + return libbpf_err(-EINVAL); + } + + tmp = libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->rings = tmp; + + tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->events = tmp; + + r = &rb->rings[rb->ring_cnt]; + memset(r, 0, sizeof(*r)); + + r->map_fd = map_fd; + r->sample_cb = sample_cb; + r->ctx = ctx; + r->mask = info.max_entries - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. We map twice as big + * data size to allow simple reading of samples that wrap around the + * end of a ring buffer. See kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); + return libbpf_err(-E2BIG); + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->producer_pos = tmp; + r->data = tmp + rb->page_size; + + e = &rb->events[rb->ring_cnt]; + memset(e, 0, sizeof(*e)); + + e->events = EPOLLIN; + e->data.fd = rb->ring_cnt; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + rb->ring_cnt++; + return 0; +} + +void ring_buffer__free(struct ring_buffer *rb) +{ + int i; + + if (!rb) + return; + + for (i = 0; i < rb->ring_cnt; ++i) + ringbuf_unmap_ring(rb, &rb->rings[i]); + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb->events); + free(rb->rings); + free(rb); +} + +struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts) +{ + struct ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = ring_buffer__add(rb, map_fd, sample_cb, ctx); + if (err) + goto err_out; + + return rb; + +err_out: + ring_buffer__free(rb); + return errno = -err, NULL; +} + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits (discard and busy, if set) */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += BPF_RINGBUF_HDR_SZ; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static int64_t ringbuf_process_ring(struct ring *r) +{ + int *len_ptr, len, err; + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt = 0; + unsigned long cons_pos, prod_pos; + bool got_new_data; + void *sample; + + cons_pos = smp_load_acquire(r->consumer_pos); + do { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + goto done; + + got_new_data = true; + cons_pos += roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err = r->sample_cb(r->ctx, sample, len); + if (err < 0) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + cons_pos); + return err; + } + cnt++; + } + + smp_store_release(r->consumer_pos, cons_pos); + } + } while (got_new_data); +done: + return cnt; +} + +/* Consume available ring buffer(s) data without event polling. + * Returns number of records consumed across all registered ring buffers (or + * INT_MAX, whichever is less), or negative number if any of the callbacks + * return error. + */ +int ring_buffer__consume(struct ring_buffer *rb) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = &rb->rings[i]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Poll for available data and consume records, if any are available. + * Returns number of records consumed (or INT_MAX, whichever is less), or + * negative number, if any of the registered callbacks returned error. + */ +int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) +{ + int i, cnt; + int64_t err, res = 0; + + cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + if (cnt < 0) + return libbpf_err(-errno); + + for (i = 0; i < cnt; i++) { + __u32 ring_id = rb->events[i].data.fd; + struct ring *ring = &rb->rings[ring_id]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Get an fd that can be used to sleep until data is available in the ring(s) */ +int ring_buffer__epoll_fd(const struct ring_buffer *rb) +{ + return rb->epoll_fd; +} + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + __u64 mmap_sz; + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("user ringbuf: ring buf size (%u) is too big\n", info.max_entries); + return -E2BIG; + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* The top two bits are used as special flags */ + if (size & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT)) + return errno = E2BIG, NULL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. */ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/src/skel_internal.h b/src/skel_internal.h new file mode 100644 index 0000000..1e82ab0 --- /dev/null +++ b/src/skel_internal.h @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __SKEL_INTERNAL_H +#define __SKEL_INTERNAL_H + +#ifdef __KERNEL__ +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#else +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/mman.h> +#include <stdlib.h> +#include "bpf.h" +#endif + +#ifndef __NR_bpf +# if defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# endif +#endif + +/* This file is a base header for auto-generated *.lskel.h files. + * Its contents will change and may become part of auto-generation in the future. + * + * The layout of bpf_[map|prog]_desc and bpf_loader_ctx is feature dependent + * and will change from one version of libbpf to another and features + * requested during loader program generation. + */ +struct bpf_map_desc { + /* output of the loader prog */ + int map_fd; + /* input for the loader prog */ + __u32 max_entries; + __aligned_u64 initial_value; +}; +struct bpf_prog_desc { + int prog_fd; +}; + +enum { + BPF_SKEL_KERNEL = (1ULL << 0), +}; + +struct bpf_loader_ctx { + __u32 sz; + __u32 flags; + __u32 log_level; + __u32 log_size; + __u64 log_buf; +}; + +struct bpf_load_and_run_opts { + struct bpf_loader_ctx *ctx; + const void *data; + const void *insns; + __u32 data_sz; + __u32 insns_sz; + const char *errstr; +}; + +long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); + +static inline int skel_sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ +#ifdef __KERNEL__ + return kern_sys_bpf(cmd, attr, size); +#else + return syscall(__NR_bpf, cmd, attr, size); +#endif +} + +#ifdef __KERNEL__ +static inline int close(int fd) +{ + return close_fd(fd); +} + +static inline void *skel_alloc(size_t size) +{ + struct bpf_loader_ctx *ctx = kzalloc(size, GFP_KERNEL); + + if (!ctx) + return NULL; + ctx->flags |= BPF_SKEL_KERNEL; + return ctx; +} + +static inline void skel_free(const void *p) +{ + kfree(p); +} + +/* skel->bss/rodata maps are populated the following way: + * + * For kernel use: + * skel_prep_map_data() allocates kernel memory that kernel module can directly access. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform probe_read_kernel() from maps.rodata.initial_value. + * skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and + * does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree + * is not nessary. + * + * For user space: + * skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform copy_from_user() from maps.rodata.initial_value. + * skel_finalize_map_data() remaps bpf array map value from the kernel memory into + * skel->rodata address. + * + * The "bpftool gen skeleton -L" command generates lskel.h that is suitable for + * both kernel and user space. The generated loader program does + * either bpf_probe_read_kernel() or bpf_copy_from_user() from initial_value + * depending on bpf_loader_ctx->flags. + */ +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + if (addr != ~0ULL) + kvfree(p); + /* When addr == ~0ULL the 'p' points to + * ((struct bpf_array *)map)->value. See skel_finalize_map_data. + */ +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = kvmalloc(val_sz, GFP_KERNEL); + if (!addr) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + struct bpf_map *map; + void *addr = NULL; + + kvfree((void *) (long) *init_val); + *init_val = ~0ULL; + + /* At this point bpf_load_and_run() finished without error and + * 'fd' is a valid bpf map FD. All sanity checks below should succeed. + */ + map = bpf_map_get(fd); + if (IS_ERR(map)) + return NULL; + if (map->map_type != BPF_MAP_TYPE_ARRAY) + goto out; + addr = ((struct bpf_array *)map)->value; + /* the addr stays valid, since FD is not closed */ +out: + bpf_map_put(map); + return addr; +} + +#else + +static inline void *skel_alloc(size_t size) +{ + return calloc(1, size); +} + +static inline void skel_free(void *p) +{ + free(p); +} + +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + munmap(p, sz); +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (addr == (void *) -1) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + void *addr; + + addr = mmap((void *) (long) *init_val, mmap_sz, flags, MAP_SHARED | MAP_FIXED, fd, 0); + if (addr == (void *) -1) + return NULL; + return addr; +} +#endif + +static inline int skel_closenz(int fd) +{ + if (fd > 0) + return close(fd); + return -EINVAL; +} + +#ifndef offsetofend +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) +#endif + +static inline int skel_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + + attr.map_type = map_type; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + return skel_sys_bpf(BPF_MAP_CREATE, &attr, attr_sz); +} + +static inline int skel_map_update_elem(int fd, const void *key, + const void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long) key; + attr.value = (long) value; + attr.flags = flags; + + return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + +static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = (long) name; + attr.raw_tracepoint.prog_fd = prog_fd; + + return skel_sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); +} + +static inline int skel_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create.iter_info_len); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + + return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); +} + +#ifdef __KERNEL__ +#define set_err +#else +#define set_err err = -errno +#endif + +static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) +{ + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); + int map_fd = -1, prog_fd = -1, key = 0, err; + union bpf_attr attr; + + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + if (map_fd < 0) { + opts->errstr = "failed to create loader map"; + set_err; + goto out; + } + + err = skel_map_update_elem(map_fd, &key, opts->data, 0); + if (err < 0) { + opts->errstr = "failed to update loader map"; + set_err; + goto out; + } + + memset(&attr, 0, prog_load_attr_sz); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = (long) opts->insns; + attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); + attr.license = (long) "Dual BSD/GPL"; + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + attr.fd_array = (long) &map_fd; + attr.log_level = opts->ctx->log_level; + attr.log_size = opts->ctx->log_size; + attr.log_buf = opts->ctx->log_buf; + attr.prog_flags = BPF_F_SLEEPABLE; + err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, prog_load_attr_sz); + if (prog_fd < 0) { + opts->errstr = "failed to load loader prog"; + set_err; + goto out; + } + + memset(&attr, 0, test_run_attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.ctx_in = (long) opts->ctx; + attr.test.ctx_size_in = opts->ctx->sz; + err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz); + if (err < 0 || (int)attr.test.retval < 0) { + opts->errstr = "failed to execute loader prog"; + if (err < 0) { + set_err; + } else { + err = (int)attr.test.retval; +#ifndef __KERNEL__ + errno = -err; +#endif + } + goto out; + } + err = 0; +out: + if (map_fd >= 0) + close(map_fd); + if (prog_fd >= 0) + close(prog_fd); + return err; +} + +#endif diff --git a/src/str_error.c b/src/str_error.c new file mode 100644 index 0000000..146da01 --- /dev/null +++ b/src/str_error.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#undef _GNU_SOURCE +#include <string.h> +#include <stdio.h> +#include "str_error.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. + */ +char *libbpf_strerror_r(int err, char *dst, int len) +{ + int ret = strerror_r(err < 0 ? -err : err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/src/str_error.h b/src/str_error.h new file mode 100644 index 0000000..a139334 --- /dev/null +++ b/src/str_error.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBBPF_STR_ERROR_H +#define __LIBBPF_STR_ERROR_H + +char *libbpf_strerror_r(int err, char *dst, int len); +#endif /* __LIBBPF_STR_ERROR_H */ diff --git a/src/strset.c b/src/strset.c new file mode 100644 index 0000000..2464bcb --- /dev/null +++ b/src/strset.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <linux/err.h> +#include "hashmap.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(long key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + key; + + return str_hash(str); +} + +static bool strset_equal_fn(long key1, long key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + key1; + const char *str2 = s->strs_data + key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, off, off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); + return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); + free(set); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. + */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, new_off, &old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. + */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(set->strs_hash, new_off, new_off, + HASHMAP_ADD, &old_off, NULL); + if (err == -EEXIST) + return old_off; /* duplicated string, return existing offset */ + if (err) + return err; + + set->strs_data_len += len; /* new unique string, adjust data length */ + return new_off; +} diff --git a/src/strset.h b/src/strset.h new file mode 100644 index 0000000..b6ddf77 --- /dev/null +++ b/src/strset.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* Copyright (c) 2021 Facebook */ +#ifndef __LIBBPF_STRSET_H +#define __LIBBPF_STRSET_H + +#include <stdbool.h> +#include <stddef.h> + +struct strset; + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz); +void strset__free(struct strset *set); + +const char *strset__data(const struct strset *set); +size_t strset__data_size(const struct strset *set); + +int strset__find_str(struct strset *set, const char *s); +int strset__add_str(struct strset *set, const char *s); + +#endif /* __LIBBPF_STRSET_H */ diff --git a/src/usdt.bpf.h b/src/usdt.bpf.h new file mode 100644 index 0000000..fdfd235 --- /dev/null +++ b/src/usdt.bpf.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef __USDT_BPF_H__ +#define __USDT_BPF_H__ + +#include <linux/errno.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* Below types and maps are internal implementation details of libbpf's USDT + * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should + * be considered an unstable API as well and might be adjusted based on user + * feedback from using libbpf's USDT support in production. + */ + +/* User can override BPF_USDT_MAX_SPEC_CNT to change default size of internal + * map that keeps track of USDT argument specifications. This might be + * necessary if there are a lot of USDT attachments. + */ +#ifndef BPF_USDT_MAX_SPEC_CNT +#define BPF_USDT_MAX_SPEC_CNT 256 +#endif +/* User can override BPF_USDT_MAX_IP_CNT to change default size of internal + * map that keeps track of IP (memory address) mapping to USDT argument + * specification. + * Note, if kernel supports BPF cookies, this map is not used and could be + * resized all the way to 1 to save a bit of memory. + */ +#ifndef BPF_USDT_MAX_IP_CNT +#define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) +#endif + +enum __bpf_usdt_arg_type { + BPF_USDT_ARG_CONST, + BPF_USDT_ARG_REG, + BPF_USDT_ARG_REG_DEREF, +}; + +struct __bpf_usdt_arg_spec { + /* u64 scalar interpreted depending on arg_type, see below */ + __u64 val_off; + /* arg location case, see bpf_udst_arg() for details */ + enum __bpf_usdt_arg_type arg_type; + /* offset of referenced register within struct pt_regs */ + short reg_off; + /* whether arg should be interpreted as signed value */ + bool arg_signed; + /* number of bits that need to be cleared and, optionally, + * sign-extended to cast arguments that are 1, 2, or 4 bytes + * long into final 8-byte u64/s64 value returned to user + */ + char arg_bitshift; +}; + +/* should match USDT_MAX_ARG_CNT in usdt.c exactly */ +#define BPF_USDT_MAX_ARG_CNT 12 +struct __bpf_usdt_spec { + struct __bpf_usdt_arg_spec args[BPF_USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, BPF_USDT_MAX_SPEC_CNT); + __type(key, int); + __type(value, struct __bpf_usdt_spec); +} __bpf_usdt_specs SEC(".maps") __weak; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, BPF_USDT_MAX_IP_CNT); + __type(key, long); + __type(value, __u32); +} __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; + +extern const _Bool LINUX_HAS_BPF_COOKIE __kconfig; + +static __always_inline +int __bpf_usdt_spec_id(struct pt_regs *ctx) +{ + if (!LINUX_HAS_BPF_COOKIE) { + long ip = PT_REGS_IP(ctx); + int *spec_id_ptr; + + spec_id_ptr = bpf_map_lookup_elem(&__bpf_usdt_ip_to_spec_id, &ip); + return spec_id_ptr ? *spec_id_ptr : -ESRCH; + } + + return bpf_get_attach_cookie(ctx); +} + +/* Return number of USDT arguments defined for currently traced USDT. */ +__weak __hidden +int bpf_usdt_arg_cnt(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + return spec->arg_cnt; +} + +/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res. + * Returns 0 on success; negative error, otherwise. + * On error *res is guaranteed to be set to zero. + */ +__weak __hidden +int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) +{ + struct __bpf_usdt_spec *spec; + struct __bpf_usdt_arg_spec *arg_spec; + unsigned long val; + int err, spec_id; + + *res = 0; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt) + return -ENOENT; + + arg_spec = &spec->args[arg_num]; + switch (arg_spec->arg_type) { + case BPF_USDT_ARG_CONST: + /* Arg is just a constant ("-4@$-9" in USDT arg spec). + * value is recorded in arg_spec->val_off directly. + */ + val = arg_spec->val_off; + break; + case BPF_USDT_ARG_REG: + /* Arg is in a register (e.g, "8@%rax" in USDT arg spec), + * so we read the contents of that register directly from + * struct pt_regs. To keep things simple user-space parts + * record offsetof(struct pt_regs, <regname>) in arg_spec->reg_off. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + break; + case BPF_USDT_ARG_REG_DEREF: + /* Arg is in memory addressed by register, plus some offset + * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is + * identified like with BPF_USDT_ARG_REG case, and the offset + * is in arg_spec->val_off. We first fetch register contents + * from pt_regs, then do another user-space probe read to + * fetch argument value itself. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; +#endif + break; + default: + return -EINVAL; + } + + /* cast arg from 1, 2, or 4 bytes to final 8 byte size clearing + * necessary upper arg_bitshift bits, with sign extension if argument + * is signed + */ + val <<= arg_spec->arg_bitshift; + if (arg_spec->arg_signed) + val = ((long)val) >> arg_spec->arg_bitshift; + else + val = val >> arg_spec->arg_bitshift; + *res = val; + return 0; +} + +/* Retrieve user-specified cookie value provided during attach as + * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie + * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself + * utilizing BPF cookies internally, so user can't use BPF cookie directly + * for USDT programs and has to use bpf_usdt_cookie() API instead. + */ +__weak __hidden +long bpf_usdt_cookie(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return 0; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return 0; + + return spec->usdt_cookie; +} + +/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ +#define ___bpf_usdt_args0() ctx +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) + +/* + * BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for + * tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes. + * Original struct pt_regs * context is preserved as 'ctx' argument. + */ +#define BPF_USDT(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_usdt_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#endif /* __USDT_BPF_H__ */ diff --git a/src/usdt.c b/src/usdt.c new file mode 100644 index 0000000..75b411f --- /dev/null +++ b/src/usdt.c @@ -0,0 +1,1516 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <libelf.h> +#include <gelf.h> +#include <unistd.h> +#include <linux/ptrace.h> +#include <linux/kernel.h> + +/* s8 will be marked as poison while it's a reg of riscv */ +#if defined(__riscv) +#define rv_s8 s8 +#endif + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_common.h" +#include "libbpf_internal.h" +#include "hashmap.h" + +/* libbpf's USDT support consists of BPF-side state/code and user-space + * state/code working together in concert. BPF-side parts are defined in + * usdt.bpf.h header library. User-space state is encapsulated by struct + * usdt_manager and all the supporting code centered around usdt_manager. + * + * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map + * and IP-to-spec-ID map, which is auxiliary map necessary for kernels that + * don't support BPF cookie (see below). These two maps are implicitly + * embedded into user's end BPF object file when user's code included + * usdt.bpf.h. This means that libbpf doesn't do anything special to create + * these USDT support maps. They are created by normal libbpf logic of + * instantiating BPF maps when opening and loading BPF object. + * + * As such, libbpf is basically unaware of the need to do anything + * USDT-related until the very first call to bpf_program__attach_usdt(), which + * can be called by user explicitly or happen automatically during skeleton + * attach (or, equivalently, through generic bpf_program__attach() call). At + * this point, libbpf will instantiate and initialize struct usdt_manager and + * store it in bpf_object. USDT manager is per-BPF object construct, as each + * independent BPF object might or might not have USDT programs, and thus all + * the expected USDT-related state. There is no coordination between two + * bpf_object in parts of USDT attachment, they are oblivious of each other's + * existence and libbpf is just oblivious, dealing with bpf_object-specific + * USDT state. + * + * Quick crash course on USDTs. + * + * From user-space application's point of view, USDT is essentially just + * a slightly special function call that normally has zero overhead, unless it + * is being traced by some external entity (e.g, BPF-based tool). Here's how + * a typical application can trigger USDT probe: + * + * #include <sys/sdt.h> // provided by systemtap-sdt-devel package + * // folly also provide similar functionality in folly/tracing/StaticTracepoint.h + * + * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y); + * + * USDT is identified by it's <provider-name>:<probe-name> pair of names. Each + * individual USDT has a fixed number of arguments (3 in the above example) + * and specifies values of each argument as if it was a function call. + * + * USDT call is actually not a function call, but is instead replaced by + * a single NOP instruction (thus zero overhead, effectively). But in addition + * to that, those USDT macros generate special SHT_NOTE ELF records in + * .note.stapsdt ELF section. Here's an example USDT definition as emitted by + * `readelf -n <binary>`: + * + * stapsdt 0x00000089 NT_STAPSDT (SystemTap probe descriptors) + * Provider: test + * Name: usdt12 + * Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e + * Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil + * + * In this case we have USDT test:usdt12 with 12 arguments. + * + * Location and base are offsets used to calculate absolute IP address of that + * NOP instruction that kernel can replace with an interrupt instruction to + * trigger instrumentation code (BPF program for all that we care about). + * + * Semaphore above is and optional feature. It records an address of a 2-byte + * refcount variable (normally in '.probes' ELF section) used for signaling if + * there is anything that is attached to USDT. This is useful for user + * applications if, for example, they need to prepare some arguments that are + * passed only to USDTs and preparation is expensive. By checking if USDT is + * "activated", an application can avoid paying those costs unnecessarily. + * Recent enough kernel has built-in support for automatically managing this + * refcount, which libbpf expects and relies on. If USDT is defined without + * associated semaphore, this value will be zero. See selftests for semaphore + * examples. + * + * Arguments is the most interesting part. This USDT specification string is + * providing information about all the USDT arguments and their locations. The + * part before @ sign defined byte size of the argument (1, 2, 4, or 8) and + * whether the argument is signed or unsigned (negative size means signed). + * The part after @ sign is assembly-like definition of argument location + * (see [0] for more details). Technically, assembler can provide some pretty + * advanced definitions, but libbpf is currently supporting three most common + * cases: + * 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@-9); + * 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer + * whose value is in register %rdx"; + * 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which + * specifies signed 32-bit integer stored at offset -1204 bytes from + * memory address stored in %rbp. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * + * During attachment, libbpf parses all the relevant USDT specifications and + * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side + * code through spec map. This allows BPF applications to quickly fetch the + * actual value at runtime using a simple BPF-side code. + * + * With basics out of the way, let's go over less immediately obvious aspects + * of supporting USDTs. + * + * First, there is no special USDT BPF program type. It is actually just + * a uprobe BPF program (which for kernel, at least currently, is just a kprobe + * program, so BPF_PROG_TYPE_KPROBE program type). With the only difference + * that uprobe is usually attached at the function entry, while USDT will + * normally will be somewhere inside the function. But it should always be + * pointing to NOP instruction, which makes such uprobes the fastest uprobe + * kind. + * + * Second, it's important to realize that such STAP_PROBEn(provider, name, ...) + * macro invocations can end up being inlined many-many times, depending on + * specifics of each individual user application. So single conceptual USDT + * (identified by provider:name pair of identifiers) is, generally speaking, + * multiple uprobe locations (USDT call sites) in different places in user + * application. Further, again due to inlining, each USDT call site might end + * up having the same argument #N be located in a different place. In one call + * site it could be a constant, in another will end up in a register, and in + * yet another could be some other register or even somewhere on the stack. + * + * As such, "attaching to USDT" means (in general case) attaching the same + * uprobe BPF program to multiple target locations in user application, each + * potentially having a completely different USDT spec associated with it. + * To wire all this up together libbpf allocates a unique integer spec ID for + * each unique USDT spec. Spec IDs are allocated as sequential small integers + * so that they can be used as keys in array BPF map (for performance reasons). + * Spec ID allocation and accounting is big part of what usdt_manager is + * about. This state has to be maintained per-BPF object and coordinate + * between different USDT attachments within the same BPF object. + * + * Spec ID is the key in spec BPF map, value is the actual USDT spec layed out + * as struct usdt_spec. Each invocation of BPF program at runtime needs to + * know its associated spec ID. It gets it either through BPF cookie, which + * libbpf sets to spec ID during attach time, or, if kernel is too old to + * support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such + * case. The latter means that some modes of operation can't be supported + * without BPF cookie. Such mode is attaching to shared library "generically", + * without specifying target process. In such case, it's impossible to + * calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode + * is not supported without BPF cookie support. + * + * Note that libbpf is using BPF cookie functionality for its own internal + * needs, so user itself can't rely on BPF cookie feature. To that end, libbpf + * provides conceptually equivalent USDT cookie support. It's still u64 + * user-provided value that can be associated with USDT attachment. Note that + * this will be the same value for all USDT call sites within the same single + * *logical* USDT attachment. This makes sense because to user attaching to + * USDT is a single BPF program triggered for singular USDT probe. The fact + * that this is done at multiple actual locations is a mostly hidden + * implementation details. This USDT cookie value can be fetched with + * bpf_usdt_cookie(ctx) API provided by usdt.bpf.h + * + * Lastly, while single USDT can have tons of USDT call sites, it doesn't + * necessarily have that many different USDT specs. It very well might be + * that 1000 USDT call sites only need 5 different USDT specs, because all the + * arguments are typically contained in a small set of registers or stack + * locations. As such, it's wasteful to allocate as many USDT spec IDs as + * there are USDT call sites. So libbpf tries to be frugal and performs + * on-the-fly deduplication during a single USDT attachment to only allocate + * the minimal required amount of unique USDT specs (and thus spec IDs). This + * is trivially achieved by using USDT spec string (Arguments string from USDT + * note) as a lookup key in a hashmap. USDT spec string uniquely defines + * everything about how to fetch USDT arguments, so two USDT call sites + * sharing USDT spec string can safely share the same USDT spec and spec ID. + * Note, this spec string deduplication is happening only during the same USDT + * attachment, so each USDT spec shares the same USDT cookie value. This is + * not generally true for other USDT attachments within the same BPF object, + * as even if USDT spec string is the same, USDT cookie value can be + * different. It was deemed excessive to try to deduplicate across independent + * USDT attachments by taking into account USDT spec string *and* USDT cookie + * value, which would complicated spec ID accounting significantly for little + * gain. + */ + +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + +struct usdt_target { + long abs_ip; + long rel_ip; + long sema_off; + struct usdt_spec spec; + const char *spec_str; +}; + +struct usdt_manager { + struct bpf_map *specs_map; + struct bpf_map *ip_to_spec_id_map; + + int *free_spec_ids; + size_t free_spec_cnt; + size_t next_free_spec_id; + + bool has_bpf_cookie; + bool has_sema_refcnt; +}; + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj) +{ + static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; + struct usdt_manager *man; + struct bpf_map *specs_map, *ip_to_spec_id_map; + + specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); + ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); + if (!specs_map || !ip_to_spec_id_map) { + pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); + return ERR_PTR(-ESRCH); + } + + man = calloc(1, sizeof(*man)); + if (!man) + return ERR_PTR(-ENOMEM); + + man->specs_map = specs_map; + man->ip_to_spec_id_map = ip_to_spec_id_map; + + /* Detect if BPF cookie is supported for kprobes. + * We don't need IP-to-ID mapping if we can use BPF cookies. + * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") + */ + man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); + + /* Detect kernel support for automatic refcounting of USDT semaphore. + * If this is not supported, USDTs with semaphores will not be supported. + * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; + + return man; +} + +void usdt_manager_free(struct usdt_manager *man) +{ + if (IS_ERR_OR_NULL(man)) + return; + + free(man->free_spec_ids); + free(man); +} + +static int sanity_check_usdt_elf(Elf *elf, const char *path) +{ + GElf_Ehdr ehdr; + int endianness; + + if (elf_kind(elf) != ELF_K_ELF) { + pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); + return -EBADF; + } + + switch (gelf_getclass(elf)) { + case ELFCLASS64: + if (sizeof(void *) != 8) { + pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + case ELFCLASS32: + if (sizeof(void *) != 4) { + pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + default: + pr_warn("usdt: unsupported ELF class for '%s'\n", path); + return -EBADF; + } + + if (!gelf_getehdr(elf, &ehdr)) + return -EINVAL; + + if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { + pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", + path, ehdr.e_type); + return -EBADF; + } + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + endianness = ELFDATA2MSB; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (endianness != ehdr.e_ident[EI_DATA]) { + pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); + return -EBADF; + } + + return 0; +} + +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? -1 : 1; +} + +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) +{ + GElf_Phdr phdr; + size_t n; + int i, err; + struct elf_seg *seg; + void *tmp; + + *seg_cnt = 0; + + if (elf_getphdrnum(elf, &n)) { + err = -errno; + return err; + } + + for (i = 0; i < n; i++) { + if (!gelf_getphdr(elf, i, &phdr)) { + err = -errno; + return err; + } + + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, + (long)phdr.p_type, (long)phdr.p_flags); + if (phdr.p_type != PT_LOAD) + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) + return -ENOMEM; + + *segs = tmp; + seg = *segs + *seg_cnt; + (*seg_cnt)++; + + seg->start = phdr.p_vaddr; + seg->end = phdr.p_vaddr + phdr.p_memsz; + seg->offset = phdr.p_offset; + seg->is_exec = phdr.p_flags & PF_X; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); + return -ESRCH; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + return 0; +} + +static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +{ + char path[PATH_MAX], line[PATH_MAX], mode[16]; + size_t seg_start, seg_end, seg_off; + struct elf_seg *seg; + int tmp_pid, i, err; + FILE *f; + + *seg_cnt = 0; + + /* Handle containerized binaries only accessible from + * /proc/<pid>/root/<path>. They will be reported as just /<path> in + * /proc/<pid>/maps. + */ + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + goto proceed; + + if (!realpath(lib_path, path)) { + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", + lib_path, -errno); + libbpf_strlcpy(path, lib_path, sizeof(path)); + } + +proceed: + sprintf(line, "/proc/%d/maps", pid); + f = fopen(line, "r"); + if (!f) { + err = -errno; + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", + line, lib_path, err); + return err; + } + + /* We need to handle lines with no path at the end: + * + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + */ + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + void *tmp; + + /* to handle no path case (see above) we need to capture line + * without skipping any whitespaces. So we need to strip + * leading whitespaces manually here + */ + i = 0; + while (isblank(line[i])) + i++; + if (strcmp(line + i, path) != 0) + continue; + + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", + path, seg_start, seg_end, mode, seg_off); + + /* ignore non-executable sections for shared libs */ + if (mode[2] != 'x') + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + + *segs = tmp; + seg = *segs + *seg_cnt; + *seg_cnt += 1; + + seg->start = seg_start; + seg->end = seg_end; + seg->offset = seg_off; + seg->is_exec = true; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", + lib_path, path, pid); + err = -ESRCH; + goto err_out; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + err = 0; +err_out: + fclose(f); + return err; +} + +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long virtaddr) +{ + struct elf_seg *seg; + int i; + + /* for ELF binaries (both executables and shared libraries), we are + * given virtual address (absolute for executables, relative for + * libraries) which should match address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= virtaddr && virtaddr < seg->end) + return seg; + } + return NULL; +} + +static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long offset) +{ + struct elf_seg *seg; + int i; + + /* for VMA segments from /proc/<pid>/maps file, provided "address" is + * actually a file offset, so should be fall within logical + * offset-based range of [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start)) + return seg; + } + return NULL; +} + +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *usdt_note); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); + +static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, + const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie, + struct usdt_target **out_targets, size_t *out_target_cnt) +{ + size_t off, name_off, desc_off, seg_cnt = 0, vma_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *vma_segs = NULL; + struct usdt_target *targets = NULL, *target; + long base_addr = 0; + Elf_Scn *notes_scn, *base_scn; + GElf_Shdr base_shdr, notes_shdr; + GElf_Ehdr ehdr; + GElf_Nhdr nhdr; + Elf_Data *data; + int err; + + *out_targets = NULL; + *out_target_cnt = 0; + + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, ¬es_shdr, ¬es_scn); + if (err) { + pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); + return err; + } + + if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { + pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); + return -EINVAL; + } + + err = parse_elf_segs(elf, path, &segs, &seg_cnt); + if (err) { + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); + goto err_out; + } + + /* .stapsdt.base ELF section is optional, but is used for prelink + * offset compensation (see a big comment further below) + */ + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) + base_addr = base_shdr.sh_addr; + + data = elf_getdata(notes_scn, 0); + off = 0; + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; + struct usdt_note note; + struct elf_seg *seg = NULL; + void *tmp; + + err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); + if (err) + goto err_out; + + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) + continue; + + /* We need to compensate "prelink effect". See [0] for details, + * relevant parts quoted here: + * + * Each SDT probe also expands into a non-allocated ELF note. You can + * find this by looking at SHT_NOTE sections and decoding the format; + * see below for details. Because the note is non-allocated, it means + * there is no runtime cost, and also preserved in both stripped files + * and .debug files. + * + * However, this means that prelink won't adjust the note's contents + * for address offsets. Instead, this is done via the .stapsdt.base + * section. This is a special section that is added to the text. We + * will only ever have one of these sections in a final link and it + * will only ever be one byte long. Nothing about this section itself + * matters, we just use it as a marker to detect prelink address + * adjustments. + * + * Each probe note records the link-time address of the .stapsdt.base + * section alongside the probe PC address. The decoder compares the + * base address stored in the note with the .stapsdt.base section's + * sh_addr. Initially these are the same, but the section header will + * be adjusted by prelink. So the decoder applies the difference to + * the probe PC address to get the correct prelinked PC address; the + * same adjustment is applied to the semaphore address, if any. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + */ + usdt_abs_ip = note.loc_addr; + if (base_addr) + usdt_abs_ip += base_addr - note.base_addr; + + /* When attaching uprobes (which is what USDTs basically are) + * kernel expects file offset to be specified, not a relative + * virtual address, so we need to translate virtual address to + * file offset, for both ET_EXEC and ET_DYN binaries. + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + /* translate from virtual address to file offset */ + usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset; + + if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) { + /* If we don't have BPF cookie support but need to + * attach to a shared library, we'll need to know and + * record absolute addresses of attach points due to + * the need to lookup USDT spec by absolute IP of + * triggered uprobe. Doing this resolution is only + * possible when we have a specific PID of the process + * that's using specified shared library. BPF cookie + * removes the absolute address limitation as we don't + * need to do this lookup (we just use BPF cookie as + * an index of USDT spec), so for newer kernels with + * BPF cookie support libbpf supports USDT attachment + * to shared libraries with no PID filter. + */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* vma_segs are lazily initialized only if necessary */ + if (vma_seg_cnt == 0) { + err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_vma_seg(vma_segs, vma_seg_cnt, usdt_rel_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip; + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a file offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - seg->start + seg->offset; + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes.args references strings from Elf itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(vma_segs); + if (err < 0) + free(targets); + return err; +} + +struct bpf_link_usdt { + struct bpf_link link; + + struct usdt_manager *usdt_man; + + size_t spec_cnt; + int *spec_ids; + + size_t uprobe_cnt; + struct { + long abs_ip; + struct bpf_link *link; + } *uprobes; +}; + +static int bpf_link_usdt_detach(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + struct usdt_manager *man = usdt_link->usdt_man; + int i; + + for (i = 0; i < usdt_link->uprobe_cnt; i++) { + /* detach underlying uprobe link */ + bpf_link__destroy(usdt_link->uprobes[i].link); + /* there is no need to update specs map because it will be + * unconditionally overwritten on subsequent USDT attaches, + * but if BPF cookies are not used we need to remove entry + * from ip_to_spec_id map, otherwise we'll run into false + * conflicting IP errors + */ + if (!man->has_bpf_cookie) { + /* not much we can do about errors here */ + (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), + &usdt_link->uprobes[i].abs_ip); + } + } + + /* try to return the list of previously used spec IDs to usdt_manager + * for future reuse for subsequent USDT attaches + */ + if (!man->free_spec_ids) { + /* if there were no free spec IDs yet, just transfer our IDs */ + man->free_spec_ids = usdt_link->spec_ids; + man->free_spec_cnt = usdt_link->spec_cnt; + usdt_link->spec_ids = NULL; + } else { + /* otherwise concat IDs */ + size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; + int *new_free_ids; + + new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, + sizeof(*new_free_ids)); + /* If we couldn't resize free_spec_ids, we'll just leak + * a bunch of free IDs; this is very unlikely to happen and if + * system is so exhausted on memory, it's the least of user's + * concerns, probably. + * So just do our best here to return those IDs to usdt_manager. + */ + if (new_free_ids) { + memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, + usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); + man->free_spec_ids = new_free_ids; + man->free_spec_cnt = new_cnt; + } + } + + return 0; +} + +static void bpf_link_usdt_dealloc(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + + free(usdt_link->spec_ids); + free(usdt_link->uprobes); + free(usdt_link); +} + +static size_t specs_hash_fn(long key, void *ctx) +{ + return str_hash((char *)key); +} + +static bool specs_equal_fn(long key1, long key2, void *ctx) +{ + return strcmp((char *)key1, (char *)key2) == 0; +} + +static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, + struct bpf_link_usdt *link, struct usdt_target *target, + int *spec_id, bool *is_new) +{ + long tmp; + void *new_ids; + int err; + + /* check if we already allocated spec ID for this spec string */ + if (hashmap__find(specs_hash, target->spec_str, &tmp)) { + *spec_id = tmp; + *is_new = false; + return 0; + } + + /* otherwise it's a new ID that needs to be set up in specs map and + * returned back to usdt_manager when USDT link is detached + */ + new_ids = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!new_ids) + return -ENOMEM; + link->spec_ids = new_ids; + + /* get next free spec ID, giving preference to free list, if not empty */ + if (man->free_spec_cnt) { + *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->free_spec_cnt--; + } else { + /* don't allocate spec ID bigger than what fits in specs map */ + if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) + return -E2BIG; + + *spec_id = man->next_free_spec_id; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->next_free_spec_id++; + } + + /* remember new spec ID in the link for later return back to free list on detach */ + link->spec_ids[link->spec_cnt] = *spec_id; + link->spec_cnt++; + *is_new = true; + return 0; +} + +struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie) +{ + int i, fd, err, spec_map_fd, ip_map_fd; + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct hashmap *specs_hash = NULL; + struct bpf_link_usdt *link = NULL; + struct usdt_target *targets = NULL; + size_t target_cnt; + Elf *elf; + + spec_map_fd = bpf_map__fd(man->specs_map); + ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); + + /* TODO: perform path resolution similar to uprobe's */ + fd = open(path, O_RDONLY); + if (fd < 0) { + err = -errno; + pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + return libbpf_err_ptr(err); + } + + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + err = -EBADF; + pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); + goto err_out; + } + + err = sanity_check_usdt_elf(elf, path); + if (err) + goto err_out; + + /* normalize PID filter */ + if (pid < 0) + pid = -1; + else if (pid == 0) + pid = getpid(); + + /* discover USDT in given binary, optionally limiting + * activations to a given PID, if pid > 0 + */ + err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + usdt_cookie, &targets, &target_cnt); + if (err <= 0) { + err = (err == 0) ? -ENOENT : err; + goto err_out; + } + + specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); + if (IS_ERR(specs_hash)) { + err = PTR_ERR(specs_hash); + goto err_out; + } + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } + + link->usdt_man = man; + link->link.detach = &bpf_link_usdt_detach; + link->link.dealloc = &bpf_link_usdt_dealloc; + + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < target_cnt; i++) { + struct usdt_target *target = &targets[i]; + struct bpf_link *uprobe_link; + bool is_new; + int spec_id; + + /* Spec ID can be either reused or newly allocated. If it is + * newly allocated, we'll need to fill out spec map, otherwise + * entire spec should be valid and can be just used by a new + * uprobe. We reuse spec when USDT arg spec is identical. We + * also never share specs between two different USDT + * attachments ("links"), so all the reused specs already + * share USDT cookie value implicitly. + */ + err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); + if (err) + goto err_out; + + if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { + err = -errno; + pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", + spec_id, usdt_provider, usdt_name, path, err); + goto err_out; + } + if (!man->has_bpf_cookie && + bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { + err = -errno; + if (err == -EEXIST) { + pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", + spec_id, usdt_provider, usdt_name, path); + } else { + pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", + target->abs_ip, spec_id, usdt_provider, usdt_name, + path, err); + } + goto err_out; + } + + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + + free(targets); + hashmap__free(specs_hash); + elf_end(elf); + close(fd); + + return &link->link; + +err_out: + if (link) + bpf_link__destroy(&link->link); + free(targets); + hashmap__free(specs_hash); + if (elf) + elf_end(elf); + close(fd); + return libbpf_err_ptr(err); +} + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. + */ +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie) +{ + const char *s; + int len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); + if (len < 0) + return len; + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +/* Architecture-specific logic for parsing USDT argument location specs */ + +#if defined(__x86_64__) || defined(__i386__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *names[4]; + size_t pt_regs_off; + } reg_map[] = { +#ifdef __x86_64__ +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) +#else +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) +#endif + { {"rip", "eip", "", ""}, reg_off(rip, eip) }, + { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, + { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, + { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, + { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, + { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, + { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, + { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, + { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, +#undef reg_off +#ifdef __x86_64__ + { {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) }, + { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, + { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, + { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, + { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, + { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) }, + { {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) }, + { {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) }, +#endif + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) { + if (strcmp(reg_name, reg_map[i].names[j]) == 0) + return reg_map[i].pt_regs_off; + } + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + char reg_name[16]; + int arg_sz, len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", &arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -4@-20(%rbp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", &arg_sz, reg_name, &len) == 2) { + /* Memory dereference case without offset, e.g., 8@(%rsp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %%%15s %n", &arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -4@%eax */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@$71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + arg_num, arg_str, arg_sz); + return -EINVAL; + } + + return len; +} + +#elif defined(__s390x__) + +/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */ + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + unsigned int reg; + int arg_sz, len; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, ®, &len) == 3) { + /* Memory dereference case, e.g., -2@-28(%r15) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, ®, &len) == 2) { + /* Register read case, e.g., -8@%r0 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + arg_num, arg_str, arg_sz); + return -EINVAL; + } + + return len; +} + +#elif defined(__aarch64__) + +static int calc_pt_regs_off(const char *reg_name) +{ + int reg_num; + + if (sscanf(reg_name, "x%d", ®_num) == 1) { + if (reg_num >= 0 && reg_num < 31) + return offsetof(struct user_pt_regs, regs[reg_num]); + } else if (strcmp(reg_name, "sp") == 0) { + return offsetof(struct user_pt_regs, sp); + } + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + char reg_name[16]; + int arg_sz, len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9], %ld ] %n", &arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[sp, 96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", &arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", &arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@x4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + arg_num, arg_str, arg_sz); + return -EINVAL; + } + + return len; +} + +#elif defined(__riscv) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "ra", offsetof(struct user_regs_struct, ra) }, + { "sp", offsetof(struct user_regs_struct, sp) }, + { "gp", offsetof(struct user_regs_struct, gp) }, + { "tp", offsetof(struct user_regs_struct, tp) }, + { "a0", offsetof(struct user_regs_struct, a0) }, + { "a1", offsetof(struct user_regs_struct, a1) }, + { "a2", offsetof(struct user_regs_struct, a2) }, + { "a3", offsetof(struct user_regs_struct, a3) }, + { "a4", offsetof(struct user_regs_struct, a4) }, + { "a5", offsetof(struct user_regs_struct, a5) }, + { "a6", offsetof(struct user_regs_struct, a6) }, + { "a7", offsetof(struct user_regs_struct, a7) }, + { "s0", offsetof(struct user_regs_struct, s0) }, + { "s1", offsetof(struct user_regs_struct, s1) }, + { "s2", offsetof(struct user_regs_struct, s2) }, + { "s3", offsetof(struct user_regs_struct, s3) }, + { "s4", offsetof(struct user_regs_struct, s4) }, + { "s5", offsetof(struct user_regs_struct, s5) }, + { "s6", offsetof(struct user_regs_struct, s6) }, + { "s7", offsetof(struct user_regs_struct, s7) }, + { "s8", offsetof(struct user_regs_struct, rv_s8) }, + { "s9", offsetof(struct user_regs_struct, s9) }, + { "s10", offsetof(struct user_regs_struct, s10) }, + { "s11", offsetof(struct user_regs_struct, s11) }, + { "t0", offsetof(struct user_regs_struct, t0) }, + { "t1", offsetof(struct user_regs_struct, t1) }, + { "t2", offsetof(struct user_regs_struct, t2) }, + { "t3", offsetof(struct user_regs_struct, t3) }, + { "t4", offsetof(struct user_regs_struct, t4) }, + { "t5", offsetof(struct user_regs_struct, t5) }, + { "t6", offsetof(struct user_regs_struct, t6) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + char reg_name[16]; + int arg_sz, len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", &arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -8@-88(s0) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", &arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@a1 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + arg_num, arg_str, arg_sz); + return -EINVAL; + } + + return len; +} + +#else + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +} + +#endif |