diff options
Diffstat (limited to 'lib/libxdp')
29 files changed, 8227 insertions, 0 deletions
diff --git a/lib/libxdp/.gitignore b/lib/libxdp/.gitignore
new file mode 100644
index 0000000..c5a9951
--- /dev/null
+++ b/lib/libxdp/.gitignore
@@ -0,0 +1,5 @@
+*.so.*
+*.a
+*.pc
+sharedobjs/
+staticobjs/
diff --git a/lib/libxdp/Makefile b/lib/libxdp/Makefile
new file mode 100644
index 0000000..431932a
--- /dev/null
+++ b/lib/libxdp/Makefile
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+#
+# Builds libxdp as a static library and, unless BUILD_STATIC_ONLY is set, as a
+# versioned shared library, plus the BPF objects installed alongside it, the
+# pkg-config file and the man page.
+
+LIB_DIR = ..
+
+include libxdp.mk
+include $(LIB_DIR)/defines.mk
+
+OBJDIR ?= .
+SHARED_OBJDIR := $(OBJDIR)/sharedobjs
+STATIC_OBJDIR := $(OBJDIR)/staticobjs
+OBJS := libxdp.o xsk.o
+XDP_OBJS := xdp-dispatcher.o xsk_def_xdp_prog.o xsk_def_xdp_prog_5.3.o
+EMBEDDED_XDP_OBJS := $(addsuffix .embed.o,$(basename $(XDP_OBJS)))
+SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS))
+STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) $(EMBEDDED_XDP_OBJS)
+STATIC_LIBS := $(OBJDIR)/libxdp.a
+MAN_PAGE := libxdp.3
+MAN_OBJ := ${MAN_PAGE:.3=.man}
+MAN_FILES := $(MAN_PAGE)
+TEST_DIR := tests
+
+SHARED_CFLAGS += -fPIC -DSHARED
+LIB_HEADERS := $(wildcard $(HEADER_DIR)/xdp/*.h)
+BPF_HEADERS := $(wildcard $(HEADER_DIR)/bpf/*.h) $(wildcard $(HEADER_DIR)/xdp/*.h)
+EXTRA_LIB_DEPS := $(OBJECT_LIBBPF) $(LIBMK) $(LIB_OBJS) $(LIB_HEADERS) compat.h libxdp_internal.h xsk_def_xdp_prog.h bpf_instr.h
+PC_FILE := $(OBJDIR)/libxdp.pc
+TEMPLATED_SOURCES := xdp-dispatcher.c
+
+CFLAGS += -I$(HEADER_DIR)
+BPF_CFLAGS += -I$(HEADER_DIR)
+
+
+# Shared library outputs and the ABI check are only enabled for shared builds.
+ifndef BUILD_STATIC_ONLY
+SHARED_LIBS := $(OBJDIR)/libxdp.so \
+               $(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION) \
+               $(OBJDIR)/libxdp.so.$(LIBXDP_VERSION)
+VERSION_SCRIPT := libxdp.map
+CHECK_RULES := check_abi
+endif
+
+.PHONY: all clean install check check_abi
+all: $(STATIC_LIBS) $(SHARED_LIBS) $(XDP_OBJS) $(PC_FILE) check man
+
+# Remove build products, including the .ll files the BPF object rule leaves behind.
+clean:
+	$(Q)rm -f $(STATIC_LIBS) $(STATIC_OBJS) $(SHARED_LIBS) $(SHARED_OBJS) $(XDP_OBJS) $(XDP_OBJS:.o=.ll) $(PC_FILE) $(MAN_OBJ) $(TEMPLATED_SOURCES)
+	$(Q)for d in $(SHARED_OBJDIR) $(STATIC_OBJDIR); do \
+		[ -d "$$d" ] && rmdir "$$d"; done || true
+	$(Q)$(MAKE) -C $(TEST_DIR) clean
+
+install: all
+	$(Q)install -d -m 0755 $(DESTDIR)$(HDRDIR)
+	$(Q)install -d -m 0755 $(DESTDIR)$(LIBDIR)
+	$(Q)install -d -m 0755 $(DESTDIR)$(LIBDIR)/pkgconfig
+	$(Q)install -d -m 0755 $(DESTDIR)$(BPF_OBJECT_DIR)
+	$(Q)install -m 0644 $(LIB_HEADERS) $(DESTDIR)$(HDRDIR)/
+	$(Q)install -m 0644 $(PC_FILE) $(DESTDIR)$(LIBDIR)/pkgconfig/
+	$(Q)cp -fpR $(SHARED_LIBS) $(STATIC_LIBS) $(DESTDIR)$(LIBDIR)
+	$(Q)install -m 0755 $(XDP_OBJS) $(DESTDIR)$(BPF_OBJECT_DIR)
+	$(if $(MAN_FILES),$(Q)install -m 0755 -d $(DESTDIR)$(MANDIR)/man3)
+	$(if $(MAN_FILES),$(Q)install -m 0644 $(MAN_FILES) $(DESTDIR)$(MANDIR)/man3)
+
+
+$(OBJDIR)/libxdp.a: $(STATIC_OBJS)
+	$(QUIET_LINK)$(AR) rcs $@ $^
+
+$(OBJDIR)/libxdp.so: $(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION)
+	$(Q)ln -sf $(^F) $@
+
+$(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION): $(OBJDIR)/libxdp.so.$(LIBXDP_VERSION)
+	$(Q)ln -sf $(^F) $@
+
+$(OBJDIR)/libxdp.so.$(LIBXDP_VERSION): $(SHARED_OBJS)
+	$(QUIET_LINK)$(CC) -shared -Wl,-soname,libxdp.so.$(LIBXDP_MAJOR_VERSION) \
+		-Wl,--version-script=$(VERSION_SCRIPT) \
+		$^ $(LDFLAGS) $(LDLIBS) -o $@
+
+# Fill in the pkg-config template; regenerated whenever the template changes.
+$(OBJDIR)/libxdp.pc: libxdp.pc.template
+	$(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \
+		-e "s|@LIBDIR@|$(LIBDIR)|" \
+		-e "s|@VERSION@|$(TOOLS_VERSION)|" \
+		< $< > $@
+
+$(STATIC_OBJDIR):
+	$(Q)mkdir -p $(STATIC_OBJDIR)
+
+$(SHARED_OBJDIR):
+	$(Q)mkdir -p $(SHARED_OBJDIR)
+
+$(STATIC_OBJDIR)/%.o: %.c $(EXTRA_LIB_DEPS) | $(STATIC_OBJDIR)
+	$(QUIET_CC)$(CC) $(CFLAGS) $(CPPFLAGS) -D LIBXDP_STATIC=1 -Wall -I../../headers -c $< -o $@
+
+$(SHARED_OBJDIR)/%.o: %.c $(EXTRA_LIB_DEPS) | $(SHARED_OBJDIR)
+	$(QUIET_CC)$(CC) $(CFLAGS) $(CPPFLAGS) $(SHARED_CFLAGS) -Wall -I../../headers -c $< -o $@
+
+XDP_IN_SHARED := $(SHARED_OBJDIR)/libxdp.o $(SHARED_OBJDIR)/xsk.o
+
+# Symbol counts for check_abi. Both are recursive (=) variables, so the
+# $(shell ...) calls run when the check_abi recipe is expanded (after its
+# prerequisite has been built) rather than when the Makefile is parsed.
+GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(XDP_IN_SHARED) | \
+                   cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
+                   sed 's/\[.*\]//' | \
+                   awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \
+                   sort -u | wc -l)
+VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OBJDIR)/libxdp.so | \
+                      grep -Eo '[^ ]+@LIBXDP_' | cut -d@ -f1 | sort -u | wc -l)
+
+check: $(CHECK_RULES)
+
+# Fail if the number of global symbols defined by the shared objects differs
+# from the number of symbols versioned as LIBXDP_* in the shared library; on a
+# mismatch, print a diff of the two symbol lists.
+check_abi: $(OBJDIR)/libxdp.so
+	@if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then \
+		echo "Warning: Num of global symbols in $(XDP_IN_SHARED)" \
+		     "($(GLOBAL_SYM_COUNT)) does NOT match with num of" \
+		     "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \
+		     "Please make sure all symbols are" \
+		     "versioned in $(VERSION_SCRIPT)." >&2; \
+		readelf -s --wide $(XDP_IN_SHARED) | \
+		    cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
+		    sed 's/\[.*\]//' | \
+		    awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \
+		    sort -u > $(OBJDIR)/libxdp_global_syms.tmp; \
+		readelf --dyn-syms --wide $(OBJDIR)/libxdp.so | \
+		    grep -Eo '[^ ]+@LIBXDP_' | cut -d@ -f1 | \
+		    sort -u > $(OBJDIR)/libxdp_versioned_syms.tmp; \
+		diff -u $(OBJDIR)/libxdp_global_syms.tmp \
+		     $(OBJDIR)/libxdp_versioned_syms.tmp; \
+		rm $(OBJDIR)/libxdp_global_syms.tmp \
+		   $(OBJDIR)/libxdp_versioned_syms.tmp; \
+		exit 1; \
+	fi
+
+
+$(TEMPLATED_SOURCES): %.c: %.c.in Makefile
+	$(QUIET_M4)$(M4) $(DEFINES) $< > $@ || ( ret=$$?; rm -f $@; exit $$ret )
+
+# Wrap each BPF object as binary data in a relocatable ELF object, then rename
+# its .data section to a read-only .rodata section.
+$(EMBEDDED_XDP_OBJS): %.embed.o: %.o
+	$(QUIET_GEN)$(LD) -r -b binary -o $@ -z noexecstack --format=binary $<
+	$(Q)$(OBJCOPY) --rename-section .data=.rodata,alloc,load,readonly,data,contents $@
+
+# Compile the BPF programs in two steps: clang emits LLVM IR into a .ll file
+# next to the target, and llc turns that into the BPF object.
+$(XDP_OBJS): %.o: %.c $(BPF_HEADERS) $(LIBMK)
+	$(QUIET_CLANG)$(CLANG) -S \
+	    -target $(BPF_TARGET) \
+	    -D __BPF_TRACING__ \
+	    $(BPF_CFLAGS) \
+	    -Wall \
+	    -Wno-unused-value \
+	    -Wno-pointer-sign \
+	    -Wno-compare-distinct-pointer-types \
+	    -Werror \
+	    -O2 -emit-llvm -c -g -o ${@:.o=.ll} $<
+	$(QUIET_LLC)$(LLC) -march=$(BPF_TARGET) -filetype=obj -o $@ ${@:.o=.ll}
+
+# The man page is exported from README.org by Emacs (ox-man) and then
+# post-processed with sed; when EMACS is empty the 'man' target does nothing.
+.PHONY: man
+ifeq ($(EMACS),)
+man: ;
+else
+man: $(MAN_PAGE)
+$(MAN_OBJ): README.org $(LIBMK)
+	$(Q)$(EMACS) -Q --batch --find-file $< --eval "(progn (require 'ox-man)(org-man-export-to-man))"
+	$(Q)touch -r $< $@
+
+$(MAN_PAGE): $(MAN_OBJ) $(LIBMK)
+	$(QUIET_GEN)MODDATE=$$(git log -1 --pretty="format:%cI" README.org 2>/dev/null); \
+	[ "$$?" -eq "0" ] && DATE=$$(date '+%B %_d, %Y' -d "$$MODDATE") || DATE=$$(date '+%B %_d, %Y'); \
+	sed -e "1 s/DATE/$$DATE/" -e "1 s/VERSION/v$(TOOLS_VERSION)/" -e '1,5 s/^.SH "\([^"]\+\) - \([^"]\+\)"/.SH "NAME"\n\1 \\- \2\n.SH "SYNOPSIS"/' $< > $@
+
+endif
+
+.PHONY: test
+test: all
+	$(Q)$(MAKE) -C $(TEST_DIR) run
diff --git a/lib/libxdp/README.org b/lib/libxdp/README.org
new file mode 100644
index 0000000..9ca7f2e
--- /dev/null
+++ b/lib/libxdp/README.org
@@ -0,0 +1,437 @@
+#+EXPORT_FILE_NAME: libxdp
+#+TITLE: libxdp
+#+OPTIONS: ^:nil
+#+MAN_CLASS_OPTIONS: :section-id "3\" \"DATE\" \"VERSION\" \"libxdp - library for loading XDP programs"
+# This file serves both as a README on github, and as the source for the man
+# page; the latter through the org-mode man page export support.
+# .
+# To export the man page, simply use the org-mode exporter; (require 'ox-man) if
+# it's not available. There's also a Makefile rule to export it.
+
+* libxdp - library for attaching XDP programs and using AF_XDP sockets
+
+This directory contains the files for the =libxdp= library for
+attaching XDP programs to network interfaces and using AF_XDP
+sockets. The library is fairly lightweight and relies on =libbpf= to
+do the heavy lifting for processing eBPF object files etc.
+
+=Libxdp= provides two primary features on top of =libbpf=. The first is
+the ability to load multiple XDP programs in sequence on a single
+network device (which is not natively supported by the kernel). This
+support relies on the =freplace= functionality in the kernel, which
+makes it possible to attach an eBPF program as a replacement for a
+global function in another (already loaded) eBPF program. The second
+main feature is helper functions for configuring AF_XDP sockets as
+well as reading and writing packets from these sockets.
+ +Some of the functionality provided by libxdp depends on particular kernel +features; see the "Kernel and BPF program feature compatibility" section below for details. + +** Using libxdp from an application + +Basic usage of libxdp from an application is quite straightforward. The +following example loads, then unloads, an XDP program from the 'lo' interface: + +#+begin_src C +#define IFINDEX 1 + +struct xdp_program *prog; +int err; + +prog = xdp_program__open_file("my-program.o", "section_name", NULL); +err = xdp_program__attach(prog, IFINDEX, XDP_MODE_NATIVE, 0); + +if (!err) + xdp_program__detach(prog, IFINDEX, XDP_MODE_NATIVE, 0); + +xdp_program__close(prog); +#+end_src + +The =xdp_program= structure is an opaque structure that represents a single XDP +program. =libxdp= contains functions to create such a struct either from a BPF +object file on disk, from a =libbpf= BPF object, or from an identifier of a +program that is already loaded into the kernel: + +#+begin_src C +struct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, + const char *section_name); +struct xdp_program *xdp_program__find_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts); +struct xdp_program *xdp_program__open_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts); +struct xdp_program *xdp_program__from_fd(int fd); +struct xdp_program *xdp_program__from_id(__u32 prog_id); +struct xdp_program *xdp_program__from_pin(const char *pin_path); +#+end_src + +The functions that open a BPF object or file need the function name of the XDP +program as well as the file name or object, since an ELF file can contain +multiple XDP programs. The =xdp_program__find_file()= function takes a filename +without a path, and will look for the object in =LIBXDP_OBJECT_PATH= which +defaults to =/usr/lib/bpf= (or =/usr/lib64/bpf= on systems using a split library +path).
This is convenient for applications shipping pre-compiled eBPF object +files. + +The =xdp_program__attach()= function will attach the program to an interface, +building a dispatcher program to execute it. Multiple programs can be attached +at once with =xdp_program__attach_multi()=; they will be sorted in order of +their run priority, and execution from one program to the next will proceed +based on the chain call actions defined for each program (see the *Program +metadata* section below). Because the loading process involves modifying the +attach type of the program, the attach functions only work with =struct +xdp_program= objects that have not yet been loaded into the kernel. + +When using the attach functions to attach to an interface that already has an +XDP program loaded, libxdp will attempt to add the program to the list of loaded +programs. However, this may fail, either due to missing kernel support, or +because the already-attached program was not loaded using a dispatcher +compatible with libxdp. If the kernel support for incremental attach (merged in +kernel 5.10) is missing, the only way to actually run multiple programs on a +single interface is to attach them all at the same time with +=xdp_program__attach_multi()=. If the existing program is not an XDP dispatcher, +that program will have to be detached from the interface before libxdp can +attach a new one. This can be done by calling =xdp_program__detach()= with a +reference to the loaded program; but note that this will of course break any +application relying on that other XDP program to be present. + +* Program metadata + +To support multiple XDP programs on the same interface, libxdp uses two pieces +of metadata for each XDP program: Run priority and chain call actions. + +*** Run priority +This is the priority of the program and is a simple integer used +to sort programs when loading multiple programs onto the same interface. 
+Programs that wish to run early (such as a packet filter) should set low values +for this, while programs that want to run later (such as a packet forwarder or +counter) should set higher values. Note that later programs are only run if the +previous programs end with a return code that is part of their chain call actions +(see below). If not specified, the default priority value is 50. + +*** Chain call actions +These are the program return codes that the program indicates for packets that +should continue processing. If the program returns one of these actions, later +programs in the call chain will be run, whereas if it returns any other action, +processing will be interrupted, and the XDP dispatcher will return the verdict +immediately. If not set, this defaults to just XDP_PASS, which is likely the +value most programs should use. + +*** Specifying metadata +The metadata outlined above is specified as BTF information embedded in the ELF +file containing the XDP program. The =xdp_helpers.h= file shipped with libxdp +contains helper macros to include this information, which can be used as +follows: + +#+begin_src C +#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +struct { + __uint(priority, 10); + __uint(XDP_PASS, 1); + __uint(XDP_DROP, 1); +} XDP_RUN_CONFIG(my_xdp_func); +#+end_src + +This example specifies that the XDP program in =my_xdp_func= should have +priority 10 and that its chain call actions are =XDP_PASS= and =XDP_DROP=. +In a source file with multiple XDP programs in the same file, a definition like +the above can be included for each program (main XDP function). Any program that +does not specify any config information will use the default values outlined +above. + +*** Inspecting and modifying metadata + +=libxdp= exposes the following functions that an application can use to inspect +and modify the metadata on an XDP program. Modification is only possible before +a program is attached on an interface.
These functions won't modify the BTF +information itself, but the new values will be stored as part of the program +attachment. + +#+begin_src C +unsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog); +int xdp_program__set_run_prio(struct xdp_program *xdp_prog, + unsigned int run_prio); +bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog, + enum xdp_action action); +int xdp_program__set_chain_call_enabled(struct xdp_program *prog, + unsigned int action, + bool enabled); +int xdp_program__print_chain_call_actions(const struct xdp_program *prog, + char *buf, + size_t buf_len); +#+end_src + +* The dispatcher program +To support multiple non-offloaded programs on the same network interface, +=libxdp= uses a *dispatcher program* which is a small wrapper program that will +call each component program in turn, inspect the return code, and then chain call +to the next program based on the chain call actions of the previous program (see +the *Program metadata* section above). + +While applications using =libxdp= do not need to know the details of the +dispatcher program to just load an XDP program onto an interface, =libxdp= does +expose the dispatcher and its attached component programs, which can be used to +list the programs currently attached to an interface. + +The structure used for this is =struct xdp_multiprog=, which can only be +constructed from the programs loaded on an interface based on ifindex.
The API +for getting a multiprog reference and iterating through the attached programs +looks like this: + +#+begin_src C +struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex); +struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, + const struct xdp_multiprog *mp); +void xdp_multiprog__close(struct xdp_multiprog *mp); +int xdp_multiprog__detach(struct xdp_multiprog *mp, int ifindex); +enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp); +struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp); +struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp); +bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp); +#+end_src + +If a non-offloaded program is attached to the interface which =libxdp= doesn't +recognise as a dispatcher program, an =xdp_multiprog= structure will still be +returned, and =xdp_multiprog__is_legacy()= will return true for that program +(note that this also holds true if only an offloaded program is loaded). A +reference to that (regular) XDP program can be obtained by +=xdp_multiprog__main_prog()=. If the program attached to the interface *is* a +dispatcher program, =xdp_multiprog__main_prog()= will return a reference to the +dispatcher program itself, which is mainly useful for obtaining other data about +that program (such as the program ID). A reference to an offloaded program can +be acquired using =xdp_multiprog__hw_prog()=. Function +=xdp_multiprog__attach_mode()= returns the attach mode of the non-offloaded +program; whether an offloaded program is attached should be checked through +=xdp_multiprog__hw_prog()=. + +** Pinning in bpffs +The kernel will automatically detach component programs from the dispatcher once +the last reference to them disappears. To prevent this from happening, =libxdp= +will pin the component program references in =bpffs= before attaching the +dispatcher to the network interface.
The pathnames generated for pinning are as +follows: + +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference +- etc, up to ten component programs + +If set, the =LIBXDP_BPFFS= environment variable will override the location of +=bpffs=, but the =xdp= subdirectory is always used. If no =bpffs= is mounted, +libxdp will consult the environment variable =LIBXDP_BPFFS_AUTOMOUNT=. If this +is set to =1=, libxdp will attempt to automount a bpffs. If not, libxdp will +fall back to loading a single program without a dispatcher, as if the kernel did +not support the features needed for multiprog attachment. + +* Using AF_XDP sockets + +Libxdp implements helper functions for configuring AF_XDP sockets as +well as reading and writing packets from these sockets. AF_XDP sockets +can be used to redirect packets to user-space at high rates from an +XDP program. Note that this functionality used to reside in libbpf, +but has now been moved over to libxdp as it is a better fit for this +library. As of the 1.0 release of libbpf, the AF_XDP socket support +will be removed and all future development will be performed +in libxdp instead. + +For an overview of AF_XDP sockets, please refer to this Linux Plumbers +paper +(http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf) +and the documentation in the Linux kernel +(Documentation/networking/af_xdp.rst or +https://www.kernel.org/doc/html/latest/networking/af_xdp.html).
+ +For an example on how to use the interface, take a look at the AF_XDP-example +and AF_XDP-forwarding programs in the bpf-examples repository: +https://github.com/xdp-project/bpf-examples. + +** Control path + +Libxdp provides helper functions for creating and destroying umems and +sockets as shown below. The first thing that a user generally wants to +do is to create a umem area. This is the area that will contain all +packets received and the ones that are going to be sent. After that, +AF_XDP sockets can be created tied to this umem. These can either be +sockets that have exclusive ownership of that umem through +xsk_socket__create() or shared with other sockets using +xsk_socket__create_shared(). There is one option called +XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD that can be set in the +libxdp_flags field (also called libbpf_flags for compatibility +reasons). This will make libxdp not load any XDP program or set any +BPF maps which is a must if users want to add their own XDP program. + +#+begin_src C +int xsk_umem__create(struct xsk_umem **umem, + void *umem_area, __u64 size, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *config); +int xsk_socket__create(struct xsk_socket **xsk, + const char *ifname, __u32 queue_id, + struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + const struct xsk_socket_config *config); +int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, + const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_socket_config *config); +int xsk_umem__delete(struct xsk_umem *umem); +void xsk_socket__delete(struct xsk_socket *xsk); +#+end_src + +There are also two helper functions to get the file descriptor of a +umem or a socket. These are needed when using standard Linux syscalls +such as poll(), recvmsg(), sendto(), etc.
+ +#+begin_src C +int xsk_umem__fd(const struct xsk_umem *umem); +int xsk_socket__fd(const struct xsk_socket *xsk); +#+end_src + +The control path also provides two APIs for setting up AF_XDP sockets when the +process that is going to use the AF_XDP socket is non-privileged. These two +functions perform the operations that require privileges and can be executed +from some form of control process that has the necessary privileges. The +xsk_socket__create executed on the non-privileged process will then skip these +two steps. For an example on how to use these, please take a look at the +AF_XDP-example program in the bpf-examples repository: +https://github.com/xdp-project/bpf-examples/tree/master/AF_XDP-example. + +#+begin_src C +int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); +#+end_src + +** Data path + +For performance reasons, all the data path functions are static inline +functions found in the xsk.h header file so they can be optimized into +the target application binary for best possible performance. There are +four FIFO rings of two main types: producer rings (fill and Tx) and +consumer rings (Rx and completion). The producer rings use +xsk_ring_prod functions and consumer rings use xsk_ring_cons +functions. For producer rings, you start with =reserving= one or more +slots in a producer ring and then when they have been filled out, you +=submit= them so that the kernel will act on them. For a consumer +ring, you =peek= if there are any new packets in the ring and if so +you can read them from the ring. Once you are done reading them, you +=release= them back to the kernel so it can use them for new +packets. There is also a =cancel= operation for consumer rings if the +application does not want to consume all packets received with the +peek operation. 
+ +#+begin_src C +__u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx); +void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); +__u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx); +void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); +void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); +#+end_src + +The functions below are used for reading and writing the descriptors +of the rings. xsk_ring_prod__fill_addr() and xsk_ring_prod__tx_desc() +*write* entries in the fill and Tx rings respectively, while +xsk_ring_cons__comp_addr and xsk_ring_cons__rx_desc *read* entries from +the completion and Rx rings respectively. The =idx= is the parameter +returned in the xsk_ring_prod__reserve or xsk_ring_cons__peek +calls. To advance to the next entry, simply do =idx++=. + +#+begin_src C +__u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx); +struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx); +const __u64 *xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); +const struct xdp_desc *xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); +#+end_src + +The xsk_umem functions are used to get a pointer to the packet data +itself, always located inside the umem. In the default aligned mode, +you can get the addr variable straight from the Rx descriptor. But in +unaligned mode, you need to use the last three functions below as the +offset used is carried in the upper 16 bits of the addr. Therefore, +you cannot use the addr straight from the descriptor in the unaligned +case. + +#+begin_src C +void *xsk_umem__get_data(void *umem_area, __u64 addr); +__u64 xsk_umem__extract_addr(__u64 addr); +__u64 xsk_umem__extract_offset(__u64 addr); +__u64 xsk_umem__add_offset_to_addr(__u64 addr); +#+end_src + +There is one more function in the data path and that checks if the +need_wakeup flag is set.
Use of this flag is highly encouraged and +should be enabled by setting the =XDP_USE_NEED_WAKEUP= bit in the +=xdp_bind_flags= field that is provided to the +xsk_socket__create[_shared]() calls. If this function returns true, +then you need to call =recvmsg()=, =sendto()=, or =poll()= depending on the +situation. =recvmsg()= if you are *receiving*, or =sendto()= if you are +*sending*. =poll()= can be used for both cases and provides the ability to +sleep too, as with any other socket. But note that poll is a slower +operation than the other two. + +#+begin_src C +int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); +#+end_src + +For an example on how to use all these APIs, take a look at the AF_XDP-example +and AF_XDP-forwarding programs in the bpf-examples repository: +https://github.com/xdp-project/bpf-examples. + +* Kernel and BPF program feature compatibility + +The features exposed by libxdp rely on certain kernel versions and BPF +features to work. To get the full benefit of all features, libxdp needs to be +used with kernel 5.10 or newer, unless the commits mentioned below have been +backported. However, libxdp will probe the kernel and transparently fall back to +legacy loading procedures, so it is possible to use the library with older +versions, although some features will be unavailable, as detailed below. + +The ability to attach multiple BPF programs to a single interface relies on the +kernel "BPF program extension" feature which was introduced by commit +be8704ff07d2 ("bpf: Introduce dynamic program extensions") in the upstream +kernel and first appeared in kernel release 5.6. To *incrementally* attach +multiple programs, a further refinement added by commit 4a1e7c0c63e0 ("bpf: +Support attaching freplace programs to multiple attach points") is needed; this +first appeared in the upstream kernel version 5.10. The functionality relies on +the "BPF trampolines" feature which is unfortunately only available on the +x86_64 architecture.
In other words, kernels before 5.6 can only attach a single +XDP program to each interface, kernels 5.6+ can attach multiple programs if they +are all attached at the same time, and kernels 5.10+ have full support for XDP +multiprog on x86_64. On other architectures, only a single program can be +attached to each interface. + +To load AF_XDP programs, kernel support for AF_XDP sockets needs to be included +and enabled in the kernel build. In addition, when using AF_XDP sockets, an XDP +program is also loaded on the interface. The XDP program used for this by libxdp +requires the ability to do map lookups into XSK maps, which was introduced with +commit fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap") in kernel +5.3. This means that the minimum required kernel version for using AF_XDP is +kernel 5.3; however, for the AF_XDP XDP program to co-exist with other programs, +the same constraints for multiprog apply as outlined above. + +Note that some Linux distributions backport features to earlier kernel versions, +especially in enterprise kernels; for instance, Red Hat Enterprise Linux kernels +include everything needed for libxdp to function since RHEL 8.5. + +Finally, XDP programs loaded using the multiprog facility must include type +information (using the BPF Type Format, BTF). To get this, compile the programs +with a recent version of Clang/LLVM (version 10+), and enable debug information +when compiling (using the =-g= option). + +* BUGS +Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues + +* AUTHORS +libxdp and this man page were written by Toke +Høiland-Jørgensen. AF_XDP support and documentation was contributed by +Magnus Karlsson.
diff --git a/lib/libxdp/bpf_instr.h b/lib/libxdp/bpf_instr.h
new file mode 100644
index 0000000..ff1a396
--- /dev/null
+++ b/lib/libxdp/bpf_instr.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * Helper macros for building eBPF instructions by hand. Each macro expands
+ * to a struct bpf_insn compound literal; BPF_LD_IMM64_RAW_FULL (and the
+ * BPF_LD_MAP_* wrappers around it) expand to two comma-separated literals,
+ * since a 64-bit immediate load occupies two instruction slots.
+ */
+
+#ifndef __BPF_INSTR_H
+#define __BPF_INSTR_H
+
+#include <linux/bpf.h>
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+	((struct bpf_insn) { \
+		.code = CODE, \
+		.dst_reg = DST, \
+		.src_reg = SRC, \
+		.off = OFF, \
+		.imm = IMM })
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+	((struct bpf_insn) { \
+		.code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = 0, \
+		.imm = IMM })
+
+#define BPF_MOV64_IMM(DST, IMM) \
+	((struct bpf_insn) { \
+		.code = BPF_ALU64 | BPF_MOV | BPF_K, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = 0, \
+		.imm = IMM })
+
+#define BPF_EXIT_INSN() \
+	((struct bpf_insn) { \
+		.code = BPF_JMP | BPF_EXIT, \
+		.dst_reg = 0, \
+		.src_reg = 0, \
+		.off = 0, \
+		.imm = 0 })
+
+#define BPF_EMIT_CALL(FUNC) \
+	((struct bpf_insn) { \
+		.code = BPF_JMP | BPF_CALL, \
+		.dst_reg = 0, \
+		.src_reg = 0, \
+		.off = 0, \
+		.imm = ((FUNC) - BPF_FUNC_unspec) })
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+	((struct bpf_insn) { \
+		.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+		.dst_reg = DST, \
+		.src_reg = SRC, \
+		.off = OFF, \
+		.imm = 0 })
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+	((struct bpf_insn) { \
+		.code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+		.dst_reg = DST, \
+		.src_reg = SRC, \
+		.off = OFF, \
+		.imm = 0 })
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+	((struct bpf_insn) { \
+		.code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = OFF, \
+		.imm = IMM })
+
+#define BPF_MOV64_REG(DST, SRC) \
+	((struct bpf_insn) { \
+		.code = BPF_ALU64 | BPF_MOV | BPF_X, \
+		.dst_reg = DST, \
+		.src_reg = SRC, \
+		.off = 0, \
+		.imm = 0 })
+
+#define BPF_MOV32_IMM(DST, IMM) \
+	((struct bpf_insn) { \
+		.code = BPF_ALU | BPF_MOV | BPF_K, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = 0, \
+		.imm = IMM })
+
+#define BPF_LD_IMM64_RAW_FULL(DST, SRC, OFF1, OFF2, IMM1, IMM2) \
+	((struct bpf_insn) { \
+		.code = BPF_LD | BPF_DW | BPF_IMM, \
+		.dst_reg = DST, \
+		.src_reg = SRC, \
+		.off = OFF1, \
+		.imm = IMM1 }), \
+	((struct bpf_insn) { \
+		.code = 0, \
+		.dst_reg = 0, \
+		.src_reg = 0, \
+		.off = OFF2, \
+		.imm = IMM2 })
+
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+	BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_FD, 0, 0, \
+			      MAP_FD, 0)
+
+#define BPF_LD_MAP_VALUE(DST, MAP_FD, VALUE_OFF) \
+	BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \
+			      MAP_FD, VALUE_OFF)
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+	((struct bpf_insn) { \
+		.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = OFF, \
+		.imm = IMM })
+
+#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
+	((struct bpf_insn) { \
+		.code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \
+		.dst_reg = DST, \
+		.src_reg = 0, \
+		.off = OFF, \
+		.imm = IMM })
+
+#endif
diff --git a/lib/libxdp/compat.h b/lib/libxdp/compat.h
new file mode 100644
index 0000000..6e9bc34
--- /dev/null
+++ b/lib/libxdp/compat.h
@@ -0,0 +1,23 @@
+#ifndef __COMPAT_H
+#define __COMPAT_H
+
+#ifndef HAVE_SECURE_GETENV
+#include <stdlib.h>
+#include <sys/auxv.h>
+
+/*
+ * Fallback for C libraries that do not provide secure_getenv(): behaves like
+ * getenv(), but returns NULL when the process runs in secure-execution mode
+ * (AT_SECURE set in the auxiliary vector, e.g. for set-user-ID programs).
+ *
+ * Based on https://www.openwall.com/lists/musl/2019/05/28/3, which tests
+ * libc.secure. That symbol is internal to musl and does not exist in
+ * application code, so the same flag is read through getauxval() instead.
+ */
+static inline char *secure_getenv(const char *name)
+{
+	return getauxval(AT_SECURE) ? NULL : getenv(name);
+}
+#endif
+
+#endif
diff --git a/lib/libxdp/libxdp.3 b/lib/libxdp/libxdp.3
new file mode 100644
index 0000000..800d021
--- /dev/null
+++ b/lib/libxdp/libxdp.3
@@ -0,0 +1,503 @@
+.TH "libxdp" "3" "November 17, 2022" "v1.3.1" "libxdp - library for loading XDP programs"
+
+.SH "NAME"
+libxdp \- library for attaching XDP programs and using AF_XDP sockets
+.SH "SYNOPSIS"
+.PP
+This directory contains the files for the \fIlibxdp\fP library for
+attaching XDP programs to network interfaces and using AF_XDP
+sockets.
The library is fairly lightweight and relies on \fIlibbpf\fP to +do the heavy lifting for processing eBPF object files etc. + +.PP +\fILibxdp\fP provides two primary features on top of \fIlibbpf\fP. The first is +the ability to load multiple XDP programs in sequence on a single +network device (which is not natively supported by the kernel). This +support relies on the \fIfreplace\fP functionality in the kernel, which +makes it possible to attach an eBPF program as a replacement for a +global function in another (already loaded) eBPF program. The second +main feature is helper functions for configuring AF_XDP sockets as +well as reading and writing packets from these sockets. + +.PP +Some of the functionality provided by libxdp depends on particular kernel +features; see the "Kernel and BPF program feature compatibility" section below for details. + +.SS "Using libxdp from an application" +.PP +Basic usage of libxdp from an application is quite straightforward. The +following example loads, then unloads, an XDP program from the 'lo' interface: + +.RS +.nf +\fC#define IFINDEX 1 + +struct xdp_program *prog; +int err; + +prog = xdp_program__open_file("my-program.o", "section_name", NULL); +err = xdp_program__attach(prog, IFINDEX, XDP_MODE_NATIVE, 0); + +if (!err) + xdp_program__detach(prog, IFINDEX, XDP_MODE_NATIVE, 0); + +xdp_program__close(prog); +\fP +.fi +.RE + +.PP +The \fIxdp_program\fP structure is an opaque structure that represents a single XDP +program. 
\fIlibxdp\fP contains functions to create such a struct either from a BPF +object file on disk, from a \fIlibbpf\fP BPF object, or from an identifier of a +program that is already loaded into the kernel: + +.RS +.nf +\fCstruct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, + const char *section_name); +struct xdp_program *xdp_program__find_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts); +struct xdp_program *xdp_program__open_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts); +struct xdp_program *xdp_program__from_fd(int fd); +struct xdp_program *xdp_program__from_id(__u32 prog_id); +struct xdp_program *xdp_program__from_pin(const char *pin_path); +\fP +.fi +.RE + +.PP +The functions that open a BPF object or file need the function name of the XDP +program as well as the file name or object, since an ELF file can contain +multiple XDP programs. The \fIxdp_program__find_file()\fP function takes a filename +without a path, and will look for the object in \fILIBXDP_OBJECT_PATH\fP which +defaults to \fI/usr/lib/bpf\fP (or \fI/usr/lib64/bpf\fP on systems using a split library +path). This is convenient for applications shipping pre-compiled eBPF object +files. + +.PP +The \fIxdp_program__attach()\fP function will attach the program to an interface, +building a dispatcher program to execute it. Multiple programs can be attached +at once with \fIxdp_program__attach_multi()\fP; they will be sorted in order of +their run priority, and execution from one program to the next will proceed +based on the chain call actions defined for each program (see the \fBProgram +metadata\fP section below). Because the loading process involves modifying the +attach type of the program, the attach functions only work with \fIstruct +xdp_program\fP objects that have not yet been loaded into the kernel. 
+ +.PP +When using the attach functions to attach to an interface that already has an +XDP program loaded, libxdp will attempt to add the program to the list of loaded +programs. However, this may fail, either due to missing kernel support, or +because the already-attached program was not loaded using a dispatcher +compatible with libxdp. If the kernel support for incremental attach (merged in +kernel 5.10) is missing, the only way to actually run multiple programs on a +single interface is to attach them all at the same time with +\fIxdp_program__attach_multi()\fP. If the existing program is not an XDP dispatcher, +that program will have to be detached from the interface before libxdp can +attach a new one. This can be done by calling \fIxdp_program__detach()\fP with a +reference to the loaded program; but note that this will of course break any +application relying on that other XDP program to be present. + +.SH "Program metadata" +.PP +To support multiple XDP programs on the same interface, libxdp uses two pieces +of metadata for each XDP program: Run priority and chain call actions. + +.SS "Run priority" +.PP +This is the priority of the program and is a simple integer used +to sort programs when loading multiple programs onto the same interface. +Programs that wish to run early (such as a packet filter) should set low values +for this, while programs that want to run later (such as a packet forwarder or +counter) should set higher values. Note that later programs are only run if the +previous programs end with a return code that is part of their chain call actions +(see below). If not specified, the default priority value is 50. + +.SS "Chain call actions" +.PP +These are the program return codes that the program indicates for packets that +should continue processing. 
If the program returns one of these actions, later +programs in the call chain will be run, whereas if it returns any other action, +processing will be interrupted, and the XDP dispatcher will return the verdict +immediately. If not set, this defaults to just XDP_PASS, which is likely the +value most programs should use. + +.SS "Specifying metadata" +.PP +The metadata outlined above is specified as BTF information embedded in the ELF +file containing the XDP program. The \fIxdp_helpers.h\fP file shipped with libxdp +contains helper macros to include this information, which can be used as +follows: + +.RS +.nf +\fC#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +struct { + __uint(priority, 10); + __uint(XDP_PASS, 1); + __uint(XDP_DROP, 1); +} XDP_RUN_CONFIG(my_xdp_func); +\fP +.fi +.RE + +.PP +This example specifies that the XDP program in \fImy_xdp_func\fP should have +priority 10 and that its chain call actions are \fIXDP_PASS\fP and \fIXDP_DROP\fP. +In a source file with multiple XDP programs in the same file, a definition like +the above can be included for each program (main XDP function). Any program that +does not specify any config information will use the default values outlined +above. + +.SS "Inspecting and modifying metadata" +.PP +\fIlibxdp\fP exposes the following functions that an application can use to inspect +and modify the metadata on an XDP program. Modification is only possible before +a program is attached on an interface. These functions won't modify the BTF +information itself, but the new values will be stored as part of the program +attachment. 
+ +.RS +.nf +\fCunsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog); +int xdp_program__set_run_prio(struct xdp_program *xdp_prog, + unsigned int run_prio); +bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog, + enum xdp_action action); +int xdp_program__set_chain_call_enabled(struct xdp_program *prog, + unsigned int action, + bool enabled); +int xdp_program__print_chain_call_actions(const struct xdp_program *prog, + char *buf, + size_t buf_len); +\fP +.fi +.RE + +.SH "The dispatcher program" +.PP +To support multiple non-offloaded programs on the same network interface, +\fIlibxdp\fP uses a \fBdispatcher program\fP which is a small wrapper program that will +call each component program in turn, inspect the return code, and then chain call +to the next program based on the chain call actions of the previous program (see +the \fBProgram metadata\fP section above). + +.PP +While applications using \fIlibxdp\fP do not need to know the details of the +dispatcher program to just load an XDP program onto an interface, \fIlibxdp\fP does +expose the dispatcher and its attached component programs, which can be used to +list the programs currently attached to an interface. + +.PP +The structure used for this is \fIstruct xdp_multiprog\fP, which can only be +constructed from the programs loaded on an interface based on ifindex. 
The API +for getting a multiprog reference and iterating through the attached programs +looks like this: + +.RS +.nf +\fCstruct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex); +struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, + const struct xdp_multiprog *mp); +void xdp_multiprog__close(struct xdp_multiprog *mp); +int xdp_multiprog__detach(struct xdp_multiprog *mp, int ifindex); +enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp); +struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp); +struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp); +bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp); +\fP +.fi +.RE + +.PP +If a non-offloaded program is attached to the interface which \fIlibxdp\fP doesn't +recognise as a dispatcher program, an \fIxdp_multiprog\fP structure will still be +returned, and \fIxdp_multiprog__is_legacy()\fP will return true for that program +(note that this also holds true if only an offloaded program is loaded). A +reference to that (regular) XDP program can be obtained by +\fIxdp_multiprog__main_prog()\fP. If the program attached to the interface \fBis\fP a +dispatcher program, \fIxdp_multiprog__main_prog()\fP will return a reference to the +dispatcher program itself, which is mainly useful for obtaining other data about +that program (such as the program ID). A reference to an offloaded program can +be acquired using \fIxdp_multiprog__hw_prog()\fP. The function +\fIxdp_multiprog__attach_mode()\fP returns the attach mode of the non-offloaded +program; whether an offloaded program is attached should be checked through +\fIxdp_multiprog__hw_prog()\fP. + +.SS "Pinning in bpffs" +.PP +The kernel will automatically detach component programs from the dispatcher once +the last reference to them disappears. 
To prevent this from happening, \fIlibxdp\fP +will pin the component program references in \fIbpffs\fP before attaching the +dispatcher to the network interface. The pathnames generated for pinning are as +follows: + +.IP \(em 4 +/sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID +.IP \(em 4 +/sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference +.IP \(em 4 +/sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference +.IP \(em 4 +/sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference +.IP \(em 4 +/sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference +.IP \(em 4 +etc, up to ten component programs + +.PP +If set, the \fILIBXDP_BPFFS\fP environment variable will override the location of +\fIbpffs\fP, but the \fIxdp\fP subdirectory is always used. If no \fIbpffs\fP is mounted, +libxdp will consult the environment variable \fILIBXDP_BPFFS_AUTOMOUNT\fP. If this +is set to \fI1\fP, libxdp will attempt to automount a bpffs. If not, libxdp will +fall back to loading a single program without a dispatcher, as if the kernel did +not support the features needed for multiprog attachment. + +.SH "Using AF_XDP sockets" +.PP +Libxdp implements helper functions for configuring AF_XDP sockets as +well as reading and writing packets from these sockets. AF_XDP sockets +can be used to redirect packets to user-space at high rates from an +XDP program. Note that this functionality used to reside in libbpf, +but has now been moved over to libxdp as it is a better fit for this +library. As of the 1.0 release of libbpf, the AF_XDP socket support +will be removed and all future development will be performed +in libxdp instead. 
+ +.PP +For an overview of AF_XDP sockets, please refer to this Linux Plumbers +paper +(\fIhttp://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf\fP) +and the documentation in the Linux kernel +(Documentation/networking/af_xdp.rst or +\fIhttps://www.kernel.org/doc/html/latest/networking/af_xdp.html\fP). + +.PP +For an example on how to use the interface, take a look at the AF_XDP-example +and AF_XDP-forwarding programs in the bpf-examples repository: +\fIhttps://github.com/xdp-project/bpf-examples\fP. + +.SS "Control path" +.PP +Libxdp provides helper functions for creating and destroying umems and +sockets as shown below. The first thing that a user generally wants to +do is to create a umem area. This is the area that will contain all +packets received and the ones that are going to be sent. After that, +AF_XDP sockets can be created tied to this umem. These can either be +sockets that have exclusive ownership of that umem through +xsk_socket__create() or shared with other sockets using +xsk_socket__create_shared(). There is one option called +XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD that can be set in the +libxdp_flags field (also called libbpf_flags for compatibility +reasons). This will make libxdp not load any XDP program or set any +BPF maps, which is a must if users want to add their own XDP program. 
+ +.RS +.nf +\fCint xsk_umem__create(struct xsk_umem **umem, + void *umem_area, __u64 size, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *config); +int xsk_socket__create(struct xsk_socket **xsk, + const char *ifname, __u32 queue_id, + struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + const struct xsk_socket_config *config); +int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, + const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_socket_config *config); +int xsk_umem__delete(struct xsk_umem *umem); +void xsk_socket__delete(struct xsk_socket *xsk); +\fP +.fi +.RE + +.PP +There are also two helper functions to get the file descriptor of a +umem or a socket. These are needed when using standard Linux syscalls +such as poll(), recvmsg(), sendto(), etc. + +.RS +.nf +\fCint xsk_umem__fd(const struct xsk_umem *umem); +int xsk_socket__fd(const struct xsk_socket *xsk); +\fP +.fi +.RE + +.PP +The control path also provides two APIs for setting up AF_XDP sockets when the +process that is going to use the AF_XDP socket is non-privileged. These two +functions perform the operations that require privileges and can be executed +from some form of control process that has the necessary privileges. The +xsk_socket__create executed on the non-privileged process will then skip these +two steps. For an example on how to use these, please take a look at the +AF_XDP-example program in the bpf-examples repository: +\fIhttps://github.com/xdp-project/bpf-examples/tree/master/AF_XDP-example\fP. 
+ +.RS +.nf +\fCint xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); +\fP +.fi +.RE + +.SS "Data path" +.PP +For performance reasons, all the data path functions are static inline +functions found in the xsk.h header file so they can be optimized into +the target application binary for best possible performance. There are +four FIFO rings of two main types: producer rings (fill and Tx) and +consumer rings (Rx and completion). The producer rings use +xsk_ring_prod functions and consumer rings use xsk_ring_cons +functions. For producer rings, you start with \fIreserving\fP one or more +slots in a producer ring and then when they have been filled out, you +\fIsubmit\fP them so that the kernel will act on them. For a consumer +ring, you \fIpeek\fP if there are any new packets in the ring and if so +you can read them from the ring. Once you are done reading them, you +\fIrelease\fP them back to the kernel so it can use them for new +packets. There is also a \fIcancel\fP operation for consumer rings if the +application does not want to consume all packets received with the +peek operation. + +.RS +.nf +\fC__u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx); +void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); +__u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx); +void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); +void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); +\fP +.fi +.RE + +.PP +The functions below are used for reading and writing the descriptors +of the rings. xsk_ring_prod__fill_addr() and xsk_ring_prod__tx_desc() +\fBwrites\fP entries in the fill and Tx rings respectively, while +xsk_ring_cons__comp_addr and xsk_ring_cons__rx_desc \fBreads\fP entries from +the completion and Rx rings respectively. 
The \fIidx\fP is the parameter +returned in the xsk_ring_prod__reserve or xsk_ring_cons__peek +calls. To advance to the next entry, simply do \fIidx++\fP. + +.RS +.nf +\fC__u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx); +struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx); +const __u64 *xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); +const struct xdp_desc *xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); +\fP +.fi +.RE + +.PP +The xsk_umem functions are used to get a pointer to the packet data +itself, always located inside the umem. In the default aligned mode, +you can get the addr variable straight from the Rx descriptor. But in +unaligned mode, you need to use the last three functions below as the +offset used is carried in the upper 16 bits of the addr. Therefore, +you cannot use the addr straight from the descriptor in the unaligned +case. + +.RS +.nf +\fCvoid *xsk_umem__get_data(void *umem_area, __u64 addr); +__u64 xsk_umem__extract_addr(__u64 addr); +__u64 xsk_umem__extract_offset(__u64 addr); +__u64 xsk_umem__add_offset_to_addr(__u64 addr); +\fP +.fi +.RE + +.PP +There is one more function in the data path and that checks if the +need_wakeup flag is set. Use of this flag is highly encouraged and +should be enabled by setting the \fIXDP_USE_NEED_WAKEUP\fP bit in the +\fIxdp_bind_flags\fP field that is provided to the +xsk_socket__create[_shared]() calls. If this function returns true, +then you need to call \fIrecvmsg()\fP, \fIsendto()\fP, or \fIpoll()\fP depending on the +situation. \fIrecvmsg()\fP if you are \fBreceiving\fP, or \fIsendto()\fP if you are +\fBsending\fP. \fIpoll()\fP can be used for both cases and provides the ability to +sleep too, as with any other socket. But note that poll is a slower +operation than the other two. 
+ +.RS +.nf +\fCint xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); +\fP +.fi +.RE + +.PP +For an example on how to use all these APIs, take a look at the AF_XDP-example +and AF_XDP-forwarding programs in the bpf-examples repository: +\fIhttps://github.com/xdp-project/bpf-examples\fP. + +.SH "Kernel and BPF program feature compatibility" +.PP +The features exposed by libxdp rely on certain kernel versions and BPF +features to work. To get the full benefit of all features, libxdp needs to be +used with kernel 5.10 or newer, unless the commits mentioned below have been +backported. However, libxdp will probe the kernel and transparently fall back to +legacy loading procedures, so it is possible to use the library with older +versions, although some features will be unavailable, as detailed below. + +.PP +The ability to attach multiple BPF programs to a single interface relies on the +kernel "BPF program extension" feature which was introduced by commit +be8704ff07d2 ("bpf: Introduce dynamic program extensions") in the upstream +kernel and first appeared in kernel release 5.6. To \fBincrementally\fP attach +multiple programs, a further refinement added by commit 4a1e7c0c63e0 ("bpf: +Support attaching freplace programs to multiple attach points") is needed; this +first appeared in the upstream kernel version 5.10. The functionality relies on +the "BPF trampolines" feature which is unfortunately only available on the +x86_64 architecture. In other words, kernels before 5.6 can only attach a single +XDP program to each interface, kernels 5.6+ can attach multiple programs if they +are all attached at the same time, and kernels 5.10+ have full support for XDP +multiprog on x86_64. On other architectures, only a single program can be +attached to each interface. + +.PP +To load AF_XDP programs, kernel support for AF_XDP sockets needs to be included +and enabled in the kernel build. 
In addition, when using AF_XDP sockets, an XDP +program is also loaded on the interface. The XDP program used for this by libxdp +requires the ability to do map lookups into XSK maps, which was introduced with +commit fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap") in kernel +5.3. This means that the minimum required kernel version for using AF_XDP is +kernel 5.3; however, for the AF_XDP XDP program to co-exist with other programs, +the same constraints for multiprog apply as outlined above. + +.PP +Note that some Linux distributions backport features to earlier kernel versions, +especially in enterprise kernels; for instance, Red Hat Enterprise Linux kernels +include everything needed for libxdp to function since RHEL 8.5. + +.PP +Finally, XDP programs loaded using the multiprog facility must include type +information (using the BPF Type Format, BTF). To get this, compile the programs +with a recent version of Clang/LLVM (version 10+), and enable debug information +when compiling (using the \fI\-g\fP option). + +.SH "BUGS" +.PP +Please report any bugs on GitHub: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP + +.SH "AUTHORS" +.PP +libxdp and this man page were written by Toke +Høiland-Jørgensen. AF_XDP support and documentation were contributed by +Magnus Karlsson. 
diff --git a/lib/libxdp/libxdp.c b/lib/libxdp/libxdp.c new file mode 100644 index 0000000..9689457 --- /dev/null +++ b/lib/libxdp/libxdp.c @@ -0,0 +1,3408 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * XDP management utility functions + * + * Copyright (C) 2020 Toke Høiland-Jørgensen <toke@redhat.com> + */ + +/* Feature-test macros only take effect when defined before the first #include */ +#define _GNU_SOURCE + +#include <linux/bpf.h> + +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <fcntl.h> +#include <inttypes.h> +#include <dirent.h> + +#include <linux/err.h> /* ERR_PTR */ +#include <linux/if_link.h> +#include <linux/magic.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> +#include <xdp/libxdp.h> +#include <xdp/prog_dispatcher.h> + +#include "compat.h" +#include "libxdp_internal.h" + +#define XDP_RUN_CONFIG_SEC ".xdp_run_config" +#define XDP_SKIP_ENVVAR "LIBXDP_SKIP_DISPATCHER" + +/* When cloning BPF fds, we want to make sure they don't end up as any of the + * standard stdin, stderr, stdout descriptors: fd 0 can confuse the kernel, and + * there are orchestration systems that will force-close the others if they + * don't point to the "right" things. So just to be safe, use 3 as the minimum + * fd number. 
+ */ +#define MIN_FD 3 + +/* Max number of times we retry attachment */ +#define MAX_RETRY 10 + +#define IFINDEX_LO 1 + +static const char *dispatcher_feature_err = + "This means that the kernel does not support the features needed\n" + "by the multiprog dispatcher, either because it is too old entirely,\n" + "or because it is not yet supported on the current architecture.\n"; + +/* Library-side state for a single XDP program (the opaque handle of the + * public API). + */ +struct xdp_program { + /* one of bpf_prog or prog_fd should be set */ + struct bpf_program *bpf_prog; + struct bpf_object *bpf_obj; + struct btf *btf; + enum bpf_prog_type prog_type; + int prog_fd; + int link_fd; + char *prog_name; + char *attach_name; + __u8 prog_tag[BPF_TAG_SIZE]; + __u32 prog_id; + __u64 load_time; + bool from_external_obj; + bool is_frags; + unsigned int run_prio; + unsigned int chain_call_actions; /* bitmap */ + + /* for building list of attached programs to multiprog */ + struct xdp_program *next; +}; + +/* The programs attached to one interface: main_prog is the dispatcher (or a + * lone legacy program) and first_prog heads the list of component programs. + * NOTE(review): hw_prog is presumably the offloaded program - confirm. + */ +struct xdp_multiprog { + struct xdp_dispatcher_config config; + struct xdp_program *main_prog; /* dispatcher or legacy prog pointer */ + struct xdp_program *first_prog; /* uses xdp_program->next to build a list */ + struct xdp_program *hw_prog; + __u32 version; + size_t num_links; + bool is_loaded; + bool is_legacy; + bool kernel_frags_support; + bool checked_compat; + enum xdp_attach_mode attach_mode; + int ifindex; +}; + +/* Dispatcher config layout tagged as version 1. + * NOTE(review): presumably kept to read configs written by older dispatchers - + * confirm against the code that uses it. + */ +#define XDP_DISPATCHER_VERSION_V1 1 +struct xdp_dispatcher_config_v1 { + __u8 num_progs_enabled; /* Number of active program slots */ + __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; + __u32 run_prios[MAX_DISPATCHER_ACTIONS]; +}; + +/* Printable name of each XDP action, indexed by the action value itself */ +static const char *xdp_action_names[] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", +}; + +/* Forward declaration; called by xdp_program__find_embedded() below */ +static struct xdp_program *xdp_program__create_from_obj(struct bpf_object *obj, + const char *section_name, + const char *prog_name, + bool external); + +#ifdef LIBXDP_STATIC +struct xdp_embedded_obj { + const char *filename; + 
const void *data_start; + const void *data_end; +}; + +/* Start/end markers of the object images built into the library. + * NOTE(review): presumably emitted when the *.embed.o files are generated - + * confirm against the build rules. + */ +extern const char _binary_xdp_dispatcher_o_start; +extern const char _binary_xdp_dispatcher_o_end; +extern const char _binary_xsk_def_xdp_prog_o_start; +extern const char _binary_xsk_def_xdp_prog_o_end; +extern const char _binary_xsk_def_xdp_prog_5_3_o_start; +extern const char _binary_xsk_def_xdp_prog_5_3_o_end; + +/* Maps an object file name to its embedded image; an all-zero entry ends the table */ +static struct xdp_embedded_obj embedded_objs[] = { + {"xdp-dispatcher.o", &_binary_xdp_dispatcher_o_start, &_binary_xdp_dispatcher_o_end}, + {"xsk_def_xdp_prog.o", &_binary_xsk_def_xdp_prog_o_start, &_binary_xsk_def_xdp_prog_o_end}, + {"xsk_def_xdp_prog_5.3.o", &_binary_xsk_def_xdp_prog_5_3_o_start, &_binary_xsk_def_xdp_prog_5_3_o_end}, + {}, +}; +/* Look up filename in embedded_objs and, on a match, open the image with + * bpf_object__open_mem() and wrap it via xdp_program__create_from_obj(). + * + * Returns NULL when no embedded object has that name, or an ERR_PTR when + * opening the image fails. If the caller passed opts without an object_name, + * opts->object_name is set to filename. + */ +static struct xdp_program *xdp_program__find_embedded(const char *filename, + const char *section_name, + const char *prog_name, + struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, default_opts, + .object_name = filename, + ); + struct xdp_embedded_obj *eobj; + struct bpf_object *obj; + size_t size; + int err; + + for (eobj = &embedded_objs[0]; eobj->filename; eobj++) { + if (strcmp(filename, eobj->filename)) + continue; + + size = eobj->data_end - eobj->data_start; + + /* set the object name to the same as if we opened the file from + * the filesystem + */ + if (!opts) + opts = &default_opts; + else if (!opts->object_name) + opts->object_name = filename; + + pr_debug("Loading XDP program '%s' from embedded object file\n", filename); + + obj = bpf_object__open_mem(eobj->data_start, size, opts); + err = libbpf_get_error(obj); + if (err) + return ERR_PTR(err); + return xdp_program__create_from_obj(obj, section_name, prog_name, false); + } + + return NULL; +} +#else +/* Built without LIBXDP_STATIC: there are no embedded objects, so never a match */ +static inline struct xdp_program *xdp_program__find_embedded(__unused const char *filename, + __unused const char *section_name, + __unused const char *prog_name, + __unused struct bpf_object_open_opts *opts) +{ + return NULL; +} +#endif + +static int 
__base_pr(enum libxdp_print_level level, const char *format, + va_list args) +{ + /* default sink: debug messages are dropped, everything else goes to stderr */ + if (level == LIBXDP_DEBUG) + return 0; + + return vfprintf(stderr, format, args); +} + +/* Currently installed print callback; NULL makes libxdp_print() a no-op */ +static libxdp_print_fn_t __libxdp_pr = __base_pr; + +/* Install fn as the print callback and return the callback it replaces */ +libxdp_print_fn_t libxdp_set_print(libxdp_print_fn_t fn) +{ + libxdp_print_fn_t old_print_fn = __libxdp_pr; + + __libxdp_pr = fn; + return old_print_fn; +} + +/* Forward a printf-style message to the installed print callback, if any */ +__printf(2, 3) void libxdp_print(enum libxdp_print_level level, const char *format, ...) +{ + va_list args; + + if (!__libxdp_pr) + return; + + va_start(args, format); + __libxdp_pr(level, format, args); + va_end(args); +} + +/* NOTE(review): presumably caches the outcome of a kernel feature probe done + * elsewhere in this file - confirm. Starts out as COMPAT_UNKNOWN. + */ +static enum { + COMPAT_UNKNOWN, + COMPAT_SUPPORTED, + COMPAT_UNSUPPORTED +} kernel_compat = COMPAT_UNKNOWN; + +static int xdp_multiprog__attach(struct xdp_multiprog *old_mp, + struct xdp_multiprog *mp, + enum xdp_attach_mode mode); +static struct xdp_multiprog *xdp_multiprog__generate(struct xdp_program **progs, + size_t num_progs, + int ifindex, + struct xdp_multiprog *old_mp, + bool remove_progs); +static int xdp_multiprog__pin(struct xdp_multiprog *mp); +static int xdp_multiprog__unpin(struct xdp_multiprog *mp); + + +/* On NULL, libxdp always sets errno to 0 for old APIs, so that their + * compatibility is maintained wrt old libxdp_get_error that called the older + * version of libbpf_get_error which did PTR_ERR_OR_ZERO, but newer versions + * unconditionally return -errno on seeing NULL, as the libbpf practice changed + * to returning NULL or errors. + * + * The new APIs (like xdp_program__create) which indicate error using NULL set + * their errno when returning NULL. 
+ */ +long libxdp_get_error(const void *ptr) +{ + if (!IS_ERR_OR_NULL(ptr)) + return 0; + + /* an encoded error pointer overrides errno; for NULL, errno is used as-is */ + if (IS_ERR(ptr)) + errno = -PTR_ERR(ptr); + return -errno; +} + +/* Format err into buf using libbpf_strerror(); its result is passed through + * libxdp_err() before being returned. + */ +int libxdp_strerror(int err, char *buf, size_t size) +{ + return libxdp_err(libbpf_strerror(err, buf, size)); +} + +/* Like libxdp_strerror(), but always returns dst so the call can be used + * inline; if formatting fails, dst receives a generic fallback message. + */ +static char *libxdp_strerror_r(int err, char *dst, size_t size) +{ + int ret = libxdp_strerror(err, dst, size); + if (ret) + snprintf(dst, size, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} + +/* The shims below are compiled only when the corresponding HAVE_LIBBPF_* + * macro is undefined; each one wraps a differently named libbpf function. + */ +#ifndef HAVE_LIBBPF_BTF__LOAD_FROM_KERNEL_BY_ID +/* Wraps btf__get_from_id(); any error is reported as NULL */ +static struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + struct btf *btf; + int err; + + err = btf__get_from_id(id, &btf); + if (err) + return NULL; + return btf; +} +#endif + +#ifndef HAVE_LIBBPF_BTF__TYPE_CNT +static __u32 btf__type_cnt(const struct btf *btf) +{ + /* old function didn't include 'void' type in count */ + return btf__get_nr_types(btf) + 1; +} +#endif + +#ifndef HAVE_LIBBPF_BPF_OBJECT__NEXT_MAP +/* Wraps bpf_map__next(), which takes its arguments in the opposite order */ +static struct bpf_map *bpf_object__next_map(const struct bpf_object *obj, + const struct bpf_map *map) +{ + return bpf_map__next(map, obj); +} +#endif + +#ifndef HAVE_LIBBPF_BPF_OBJECT__NEXT_PROGRAM +/* Wraps bpf_program__next(), which takes its arguments in the opposite order */ +static struct bpf_program *bpf_object__next_program(const struct bpf_object *obj, + struct bpf_program *prog) +{ + return bpf_program__next(prog, obj); +} +#endif + +#ifndef HAVE_LIBBPF_BPF_PROGRAM__INSN_CNT +#define BPF_INSN_SZ (sizeof(struct bpf_insn)) +/* Derives the instruction count by dividing bpf_program__size() by the size of one instruction */ +static size_t bpf_program__insn_cnt(const struct bpf_program *prog) +{ + size_t sz; + + sz = bpf_program__size(prog); + return sz / BPF_INSN_SZ; +} +#endif + +/* This function has been deprecated in libbpf, but we expose an API that uses + * section names, so we reimplement it to keep compatibility + */ +static struct bpf_program * +bpf_program_by_section_name(const struct bpf_object *obj, + const char *section_name) +{ + struct bpf_program *pos; + const char *sname; + + bpf_object__for_each_program(pos, obj) { + sname = bpf_program__section_name(pos); + if (sname && !strcmp(sname, 
section_name)) + return pos; + } + return NULL; +} + +/* Return true if mnt can be statfs()'ed and is a BPF filesystem */ +static bool bpf_is_valid_mntpt(const char *mnt) +{ + struct statfs st_fs; + + if (statfs(mnt, &st_fs) < 0) + return false; + if ((unsigned long)st_fs.f_type != BPF_FS_MAGIC) + return false; + + return true; +} + +/* Mount a bpffs on target after making the mount point private. + * + * If making it private fails with EINVAL, target is bind-mounted onto itself + * once and the operation is retried. Returns 0 or a negative errno. + */ +static int bpf_mnt_fs(const char *target) +{ + bool bind_done = false; + int err; + +retry: + err = mount("", target, "none", MS_PRIVATE | MS_REC, NULL); + if (err) { + if (errno != EINVAL || bind_done) { + err = -errno; + pr_warn("mount --make-private %s failed: %s\n", + target, strerror(-err)); + return err; + } + + err = mount(target, target, "none", MS_BIND, NULL); + if (err) { + err = -errno; + pr_warn("mount --bind %s %s failed: %s\n", + target, target, strerror(-err)); + return err; + } + + bind_done = true; + goto retry; + } + + err = mount("bpf", target, "bpf", 0, "mode=0700"); + if (err) { + err = -errno; + pr_warn("mount -t bpf bpf %s failed: %s\n", + target, strerror(-err)); + return err; + } + + return 0; +} + +/* Check that mntpt holds a bpffs, optionally mounting one when it does not. + * + * On success mntpt is copied (truncated to len - 1 bytes) into mnt, which is + * returned; NULL is returned when no bpffs is there and mounting was not + * requested or failed. + */ +static const char *bpf_find_mntpt_single(char *mnt, int len, const char *mntpt, bool mount) +{ + int err; + + if (!bpf_is_valid_mntpt(mntpt)) { + if (!mount) + return NULL; + + pr_debug("No bpffs found at %s, mounting a new one\n", + mntpt); + + err = bpf_mnt_fs(mntpt); + if (err) + return NULL; + } + + strncpy(mnt, mntpt, len - 1); + mnt[len - 1] = '\0'; + return mnt; +} + +/* Locate the bpffs to use, caching a successful result for later calls. + * + * The location comes from the XDP_BPFFS_ENVVAR environment variable, falling + * back to BPF_DIR_MNT; mounting is attempted only when XDP_BPFFS_MOUNT_ENVVAR + * is exactly "1". Returns NULL (without caching) when no bpffs is available. + */ +static const char *find_bpffs(void) +{ + static bool bpf_mnt_cached = false; + static char bpf_wrk_dir[PATH_MAX]; + static const char *mnt = NULL; + char *envdir, *envval; + bool mount = false; + + if (bpf_mnt_cached) + return mnt; + + envdir = secure_getenv(XDP_BPFFS_ENVVAR); + envval = secure_getenv(XDP_BPFFS_MOUNT_ENVVAR); + if (envval && envval[0] == '1' && envval[1] == '\0') + mount = true; + + mnt = bpf_find_mntpt_single(bpf_wrk_dir, + sizeof(bpf_wrk_dir), + envdir ?: BPF_DIR_MNT, + mount); + if (!mnt) + pr_warn("No bpffs found at %s\n", envdir ?: BPF_DIR_MNT); + else + bpf_mnt_cached = true; + + return mnt; +} + 
+static int mk_state_subdir(char *dir, size_t dir_sz, const char *parent) +{ + int err; + + err = try_snprintf(dir, dir_sz, "%s/xdp", parent); + if (err) + return err; + + err = mkdir(dir, S_IRWXU); + if (err && errno != EEXIST) + return -errno; + + return 0; +} + +static const char *get_bpffs_dir(void) +{ + static char bpffs_dir[PATH_MAX]; + static const char *dir = NULL; + const char *parent; + int err; + + if (dir) + return dir; + + parent = find_bpffs(); + if (!parent) { + err = -ENOENT; + goto err; + } + + err = mk_state_subdir(bpffs_dir, sizeof(bpffs_dir), parent); + if (err) + goto err; + + dir = bpffs_dir; + return dir; +err: + return ERR_PTR(err); +} + +static const char *get_lock_dir(void) +{ + static const char *dir = NULL; + static char rundir[PATH_MAX]; + int err; + + if (dir) + return dir; + + dir = get_bpffs_dir(); + if (!IS_ERR(dir)) + return dir; + + err = mk_state_subdir(rundir, sizeof(rundir), RUNDIR); + if (err) + return ERR_PTR(err); + + dir = rundir; + return dir; +} + +int xdp_lock_acquire(void) +{ + int lock_fd, err; + const char *dir; + + dir = get_lock_dir(); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + lock_fd = open(dir, O_DIRECTORY); + if (lock_fd < 0) { + err = -errno; + pr_warn("Couldn't open lock directory at %s: %s\n", + dir, strerror(-err)); + return err; + } + + err = flock(lock_fd, LOCK_EX); + if (err) { + err = -errno; + pr_warn("Couldn't flock fd %d: %s\n", lock_fd, strerror(-err)); + close(lock_fd); + return err; + } + + pr_debug("Acquired lock from %s with fd %d\n", dir, lock_fd); + return lock_fd; +} + +int xdp_lock_release(int lock_fd) +{ + int err; + + err = flock(lock_fd, LOCK_UN); + if (err) { + err = -errno; + pr_warn("Couldn't unlock fd %d: %s\n", lock_fd, strerror(-err)); + } else { + pr_debug("Released lock fd %d\n", lock_fd); + } + close(lock_fd); + return err; +} + +static int do_xdp_attach(int ifindex, int prog_fd, int old_fd, __u32 xdp_flags) +{ +#ifdef HAVE_LIBBPF_BPF_XDP_ATTACH + 
LIBBPF_OPTS(bpf_xdp_attach_opts, opts, + .old_prog_fd = old_fd); + return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, &opts); +#else + DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = old_fd); + return bpf_set_link_xdp_fd_opts(ifindex, prog_fd, xdp_flags, old_fd ? &opts : NULL); +#endif +} + +int xdp_attach_fd(int prog_fd, int old_fd, int ifindex, + enum xdp_attach_mode mode) +{ + int err = 0, xdp_flags = 0; + + pr_debug("Replacing XDP fd %d with %d on ifindex %d\n", + old_fd, prog_fd, ifindex); + + if (old_fd == -1) { + xdp_flags |= XDP_FLAGS_UPDATE_IF_NOEXIST; + old_fd = 0; + } + + switch (mode) { + case XDP_MODE_SKB: + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case XDP_MODE_NATIVE: + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + case XDP_MODE_HW: + xdp_flags |= XDP_FLAGS_HW_MODE; + break; + case XDP_MODE_UNSPEC: + break; + } +again: + err = do_xdp_attach(ifindex, prog_fd, old_fd, xdp_flags); + if (err < 0) { + if (err == -EINVAL && old_fd) { + pr_debug("Got 'invalid argument', trying again without old_fd\n"); + old_fd = 0; + goto again; + } + pr_info("Error attaching XDP program to ifindex %d: %s\n", + ifindex, strerror(-err)); + + if (err == -EEXIST && old_fd) + /* We raced with another attach/detach, have to retry */ + return -EAGAIN; + + switch (-err) { + case EBUSY: + case EEXIST: + pr_info("XDP already loaded on device\n"); + break; + case EOPNOTSUPP: + pr_info("XDP mode not supported; try using SKB mode\n"); + break; + default: + break; + } + } + return err; +} + +const struct btf *xdp_program__btf(struct xdp_program *xdp_prog) +{ + if (!xdp_prog) + return libxdp_err_ptr(0, true); + + return xdp_prog->btf; +} + +enum xdp_attach_mode +xdp_program__is_attached(const struct xdp_program *xdp_prog, int ifindex) +{ + struct xdp_program *prog = NULL; + struct xdp_multiprog *mp; + enum xdp_attach_mode ret = XDP_MODE_UNSPEC; + + if (!xdp_prog || !xdp_prog->prog_id) + return ret; + + mp = xdp_multiprog__get_from_ifindex(ifindex); + if (IS_ERR_OR_NULL(mp)) 
+ return ret; + + prog = xdp_multiprog__hw_prog(mp); + if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) { + ret = XDP_MODE_HW; + goto out; + } + + if (xdp_multiprog__is_legacy(mp)) { + prog = xdp_multiprog__main_prog(mp); + if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) + ret = xdp_multiprog__attach_mode(mp); + goto out; + } + + while ((prog = xdp_multiprog__next_prog(prog, mp))) { + if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) { + ret = xdp_multiprog__attach_mode(mp); + break; + } + } + +out: + xdp_multiprog__close(mp); + return ret; +} + +int xdp_program__set_chain_call_enabled(struct xdp_program *prog, + unsigned int action, bool enabled) +{ + if (IS_ERR_OR_NULL(prog) || prog->prog_fd >= 0 || action >= XDP_DISPATCHER_RETVAL) + return libxdp_err(-EINVAL); + + if (enabled) + prog->chain_call_actions |= (1U << action); + else + prog->chain_call_actions &= ~(1U << action); + + return 0; +} + +bool xdp_program__chain_call_enabled(const struct xdp_program *prog, + enum xdp_action action) +{ + if (IS_ERR_OR_NULL(prog) || action >= XDP_DISPATCHER_RETVAL) + return false; + + return !!(prog->chain_call_actions & (1U << action)); +} + +unsigned int xdp_program__run_prio(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return XDP_DEFAULT_RUN_PRIO; + + return prog->run_prio; +} + +int xdp_program__set_run_prio(struct xdp_program *prog, unsigned int run_prio) +{ + if (IS_ERR_OR_NULL(prog) || prog->prog_fd >= 0) + return libxdp_err(-EINVAL); + + prog->run_prio = run_prio; + return 0; +} + +bool xdp_program__xdp_frags_support(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return false; + + /* Until we load the program we just check the bpf_program__flags() to + * ensure any changes made to those are honoured on the libxdp side. 
For + * loaded programs we keep our own state variable which is populated + * either by copying over the program flags in xdp_program__load(), or + * by loading the state from the dispatcher state variables if + * instantiating the object from the kernel. + */ + if (!prog->bpf_prog || prog->prog_fd >= 0) + return prog->is_frags; + + return !!(bpf_program__flags(prog->bpf_prog) & BPF_F_XDP_HAS_FRAGS); +} + +int xdp_program__set_xdp_frags_support(struct xdp_program *prog, bool frags) +{ + __u32 prog_flags; + int ret; + + if (IS_ERR_OR_NULL(prog) || !prog->bpf_prog || prog->prog_fd >= 0) + return libxdp_err(-EINVAL); + + prog_flags = bpf_program__flags(prog->bpf_prog); + + if (frags) + prog_flags |= BPF_F_XDP_HAS_FRAGS; + else + prog_flags &= ~BPF_F_XDP_HAS_FRAGS; + + ret = bpf_program__set_flags(prog->bpf_prog, prog_flags); + if (!ret) + prog->is_frags = frags; + + return ret; +} + +const char *xdp_program__name(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return libxdp_err_ptr(0, true); + + return prog->prog_name; +} + +struct bpf_object *xdp_program__bpf_obj(struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return libxdp_err_ptr(0, true); + + return prog->bpf_obj; +} + +const unsigned char *xdp_program__tag(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return libxdp_err_ptr(0, true); + + return prog->prog_tag; +} + +uint32_t xdp_program__id(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return 0; + + return prog->prog_id; +} + +int xdp_program__fd(const struct xdp_program *prog) +{ + if (IS_ERR_OR_NULL(prog)) + return errno = ENOENT, -1; + + return prog->prog_fd; +} + +int xdp_program__print_chain_call_actions(const struct xdp_program *prog, + char *buf, size_t buf_len) +{ + bool first = true; + char *pos = buf; + int i, len = 0; + + if (IS_ERR_OR_NULL(prog) || !buf || !buf_len) + return libxdp_err(-EINVAL); + + for (i = 0; i <= XDP_REDIRECT; i++) { + if (xdp_program__chain_call_enabled(prog, i)) { + 
if (!first) { + if (!buf_len) + goto err_len; + *pos++ = ','; + buf_len--; + } else { + first = false; + } + len = snprintf(pos, buf_len, "%s", xdp_action_names[i]); + if (len < 0 || (size_t)len >= buf_len) + goto err_len; + pos += len; + buf_len -= len; + } + } + return 0; +err_len: + *pos = '\0'; + return libxdp_err(-ENOSPC); +} + +static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, + __u32 id, __u32 *res_id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + + if (res_id) + *res_id = id; + + while (btf_is_mod(t) || btf_is_typedef(t)) { + if (res_id) + *res_id = t->type; + t = btf__type_by_id(btf, t->type); + } + + return t; +} + +static bool get_field_int(const struct btf *btf, + const char *t_name, + const struct btf_type *t, + __u32 *res) +{ + const struct btf_array *arr_info; + const struct btf_type *arr_t; + + if (!btf_is_ptr(t)) { + pr_warn("attr '%s': expected PTR, got %u.\n", + t_name, btf_kind(t)); + return false; + } + + arr_t = btf__type_by_id(btf, t->type); + if (!arr_t) { + pr_warn("attr '%s': type [%u] not found.\n", + t_name, t->type); + return false; + } + if (!btf_is_array(arr_t)) { + pr_warn("attr '%s': expected ARRAY, got %u.\n", + t_name, btf_kind(arr_t)); + return false; + } + arr_info = btf_array(arr_t); + *res = arr_info->nelems; + return true; +} + +static bool get_xdp_action(const char *act_name, unsigned int *act) +{ + const char **name = xdp_action_names; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xdp_action_names); i++, name++) { + if (!strcmp(act_name, *name)) { + *act = i; + return true; + } + } + return false; +} + +/* + * Find BTF func definition for func_name, which may be a truncated prefix of + * the real function name. + * Return NULL on no, or ambiguous, match. 
+ */ +static const struct btf_type *btf_get_function(const struct btf *btf, + const char *func_name) +{ + const struct btf_type *t, *match; + size_t len, matches = 0; + const char *name; + int nr_types, i; + + if (!btf) { + pr_debug("No BTF found for program\n"); + return NULL; + } + + len = strlen(func_name); + + nr_types = btf__type_cnt(btf); + for (i = 1; i < nr_types; i++) { + t = btf__type_by_id(btf, i); + if (!btf_is_func(t)) + continue; + + name = btf__name_by_offset(btf, t->name_off); + if (!strncmp(name, func_name, len)) { + pr_debug("Found func %s matching %s\n", + name, func_name); + + if (strlen(name) == len) + return t; /* exact match */ + + /* prefix, may not be unique */ + matches++; + match = t; + } + } + + if (matches == 1) /* unique match */ + return match; + + pr_debug("Function '%s' not found or ambiguous (%zu matches).\n", + func_name, matches); + return NULL; +} + +static const struct btf_type *btf_get_datasec(const struct btf *btf, + const char *sec_name) +{ + const struct btf_type *t; + int nr_types, i; + const char *name; + + if (!btf) { + pr_debug("No BTF found for program\n"); + return NULL; + } + + nr_types = btf__type_cnt(btf); + for (i = 1; i < nr_types; i++) { + t = btf__type_by_id(btf, i); + if (!btf_is_datasec(t)) + continue; + name = btf__name_by_offset(btf, t->name_off); + if (strcmp(name, sec_name) == 0) + return t; + } + + pr_debug("DATASEC '%s' not found.\n", sec_name); + return NULL; +} + +static const struct btf_type *btf_get_section_var(const struct btf *btf, + const struct btf_type *sec, + const char *var_name, + __u16 kind) +{ + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const struct btf_type *var, *def; + const char *name; + int vlen, i; + + vlen = btf_vlen(sec); + vi = btf_var_secinfos(sec); + for (i = 0; i < vlen; i++, vi++) { + var = btf__type_by_id(btf, vi->type); + var_extra = btf_var(var); + name = btf__name_by_offset(btf, var->name_off); + + if (strcmp(name, var_name)) + continue; + + if 
(!btf_is_var(var)) { + pr_warn("struct '%s': unexpected var kind %u.\n", + name, btf_kind(var)); + return ERR_PTR(-EINVAL); + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && + var_extra->linkage != BTF_VAR_STATIC) { + pr_warn("struct '%s': unsupported var linkage %u.\n", + name, var_extra->linkage); + return ERR_PTR(-EOPNOTSUPP); + } + + def = skip_mods_and_typedefs(btf, var->type, NULL); + if (btf_kind(def) != kind) { + pr_warn("var '%s': unexpected def kind %u.\n", + name, btf_kind(def)); + return ERR_PTR(-EINVAL); + } + return def; + } + return ERR_PTR(-ENOENT); +} + +/** + * This function parses the run config information attached to an XDP program. + * + * This information is specified using BTF, in a format similar to how + * BTF-defined maps are done. The definition looks like this: + * + * struct { + * __uint(priority, 10); + * __uint(XDP_PASS, 1); + * } XDP_RUN_CONFIG(FUNCNAME); + * + * The priority is simply an integer that will be used to sort programs as they + * are attached on the interface (see cmp_xdp_programs() for full sort order). + * In addition to the priority, the run config can define an integer value for + * each XDP action. A non-zero value means that execution will continue to the + * next loaded program if the current program returns that action. I.e., in the + * above example, any return value other than XDP_PASS will cause the dispatcher + * to exit with that return code, whereas XDP_PASS means execution will + * continue. + * + * Since this information becomes part of the object file BTF info, it will + * survive loading into the kernel, and so it can be retrieved for + * already-loaded programs as well. 
+ */ +static int xdp_program__parse_btf(struct xdp_program *xdp_prog, + const struct btf *btf) +{ + const struct btf_type *def, *sec; + const struct btf_member *m; + char struct_name[100]; + int err, i, mlen; + + if (!btf) + btf = xdp_program__btf(xdp_prog); + + /* If the program name is the maximum allowed object name in the kernel, + * it may have been truncated, in which case we try to expand it by + * looking for a match in the BTF data. + */ + if (strlen(xdp_prog->prog_name) >= BPF_OBJ_NAME_LEN - 1) { + const struct btf_type *func; + char *name; + + func = btf_get_function(btf, xdp_prog->prog_name); + if (func) { + name = strdup(btf__name_by_offset(btf, func->name_off)); + if (!name) + return -ENOMEM; + free(xdp_prog->prog_name); + xdp_prog->prog_name = name; + } + } + + err = try_snprintf(struct_name, sizeof(struct_name), "_%s", + xdp_program__name(xdp_prog)); + if (err) + return err; + + sec = btf_get_datasec(btf, XDP_RUN_CONFIG_SEC); + if (!sec) + return -ENOENT; + + def = btf_get_section_var(btf, sec, struct_name, BTF_KIND_STRUCT); + if (IS_ERR(def)) { + pr_debug("Couldn't find run order struct %s\n", struct_name); + return PTR_ERR(def); + } + + mlen = btf_vlen(def); + m = btf_members(def); + for (i = 0; i < mlen; i++, m++) { + const char *mname = btf__name_by_offset(btf, m->name_off); + const struct btf_type *m_t; + unsigned int val, act; + + if (!mname) { + pr_warn("struct '%s': invalid field #%d.\n", struct_name, i); + return -EINVAL; + } + m_t = skip_mods_and_typedefs(btf, m->type, NULL); + + if (!strcmp(mname, "priority")) { + if (!get_field_int(btf, mname, m_t, &xdp_prog->run_prio)) + return -EINVAL; + continue; + } else if (get_xdp_action(mname, &act)) { + if (!get_field_int(btf, mname, m_t, &val)) + return -EINVAL; + xdp_program__set_chain_call_enabled(xdp_prog, act, val); + } else { + pr_warn("Invalid mname: %s\n", mname); + return -ENOTSUP; + } + } + return 0; +} + +static struct xdp_program *xdp_program__new(void) +{ + struct xdp_program 
*xdp_prog; + + xdp_prog = malloc(sizeof(*xdp_prog)); + if (!xdp_prog) + return ERR_PTR(-ENOMEM); + + memset(xdp_prog, 0, sizeof(*xdp_prog)); + + xdp_prog->prog_fd = -1; + xdp_prog->link_fd = -1; + xdp_prog->run_prio = XDP_DEFAULT_RUN_PRIO; + xdp_prog->chain_call_actions = XDP_DEFAULT_CHAIN_CALL_ACTIONS; + + return xdp_prog; +} + +void xdp_program__close(struct xdp_program *xdp_prog) +{ + if (!xdp_prog) + return; + + if (xdp_prog->link_fd >= 0) + close(xdp_prog->link_fd); + if (xdp_prog->prog_fd >= 0) + close(xdp_prog->prog_fd); + + free(xdp_prog->prog_name); + free(xdp_prog->attach_name); + + if (!xdp_prog->from_external_obj) { + if (xdp_prog->bpf_obj) + bpf_object__close(xdp_prog->bpf_obj); + else if (xdp_prog->btf) + btf__free(xdp_prog->btf); + } + + free(xdp_prog); +} + +static struct xdp_program *xdp_program__create_from_obj(struct bpf_object *obj, + const char *section_name, + const char *prog_name, + bool external) +{ + struct xdp_program *xdp_prog; + struct bpf_program *bpf_prog; + int err; + + if (!obj || (section_name && prog_name)) + return ERR_PTR(-EINVAL); + + if (section_name) + bpf_prog = bpf_program_by_section_name(obj, section_name); + else if (prog_name) + bpf_prog = bpf_object__find_program_by_name(obj, prog_name); + else + bpf_prog = bpf_object__next_program(obj, NULL); + + if (!bpf_prog) { + pr_warn("Couldn't find xdp program in bpf object%s%s\n", + section_name ? 
" section " : "", section_name ?: ""); + return ERR_PTR(-ENOENT); + } + + xdp_prog = xdp_program__new(); + if (IS_ERR(xdp_prog)) + return xdp_prog; + + xdp_prog->prog_name = strdup(bpf_program__name(bpf_prog)); + if (!xdp_prog->prog_name) { + err = -ENOMEM; + goto err; + } + + err = xdp_program__parse_btf(xdp_prog, bpf_object__btf(obj)); + if (err && err != -ENOENT) + goto err; + + xdp_prog->bpf_prog = bpf_prog; + xdp_prog->bpf_obj = obj; + xdp_prog->btf = bpf_object__btf(obj); + xdp_prog->from_external_obj = external; + + return xdp_prog; +err: + xdp_program__close(xdp_prog); + return ERR_PTR(err); +} + +struct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, + const char *section_name) +{ + struct xdp_program *prog; + + prog = xdp_program__create_from_obj(obj, section_name, NULL, true); + /* xdp_program__create_from_obj does not return NULL */ + if (!IS_ERR(prog)) + return prog; + return libxdp_err_ptr(PTR_ERR(prog), false); +} + +static struct bpf_object *open_bpf_obj(const char *filename, + struct bpf_object_open_opts *opts) +{ + struct bpf_object *obj; + int err; + + obj = bpf_object__open_file(filename, opts); + err = libbpf_get_error(obj); + if (err) { + if (err == -ENOENT) + pr_debug( + "Couldn't load the eBPF program (libbpf said 'no such file').\n" + "Maybe the program was compiled with a too old " + "version of LLVM (need v9.0+)?\n"); + return ERR_PTR(err); + } + + return obj; +} + +static struct xdp_program *__xdp_program__open_file(const char *filename, + const char *section_name, + const char *prog_name, + struct bpf_object_open_opts *opts) +{ + struct xdp_program *xdp_prog; + struct bpf_object *obj; + int err; + + if (!filename) + return ERR_PTR(-EINVAL); + + obj = open_bpf_obj(filename, opts); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + return ERR_PTR(err); + } + + xdp_prog = xdp_program__create_from_obj(obj, section_name, prog_name, false); + if (IS_ERR(xdp_prog)) + bpf_object__close(obj); + + return xdp_prog; +} + +struct 
xdp_program *xdp_program__open_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts) +{ + struct xdp_program *prog; + + prog = __xdp_program__open_file(filename, section_name, NULL, opts); + /* __xdp_program__open_file does not return NULL */ + if (!IS_ERR(prog)) + return prog; + return libxdp_err_ptr(PTR_ERR(prog), false); +} + +static bool try_bpf_file(char *buf, size_t buf_size, char *path, + const char *progname) +{ + struct stat sb = {}; + + if (try_snprintf(buf, buf_size, "%s/%s", path, progname)) + return false; + + pr_debug("Looking for '%s'\n", buf); + if (stat(buf, &sb)) + return false; + + return true; +} + +static int find_bpf_file(char *buf, size_t buf_size, const char *progname) +{ + static char *bpf_obj_paths[] = { +#ifdef DEBUG + ".", +#endif + BPF_OBJECT_PATH, + NULL + }; + char *path, **p; + + path = secure_getenv(XDP_OBJECT_ENVVAR); + if (path && try_bpf_file(buf, buf_size, path, progname)) { + return 0; + } else if (!path) { + for (p = bpf_obj_paths; *p; p++) + if (try_bpf_file(buf, buf_size, *p, progname)) + return 0; + } + + pr_warn("Couldn't find a BPF file with name %s\n", progname); + return -ENOENT; +} + +static struct xdp_program *__xdp_program__find_file(const char *filename, + const char *section_name, + const char *prog_name, + struct bpf_object_open_opts *opts) +{ + struct xdp_program *prog; + char buf[PATH_MAX]; + int err; + + prog = xdp_program__find_embedded(filename, section_name, prog_name, opts); + if (prog) + return prog; + + err = find_bpf_file(buf, sizeof(buf), filename); + if (err) + return ERR_PTR(err); + + pr_debug("Loading XDP program from '%s' section '%s'\n", buf, + section_name ?: (prog_name ?: "(unknown)")); + return __xdp_program__open_file(buf, section_name, prog_name, opts); +} + +struct xdp_program *xdp_program__find_file(const char *filename, + const char *section_name, + struct bpf_object_open_opts *opts) +{ + struct xdp_program *prog; + + prog = 
__xdp_program__find_file(filename, section_name, NULL, opts); + /* __xdp_program__find_file does not return NULL */ + if (!IS_ERR(prog)) + return prog; + return libxdp_err_ptr(PTR_ERR(prog), false); +} + +static int xdp_program__fill_from_fd(struct xdp_program *xdp_prog, int fd) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + struct btf *btf = NULL; + int err = 0, prog_fd; + + if (!xdp_prog) + return -EINVAL; + + /* Duplicate the descriptor, as we take ownership of the fd below */ + prog_fd = fcntl(fd, F_DUPFD_CLOEXEC, MIN_FD); + if (prog_fd < 0) { + err = -errno; + pr_debug("Error on fcntl: %s", strerror(-err)); + return err; + } + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("couldn't get program info: %s", strerror(-err)); + goto err; + } + + if (!xdp_prog->prog_name) { + xdp_prog->prog_name = strdup(info.name); + if (!xdp_prog->prog_name) { + err = -ENOMEM; + pr_warn("failed to strdup program title"); + goto err; + } + } + + if (info.btf_id && !xdp_prog->btf) { + btf = btf__load_from_kernel_by_id(info.btf_id); + if (!btf) { + pr_warn("Couldn't get BTF for ID %ul\n", info.btf_id); + goto err; + } + xdp_prog->btf = btf; + } + + pr_debug("Duplicated fd %d to %d for prog %s\n", fd, prog_fd, xdp_prog->prog_name); + memcpy(xdp_prog->prog_tag, info.tag, BPF_TAG_SIZE); + xdp_prog->load_time = info.load_time; + xdp_prog->prog_fd = prog_fd; + xdp_prog->prog_id = info.id; + xdp_prog->prog_type = info.type; + + return 0; +err: + close(prog_fd); + btf__free(btf); + return err; +} + +struct xdp_program *xdp_program__from_fd(int fd) +{ + struct xdp_program *xdp_prog = NULL; + int err; + + xdp_prog = xdp_program__new(); + if (IS_ERR(xdp_prog)) + return libxdp_err_ptr(PTR_ERR(xdp_prog), false); + + err = xdp_program__fill_from_fd(xdp_prog, fd); + if (err) + goto err; + + err = xdp_program__parse_btf(xdp_prog, NULL); + if (err && err != -ENOENT) + goto err; + + return xdp_prog; +err: + 
xdp_program__close(xdp_prog); + return libxdp_err_ptr(err, false); +} + +struct xdp_program *xdp_program__from_id(__u32 id) +{ + struct xdp_program *prog; + int fd, err; + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + err = -errno; + pr_warn("couldn't get program fd: %s", strerror(-err)); + return libxdp_err_ptr(err, false); + } + + prog = xdp_program__from_fd(fd); + if (IS_ERR(prog)) { + err = errno; + close(fd); + errno = err; + } + return prog; +} + +struct xdp_program *xdp_program__from_pin(const char *pin_path) +{ + struct xdp_program *prog; + int fd, err; + + fd = bpf_obj_get(pin_path); + if (fd < 0) { + err = -errno; + pr_warn("couldn't get program fd from %s: %s", + pin_path, strerror(-err)); + return libxdp_err_ptr(err, false); + } + + prog = xdp_program__from_fd(fd); + if (IS_ERR(prog)) { + err = errno; + close(fd); + errno = err; + } + return prog; +} + +struct xdp_program *xdp_program__create(struct xdp_program_opts *opts) +{ + const char *pin_path, *prog_name, *find_filename, *open_filename; + struct bpf_object_open_opts *obj_opts; + struct xdp_program *prog; + struct bpf_object *obj; + __u32 id; + int fd; + + if (!opts || !OPTS_VALID(opts, xdp_program_opts)) + goto err; + + obj = OPTS_GET(opts, obj, NULL); + obj_opts = OPTS_GET(opts, opts, NULL); + prog_name = OPTS_GET(opts, prog_name, NULL); + find_filename = OPTS_GET(opts, find_filename, NULL); + open_filename = OPTS_GET(opts, open_filename, NULL); + pin_path = OPTS_GET(opts, pin_path, NULL); + id = OPTS_GET(opts, id, 0); + fd = OPTS_GET(opts, fd, 0); + + if (obj) { /* prog_name is optional */ + if (obj_opts || find_filename || open_filename || pin_path || id || fd) + goto err; + prog = xdp_program__create_from_obj(obj, NULL, prog_name, true); + } else if (find_filename) { /* prog_name, obj_opts is optional */ + if (obj || open_filename || pin_path || id || fd) + goto err; + prog = __xdp_program__find_file(find_filename, NULL, prog_name, obj_opts); + } else if (open_filename) { /* prog_name, 
obj_opts is optional */ + if (obj || find_filename || pin_path || id || fd) + goto err; + prog = __xdp_program__open_file(open_filename, NULL, prog_name, obj_opts); + } else if (pin_path) { + if (obj || obj_opts || prog_name || find_filename || open_filename || id || fd) + goto err; + prog = xdp_program__from_pin(pin_path); + } else if (id) { + if (obj || obj_opts || prog_name || find_filename || open_filename || pin_path || fd) + goto err; + prog = xdp_program__from_id(id); + } else if (fd) { + if (obj || obj_opts || prog_name || find_filename || open_filename || pin_path || id) + goto err; + prog = xdp_program__from_fd(fd); + } else { + goto err; + } + if (IS_ERR(prog)) + return libxdp_err_ptr(PTR_ERR(prog), true); + return prog; +err: + return libxdp_err_ptr(-EINVAL, true); +} + +static int cmp_xdp_programs(const void *_a, const void *_b) +{ + const struct xdp_program *a = *(struct xdp_program * const *)_a; + const struct xdp_program *b = *(struct xdp_program * const *)_b; + int cmp; + + if (a->run_prio != b->run_prio) + return a->run_prio < b->run_prio ? -1 : 1; + + cmp = strcmp(a->prog_name, b->prog_name); + if (cmp) + return cmp; + + /* Hopefully the two checks above will resolve most comparisons; in + * cases where they don't, hopefully the checks below will keep the + * order stable. + */ + + /* loaded before non-loaded */ + if (a->prog_fd >= 0 && b->prog_fd < 0) + return -1; + else if (a->prog_fd < 0 && b->prog_fd >= 0) + return 1; + + /* two unloaded programs - compare by size */ + if (a->bpf_prog && b->bpf_prog) { + size_t size_a, size_b; + + size_a = bpf_program__insn_cnt(a->bpf_prog); + size_b = bpf_program__insn_cnt(b->bpf_prog); + if (size_a != size_b) + return size_a < size_b ? -1 : 1; + } + + cmp = memcmp(a->prog_tag, b->prog_tag, BPF_TAG_SIZE); + if (cmp) + return cmp; + + /* at this point we are really grasping for straws */ + if (a->load_time != b->load_time) + return a->load_time < b->load_time ? 
-1 : 1; + + return 0; +} + +int xdp_program__pin(struct xdp_program *prog, const char *pin_path) +{ + if (IS_ERR_OR_NULL(prog) || prog->prog_fd < 0) + return libxdp_err(-EINVAL); + + return libxdp_err(bpf_program__pin(prog->bpf_prog, pin_path)); +} + +static int xdp_program__load(struct xdp_program *prog) +{ + bool is_loaded, autoload; + int err; + + if (IS_ERR_OR_NULL(prog)) + return -EINVAL; + + if (prog->prog_fd >= 0) + return -EEXIST; + + if (!prog->bpf_obj || !prog->bpf_prog) + return -EINVAL; + + /* bpf_program__set_autoload fails if the object is loaded, use this to + * detect if it is (since libbpf doesn't expose an API to discover + * this). This is necessary because of objects containing multiple + * programs: if a user creates xdp_program references to programs in + * such an object before loading it, they will get out of sync. + */ + autoload = bpf_program__autoload(prog->bpf_prog); + is_loaded = !!bpf_program__set_autoload(prog->bpf_prog, autoload); + if (is_loaded) { + pr_debug("XDP program %s is already loaded with fd %d\n", + xdp_program__name(prog), bpf_program__fd(prog->bpf_prog)); + + prog->is_frags = !!(bpf_program__flags(prog->bpf_prog) & BPF_F_XDP_HAS_FRAGS); + } else { + /* We got an explicit load request, make sure we actually load */ + if (!autoload) + bpf_program__set_autoload(prog->bpf_prog, true); + + /* Make sure we sync is_frags to internal state variable (in case it was + * changed on bpf_prog since creation), and unset flag if we're loading + * an EXT program (the dispatcher will have the flag set instead in this + * case) + */ + prog->is_frags = xdp_program__xdp_frags_support(prog); + + if (bpf_program__type(prog->bpf_prog) == BPF_PROG_TYPE_EXT) + bpf_program__set_flags(prog->bpf_prog, + bpf_program__flags(prog->bpf_prog) & ~BPF_F_XDP_HAS_FRAGS); + + err = bpf_object__load(prog->bpf_obj); + if (err) + return err; + + pr_debug("Loaded XDP program %s, got fd %d\n", + xdp_program__name(prog), bpf_program__fd(prog->bpf_prog)); + } + + 
/* xdp_program__fill_from_fd() clones the fd and takes ownership of the clone */ + return xdp_program__fill_from_fd(prog, bpf_program__fd(prog->bpf_prog)); +} + +struct xdp_program *xdp_program__clone(struct xdp_program *prog, unsigned int flags) +{ + if (IS_ERR_OR_NULL(prog) || flags || (prog->prog_fd < 0 && !prog->bpf_obj)) + return libxdp_err_ptr(-EINVAL, false); + + if (prog->prog_fd >= 0) + /* Clone a loaded program struct by creating a new object from the + program fd; xdp_program__fill_from_fd() already duplicates the fd + before filling in the object, so this creates a completely + independent xdp_program object. + */ + return xdp_program__from_fd(prog->prog_fd); + + return xdp_program__create_from_obj(prog->bpf_obj, NULL, + prog->prog_name, true); +} + + +static int xdp_program__attach_single(struct xdp_program *prog, int ifindex, + enum xdp_attach_mode mode) +{ + int err; + + if (prog->prog_fd < 0) { + bpf_program__set_type(prog->bpf_prog, BPF_PROG_TYPE_XDP); + err = xdp_program__load(prog); + if (err) + return err; + } + + if (prog->prog_fd < 0) + return -EINVAL; + + return xdp_attach_fd(xdp_program__fd(prog), -1, ifindex, mode); +} + + +static int xdp_multiprog__main_fd(struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return -EINVAL; + + if (!mp->main_prog) + return -ENOENT; + + return mp->main_prog->prog_fd; +} + +static __u32 xdp_multiprog__main_id(struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp) || !mp->main_prog) + return 0; + + return mp->main_prog->prog_id; +} + +static int xdp_multiprog__hw_fd(struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return -EINVAL; + + if (!mp->hw_prog) + return -ENOENT; + + return mp->hw_prog->prog_fd; +} + +static __u32 xdp_multiprog__hw_id(struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp) || !mp->hw_prog) + return 0; + + return mp->hw_prog->prog_id; +} + +static int xdp_program__attach_hw(struct xdp_program *prog, int ifindex) +{ + struct bpf_map *map; + + 
bpf_program__set_ifindex(prog->bpf_prog, ifindex); + bpf_object__for_each_map (map, prog->bpf_obj) { + bpf_map__set_ifindex(map, ifindex); + } + + return xdp_program__attach_single(prog, ifindex, XDP_MODE_HW); +} + +static int xdp_multiprog__detach_hw(struct xdp_multiprog *old_mp) +{ + int err = 0, hw_fd = -1, ifindex = -1; + + if (!old_mp) + return -EINVAL; + + ifindex = old_mp->ifindex; + + hw_fd = xdp_multiprog__hw_fd(old_mp); + if (hw_fd < 0) + return -EINVAL; + + err = xdp_attach_fd(-1, hw_fd, ifindex, XDP_MODE_HW); + if (err < 0) + return err; + + pr_debug("Detached hw program on ifindex '%d'\n", ifindex); + + return 0; +} + +int xdp_program__attach_multi(struct xdp_program **progs, size_t num_progs, + int ifindex, enum xdp_attach_mode mode, + unsigned int flags) +{ + struct xdp_multiprog *old_mp = NULL, *mp; + int err = 0, retry_counter = 0; + + if (!progs || !num_progs || flags) + return libxdp_err(-EINVAL); + +retry: + old_mp = xdp_multiprog__get_from_ifindex(ifindex); + if (IS_ERR_OR_NULL(old_mp)) + old_mp = NULL; + + if (mode == XDP_MODE_HW) { + bool old_hw_prog = xdp_multiprog__hw_prog(old_mp) != NULL; + + xdp_multiprog__close(old_mp); + + if (old_hw_prog) { + pr_warn("XDP program already loaded in HW mode on ifindex %d; " + "replacing HW mode programs not supported\n", ifindex); + return libxdp_err(-EEXIST); + } + + if (num_progs > 1) + return libxdp_err(-EINVAL); + + return libxdp_err(xdp_program__attach_hw(progs[0], ifindex)); + } + + if (num_progs == 1) { + char *envval; + + envval = secure_getenv(XDP_SKIP_ENVVAR); + if (envval && envval[0] == '1' && envval[1] == '\0') { + pr_debug("Skipping dispatcher due to environment setting\n"); + return libxdp_err(xdp_program__attach_single(progs[0], ifindex, mode)); + } + } + + mp = xdp_multiprog__generate(progs, num_progs, ifindex, old_mp, false); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + mp = NULL; + if (err == -EOPNOTSUPP) { + if (num_progs == 1) { + pr_info("Falling back to loading single prog " + 
"without dispatcher\n"); + return libxdp_err(xdp_program__attach_single(progs[0], ifindex, mode)); + } else { + pr_warn("Can't fall back to legacy load with %zu " + "programs\n%s\n", num_progs, dispatcher_feature_err); + } + } + goto out; + } + + err = xdp_multiprog__pin(mp); + if (err) { + pr_warn("Failed to pin program: %s\n", strerror(-err)); + goto out_close; + } + + err = xdp_multiprog__attach(old_mp, mp, mode); + if (err) { + pr_debug("Failed to attach dispatcher on ifindex %d: %s\n", + ifindex, strerror(-err)); + xdp_multiprog__unpin(mp); + + if (err == -EAGAIN) { + if (++retry_counter > MAX_RETRY) { + pr_warn("Retried more than %d times, giving up\n", + retry_counter); + err = -EBUSY; + goto out_close; + } + + pr_debug("Existing dispatcher replaced while building replacement, retrying.\n"); + xdp_multiprog__close(old_mp); + xdp_multiprog__close(mp); + usleep(1 << retry_counter); /* exponential backoff */ + goto retry; + } + goto out_close; + } + + if (old_mp) { + err = xdp_multiprog__unpin(old_mp); + if (err) { + pr_warn("Failed to unpin old dispatcher: %s\n", + strerror(-err)); + err = 0; + } + } + +out_close: + xdp_multiprog__close(mp); +out: + if (old_mp) + xdp_multiprog__close(old_mp); + return libxdp_err(err); +} + +int xdp_program__attach(struct xdp_program *prog, int ifindex, + enum xdp_attach_mode mode, + unsigned int flags) +{ + if (IS_ERR_OR_NULL(prog) || IS_ERR(prog)) + return libxdp_err(-EINVAL); + + return libxdp_err(xdp_program__attach_multi(&prog, 1, ifindex, mode, flags)); +} + +int xdp_program__detach_multi(struct xdp_program **progs, size_t num_progs, + int ifindex, enum xdp_attach_mode mode, + unsigned int flags) +{ + struct xdp_multiprog *new_mp, *mp; + int err = 0, retry_counter = 0; + size_t i; + + if (flags || !num_progs || !progs) + return libxdp_err(-EINVAL); + + retry: + new_mp = NULL; + mp = xdp_multiprog__get_from_ifindex(ifindex); + if (IS_ERR_OR_NULL(mp)) { + pr_warn("No XDP dispatcher found on ifindex %d\n", ifindex); + return 
libxdp_err(-ENOENT); + } + + if (mode == XDP_MODE_HW || xdp_multiprog__is_legacy(mp)) { + __u32 id = (mode == XDP_MODE_HW) ? + xdp_multiprog__hw_id(mp) : + xdp_multiprog__main_id(mp); + + if (num_progs > 1) { + pr_warn("Can only detach one program in legacy or HW mode\n"); + err = -EINVAL; + goto out; + } + + if (!xdp_program__id(progs[0])) { + pr_warn("Program 0 not loaded\n"); + err = -EINVAL; + goto out; + } + + if (id != xdp_program__id(progs[0])) { + pr_warn("Asked to unload prog %u but %u is loaded\n", + xdp_program__id(progs[0]), id); + err = -ENOENT; + goto out; + } + } + + if (mode == XDP_MODE_HW) { + err = xdp_multiprog__detach_hw(mp); + goto out; + } + + if (mode != XDP_MODE_UNSPEC && mp->attach_mode != mode) { + pr_warn("XDP dispatcher attached in mode %d, requested %d\n", + mp->attach_mode, mode); + err = -ENOENT; + goto out; + } + + if (xdp_multiprog__is_legacy(mp)) { + err = xdp_multiprog__attach(mp, NULL, mode); + goto out; + } + + /* fist pass - check progs and count number still loaded */ + for (i = 0; i < num_progs; i++) { + struct xdp_program *p = NULL; + bool found = false; + + if (!progs[i]->prog_id) { + pr_warn("Program %zu not loaded\n", i); + err = -EINVAL; + goto out; + } + + while ((p = xdp_multiprog__next_prog(p, mp))) { + if (progs[i]->prog_id == p->prog_id) + found = true; + } + + if (!found) { + pr_warn("Couldn't find program with id %d on ifindex %d\n", + progs[i]->prog_id, ifindex); + err = -ENOENT; + goto out; + } + } + + if (num_progs == mp->num_links) { + err = xdp_multiprog__attach(mp, NULL, mp->attach_mode); + if (err) + goto out; + + err = xdp_multiprog__unpin(mp); + if (err) + goto out; + } else { + new_mp = xdp_multiprog__generate(progs, num_progs, ifindex, mp, true); + if (IS_ERR(new_mp)) { + err = PTR_ERR(new_mp); + if (err == -EOPNOTSUPP) { + pr_warn("Asked to detach %zu progs, but %zu loaded on ifindex %d, " + "and partial detach is not supported by the kernel.\n", + num_progs, mp->num_links, ifindex); + } + goto out; + 
} + err = xdp_multiprog__pin(new_mp); + if (err) { + pr_warn("Failed to pin program: %s\n", strerror(-err)); + goto out; + } + + err = xdp_multiprog__attach(mp, new_mp, mode); + if (err) { + pr_debug("Failed to attach dispatcher on ifindex %d: %s\n", + ifindex, strerror(-err)); + xdp_multiprog__unpin(new_mp); + goto out; + } + + err = xdp_multiprog__unpin(mp); + if (err) { + pr_warn("Failed to unpin old dispatcher: %s\n", + strerror(-err)); + err = 0; + } + } + +out: + xdp_multiprog__close(mp); + xdp_multiprog__close(new_mp); + if (err == -EAGAIN) { + if (++retry_counter > MAX_RETRY) { + pr_warn("Retried more than %d times, giving up\n", + retry_counter); + return libxdp_err(-EBUSY); + } + + pr_debug("Existing dispatcher replaced while building replacement, retrying.\n"); + usleep(1 << retry_counter); /* exponential backoff */ + goto retry; + } + return libxdp_err(err); +} + +int xdp_program__detach(struct xdp_program *prog, int ifindex, + enum xdp_attach_mode mode, + unsigned int flags) +{ + if (IS_ERR_OR_NULL(prog) || IS_ERR(prog)) + return -EINVAL; + + return libxdp_err(xdp_program__detach_multi(&prog, 1, ifindex, mode, flags)); +} + +int xdp_program__test_run(struct xdp_program *prog, struct bpf_test_run_opts *opts, unsigned int flags) +{ + struct xdp_multiprog *mp = NULL; + int err, prog_fd; + + if (IS_ERR_OR_NULL(prog) || flags) + return libxdp_err(-EINVAL); + + if (prog->prog_fd < 0) { + err = xdp_program__load(prog); + if (err) + return libxdp_err(err); + } + + if (prog->prog_type == BPF_PROG_TYPE_EXT) { + mp = xdp_multiprog__generate(&prog, 1, 0, NULL, false); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + if (err == -EOPNOTSUPP) + pr_warn("Program was already attached to a dispatcher, " + "and kernel doesn't support multiple attachments\n"); + return libxdp_err(err); + } + + prog_fd = xdp_multiprog__main_fd(mp); + } else if (prog->prog_type != BPF_PROG_TYPE_XDP) { + pr_warn("Can't test_run non-XDP programs\n"); + return libxdp_err(-ENOEXEC); + } else { + 
prog_fd = prog->prog_fd; + } + + err = bpf_prog_test_run_opts(prog_fd, opts); + if (err) + err = -errno; + + if (mp) + xdp_multiprog__close(mp); + + return libxdp_err(err); +} + +static int xdp_multiprog__check_kernel_frags_support(struct xdp_multiprog *mp) +{ + struct xdp_program *test_prog; + int err; + + pr_debug("Checking for kernel frags support\n"); + test_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); + if (IS_ERR(test_prog)) { + err = PTR_ERR(test_prog); + pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); + return err; + } + + bpf_program__set_flags(test_prog->bpf_prog, BPF_F_XDP_HAS_FRAGS); + err = xdp_program__load(test_prog); + if (!err) { + pr_debug("Kernel supports XDP programs with frags\n"); + mp->kernel_frags_support = true; + } else { + pr_debug("Kernel DOES NOT support XDP programs with frags\n"); + } + xdp_program__close(test_prog); + + return 0; +} + +void xdp_multiprog__close(struct xdp_multiprog *mp) +{ + struct xdp_program *p, *next = NULL; + + if (IS_ERR_OR_NULL(mp)) + return; + + xdp_program__close(mp->main_prog); + for (p = mp->first_prog; p; p = next) { + next = p->next; + xdp_program__close(p); + } + xdp_program__close(mp->hw_prog); + + free(mp); +} + +static struct xdp_multiprog *xdp_multiprog__new(int ifindex) +{ + struct xdp_multiprog *mp; + + mp = malloc(sizeof *mp); + if (!mp) + return ERR_PTR(-ENOMEM); + memset(mp, 0, sizeof(*mp)); + mp->ifindex = ifindex; + mp->version = XDP_DISPATCHER_VERSION; + + return mp; +} + +static int xdp_multiprog__load(struct xdp_multiprog *mp) +{ + char buf[100]; + int err = 0; + + if (IS_ERR_OR_NULL(mp) || !mp->main_prog || mp->is_loaded || xdp_multiprog__is_legacy(mp)) + return -EINVAL; + + pr_debug("Loading multiprog dispatcher for %d programs %s frags support\n", + mp->config.num_progs_enabled, + mp->config.is_xdp_frags ? 
"with" : "without"); + + if (mp->config.is_xdp_frags) + xdp_program__set_xdp_frags_support(mp->main_prog, true); + + err = xdp_program__load(mp->main_prog); + if (err) { + pr_info("Failed to load dispatcher: %s\n", + libxdp_strerror_r(err, buf, sizeof(buf))); + err = -EOPNOTSUPP; + goto out; + } + mp->is_loaded = true; +out: + return err; +} + +int check_xdp_prog_version(const struct btf *btf, const char *name, __u32 *version) +{ + const struct btf_type *sec, *def; + + sec = btf_get_datasec(btf, XDP_METADATA_SECTION); + if (!sec) + return libxdp_err(-ENOENT); + + def = btf_get_section_var(btf, sec, name, BTF_KIND_PTR); + if (IS_ERR(def)) + return libxdp_err(PTR_ERR(def)); + + if (!get_field_int(btf, name, def, version)) + return libxdp_err(-ENOENT); + + return 0; +} + +static int check_dispatcher_version(struct xdp_multiprog *mp, + const char *prog_name, const struct btf *btf, + __u32 nr_maps, __u32 map_id) +{ + __u32 version = 0, map_key = 0, info_len = sizeof(struct bpf_map_info); + const char *name = "dispatcher_version"; + struct bpf_map_info map_info = {}; + int err, map_fd, i; + __u8 *buf = NULL; + + if (prog_name && strcmp(prog_name, "xdp_dispatcher")) { + pr_debug("XDP program with name '%s' is not a dispatcher\n", prog_name); + return -ENOENT; + } + + if (nr_maps != 1) { + pr_warn("Expected a single map for dispatcher, found %u\n", nr_maps); + return -ENOENT; + } + + map_fd = bpf_map_get_fd_by_id(map_id); + if (map_fd < 0) { + err = -errno; + pr_warn("Could not get config map fd for id %u: %s\n", map_id, strerror(-err)); + return err; + } + + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); + if (err) { + err = -errno; + pr_warn("Couldn't get map info: %s\n", strerror(-err)); + goto out; + } + + if (map_info.key_size != sizeof(map_key) || + map_info.value_size < 2 || + map_info.max_entries != 1 || + !(map_info.map_flags & BPF_F_RDONLY_PROG)) { + pr_warn("Map flags or key/value size mismatch\n"); + err = -EINVAL; + goto out; + } + + buf = 
malloc(map_info.value_size); + if (!buf) { + err = -ENOMEM; + goto out; + } + + err = bpf_map_lookup_elem(map_fd, &map_key, buf); + if (err) { + err = -errno; + pr_warn("Could not lookup map value: %s\n", strerror(-err)); + goto out; + } + + if (buf[0] == XDP_DISPATCHER_MAGIC) { + version = buf[1]; + } else { + err = check_xdp_prog_version(btf, name, &version); + if (err) + goto out; + } + + switch (version) { + case XDP_DISPATCHER_VERSION_V1: + { + struct xdp_dispatcher_config_v1 *config = (void *)buf; + + for (i = 0; i < MAX_DISPATCHER_ACTIONS; i++) { + mp->config.chain_call_actions[i] = config->chain_call_actions[i]; + mp->config.run_prios[i] = config->run_prios[i]; + } + mp->config.num_progs_enabled = config->num_progs_enabled; + break; + } + case XDP_DISPATCHER_VERSION: + if (map_info.value_size != sizeof(mp->config)) { + pr_warn("Dispatcher version matches, but map size %u != expected %zu\n", + map_info.value_size, sizeof(mp->config)); + err = -EINVAL; + goto out; + } + memcpy(&mp->config, buf, sizeof(mp->config)); + break; + + default: + pr_warn("XDP dispatcher version %u higher than supported %u\n", + version, XDP_DISPATCHER_VERSION); + err = -EOPNOTSUPP; + goto out; + } + pr_debug("Verified XDP dispatcher version %d <= %d\n", + version, XDP_DISPATCHER_VERSION); + + mp->version = version; + +out: + close(map_fd); + free(buf); + return err; +} + +static int xdp_multiprog__link_pinned_progs(struct xdp_multiprog *mp) +{ + char buf[PATH_MAX], pin_path[PATH_MAX]; + struct xdp_program *prog, *p = NULL; + const char *bpffs_dir; + int err, lock_fd, i; + struct stat sb = {}; + + if (IS_ERR_OR_NULL(mp) || mp->first_prog) + return -EINVAL; + + bpffs_dir = get_bpffs_dir(); + if (IS_ERR(bpffs_dir)) + return PTR_ERR(bpffs_dir); + + err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", + bpffs_dir, mp->ifindex, mp->main_prog->prog_id); + if (err) + return err; + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) + return lock_fd; + + pr_debug("Reading 
multiprog component programs from pinned directory\n"); + err = stat(pin_path, &sb); + if (err) { + err = -errno; + pr_debug("Couldn't stat pin_path '%s': %s\n", + pin_path, strerror(-err)); + goto out; + } + + for (i = 0; i < mp->config.num_progs_enabled; i++) { + + err = try_snprintf(buf, sizeof(buf), "%s/prog%d-prog", + pin_path, i); + if (err) + goto err; + + prog = xdp_program__from_pin(buf); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err; + } + err = try_snprintf(buf, sizeof(buf), "prog%d", i); + if (err) + goto err; + prog->attach_name = strdup(buf); + if (!prog->attach_name) { + err = -ENOMEM; + goto err; + } + + prog->chain_call_actions = (mp->config.chain_call_actions[i] & + ~(1U << XDP_DISPATCHER_RETVAL)); + prog->run_prio = mp->config.run_prios[i]; + prog->is_frags = !!(mp->config.program_flags[i] & BPF_F_XDP_HAS_FRAGS); + + if (!p) { + mp->first_prog = prog; + p = mp->first_prog; + } else { + p->next = prog; + p = prog; + } + mp->num_links++; + } + +out: + xdp_lock_release(lock_fd); + return err; +err: + prog = mp->first_prog; + while (prog) { + p = prog->next; + xdp_program__close(prog); + prog = p; + } + mp->first_prog = NULL; + goto out; +} + +static int xdp_multiprog__fill_from_fd(struct xdp_multiprog *mp, + int prog_fd, int hw_fd) +{ + struct bpf_prog_info info = {}; + __u32 info_len, map_id = 0; + struct xdp_program *prog; + struct btf *btf = NULL; + int err = 0; + + if (IS_ERR_OR_NULL(mp)) + return -EINVAL; + + if (prog_fd > 0) { + info.nr_map_ids = 1; + info.map_ids = (uintptr_t)&map_id; + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + pr_warn("couldn't get program info for fd: %d", prog_fd); + return -EINVAL; + } + + if (!info.btf_id) { + pr_debug("No BTF for prog ID %u\n", info.id); + mp->is_legacy = true; + goto legacy; + } + + btf = btf__load_from_kernel_by_id(info.btf_id); + if (!btf) { + pr_warn("Couldn't get BTF for ID %ul\n", info.btf_id); + goto out; + } + + err = 
check_dispatcher_version(mp, info.name, btf, + info.nr_map_ids, map_id); + if (err) { + if (err != -ENOENT) { + pr_warn("Dispatcher version check failed for ID %d\n", + info.id); + goto out; + } else { + /* no dispatcher, mark as legacy prog */ + mp->is_legacy = true; + err = 0; + goto legacy; + } + } + +legacy: + prog = xdp_program__from_fd(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + + mp->main_prog = prog; + + if (!xdp_multiprog__is_legacy(mp)) { + err = xdp_multiprog__link_pinned_progs(mp); + if (err) { + pr_warn("Unable to read pinned progs: %s\n", strerror(-err)); + mp->is_legacy = true; + err = 0; + } + } + + pr_debug("Found %s with id %d and %zu component progs\n", + xdp_multiprog__is_legacy(mp) ? "legacy program" : "multiprog", + mp->main_prog->prog_id, mp->num_links); + } + + if (hw_fd > 0) { + prog = xdp_program__from_fd(hw_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + + if (mp->first_prog == NULL) + mp->is_legacy = true; + + mp->hw_prog = prog; + + pr_debug("Found hw program with id %d\n", mp->hw_prog->prog_id); + } + + mp->is_loaded = true; + +out: + btf__free(btf); + return err; +} + +static struct xdp_multiprog *xdp_multiprog__from_fd(int fd, int hw_fd, + int ifindex) +{ + struct xdp_multiprog *mp = NULL; + int err; + + mp = xdp_multiprog__new(ifindex); + if (IS_ERR(mp)) + return mp; + + err = xdp_multiprog__fill_from_fd(mp, fd, hw_fd); + if (err) + goto err; + + return mp; +err: + xdp_multiprog__close(mp); + return ERR_PTR(err); +} + + +static struct xdp_multiprog *xdp_multiprog__from_id(__u32 id, __u32 hw_id, + int ifindex) +{ + struct xdp_multiprog *mp; + int hw_fd = 0; + int fd = 0; + int err; + + if (id) { + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + err = -errno; + pr_warn("couldn't get program fd: %s", strerror(-err)); + goto err; + } + } + + if (hw_id) { + hw_fd = bpf_prog_get_fd_by_id(hw_id); + if (hw_fd < 0) { + err = -errno; + pr_warn("couldn't get program fd: %s", strerror(-err)); + 
goto err; + } + } + + mp = xdp_multiprog__from_fd(fd, hw_fd, ifindex); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto err; + } + return mp; +err: + if (fd > 0) + close(fd); + if (hw_fd > 0) + close(hw_fd); + return ERR_PTR(err); +} + +static int xdp_get_ifindex_prog_id(int ifindex, __u32 *prog_id, + __u32 *hw_prog_id, enum xdp_attach_mode *mode) +{ + __u32 _prog_id, _drv_prog_id, _hw_prog_id, _skb_prog_id; + enum xdp_attach_mode _mode; + __u8 _attach_mode; + + if (!hw_prog_id) + hw_prog_id = &_prog_id; + if (!mode) + mode = &_mode; + int err; +#ifdef HAVE_LIBBPF_BPF_XDP_ATTACH + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + err = bpf_xdp_query(ifindex, 0, &opts); + if (err) + return err; + + _drv_prog_id = opts.drv_prog_id; + _skb_prog_id = opts.skb_prog_id; + _hw_prog_id = opts.hw_prog_id; + _attach_mode = opts.attach_mode; +#else + struct xdp_link_info xinfo = {}; + err = bpf_get_link_xdp_info(ifindex, &xinfo, sizeof(xinfo), 0); + if (err) + return err; + + _drv_prog_id = xinfo.drv_prog_id; + _skb_prog_id = xinfo.skb_prog_id; + _hw_prog_id = xinfo.hw_prog_id; + _attach_mode = xinfo.attach_mode; +#endif + switch (_attach_mode) { + case XDP_ATTACHED_SKB: + *prog_id = _skb_prog_id; + *mode = XDP_MODE_SKB; + break; + case XDP_ATTACHED_DRV: + *prog_id = _drv_prog_id; + *mode = XDP_MODE_NATIVE; + break; + case XDP_ATTACHED_MULTI: + if (_drv_prog_id) { + *prog_id = _drv_prog_id; + *mode = XDP_MODE_NATIVE; + } else if (_skb_prog_id) { + *prog_id = _skb_prog_id; + *mode = XDP_MODE_SKB; + } + *hw_prog_id = _hw_prog_id; + break; + case XDP_ATTACHED_HW: + *hw_prog_id = _hw_prog_id; + *mode = XDP_MODE_UNSPEC; + break; + case XDP_ATTACHED_NONE: + default: + *mode = XDP_MODE_UNSPEC; + break; + } + return 0; +} + +struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex) +{ + enum xdp_attach_mode mode = XDP_MODE_UNSPEC; + int err, retry_counter = 0; + struct xdp_multiprog *mp; + __u32 hw_prog_id = 0; + __u32 prog_id = 0; + +retry: + err = xdp_get_ifindex_prog_id(ifindex, 
&prog_id, &hw_prog_id, &mode); + if (err) + return libxdp_err_ptr(err, false); + + if (!prog_id && !hw_prog_id) + return libxdp_err_ptr(-ENOENT, false); + + mp = xdp_multiprog__from_id(prog_id, hw_prog_id, ifindex); + if (!IS_ERR_OR_NULL(mp)) + mp->attach_mode = mode; + else if (IS_ERR(mp)) { + err = PTR_ERR(mp); + if (err == -ENOENT) { + if (++retry_counter > MAX_RETRY) { + pr_warn("Retried more than %d times, giving up\n", + retry_counter); + err = -EBUSY; + } else { + pr_debug("Dispatcher disappeared before we could load it, retrying.\n"); + usleep(1 << retry_counter); /* exponential backoff */ + goto retry; + } + } + + mp = libxdp_err_ptr(err, false); + } else + mp = libxdp_err_ptr(0, true); + return mp; +} + +int libxdp_check_kern_compat(void) +{ + struct xdp_program *tgt_prog = NULL, *test_prog = NULL; + const char *bpffs_dir; + char buf[PATH_MAX]; + int lock_fd; + int err = 0; + + bpffs_dir = get_bpffs_dir(); + if (IS_ERR(bpffs_dir)) { + err = PTR_ERR(bpffs_dir); + pr_warn("Can't use dispatcher without a working bpffs\n"); + return -EOPNOTSUPP; + } + + if (kernel_compat > COMPAT_UNKNOWN) + goto skip; + + pr_debug("Checking dispatcher compatibility\n"); + + tgt_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); + return err; + } + + test_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); + if (IS_ERR(test_prog)) { + err = PTR_ERR(test_prog); + pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); + return err; + } + + err = xdp_program__load(tgt_prog); + if (err) { + pr_debug("Couldn't load XDP program: %s\n", strerror(-err)); + goto out; + } + + err = bpf_program__set_attach_target(test_prog->bpf_prog, + tgt_prog->prog_fd, + "xdp_pass"); + if (err) { + pr_debug("Failed to set attach target: %s\n", strerror(-err)); + goto out; + } + + bpf_program__set_type(test_prog->bpf_prog, BPF_PROG_TYPE_EXT); + 
bpf_program__set_expected_attach_type(test_prog->bpf_prog, 0); + err = xdp_program__load(test_prog); + if (err) { + char buf[100] = {}; + libxdp_strerror(err, buf, sizeof(buf)); + pr_debug("Failed to load program %s: %s\n", + xdp_program__name(test_prog), buf); + goto out; + } + + test_prog->link_fd = bpf_raw_tracepoint_open(NULL, test_prog->prog_fd); + if (test_prog->link_fd < 0) { + err = -errno; + pr_debug("Failed to attach test program to dispatcher: %s\n", + strerror(-err)); + goto out; + } + + err = try_snprintf(buf, sizeof(buf), "%s/prog-test-link-%i-%i", + bpffs_dir, IFINDEX_LO, test_prog->prog_id); + if (err) + goto out; + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) { + err = lock_fd; + goto out; + } + + err = bpf_obj_pin(test_prog->link_fd, buf); + if (err) { + err = -errno; + pr_warn("Couldn't pin link FD at %s: %s\n", buf, strerror(-err)); + goto out_locked; + } + err = unlink(buf); + if (err) { + err = -errno; + pr_warn("Couldn't unlink file %s: %s\n", buf, strerror(-err)); + goto out_locked; + } + + kernel_compat = COMPAT_SUPPORTED; +out_locked: + xdp_lock_release(lock_fd); +out: + xdp_program__close(test_prog); + xdp_program__close(tgt_prog); + if (err) { + pr_info("Compatibility check for dispatcher program failed: %s\n", + strerror(-err)); + kernel_compat = COMPAT_UNSUPPORTED; + } +skip: + return kernel_compat == COMPAT_SUPPORTED ? 
0 : -EOPNOTSUPP; +} + +static int find_prog_btf_id(const char *name, __u32 attach_prog_fd) +{ + struct bpf_prog_info info = {}; + __u32 info_size = sizeof(info); + int err = -EINVAL; + struct btf *btf; + + err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_size); + if (err) { + err = -errno; + pr_warn("failed get_prog_info for FD %d\n", attach_prog_fd); + return err; + } + if (!info.btf_id) { + pr_warn("The target program doesn't have BTF\n"); + return -EINVAL; + } + btf = btf__load_from_kernel_by_id(info.btf_id); + if (!btf) { + pr_warn("Failed to get BTF of the program\n"); + return -EINVAL; + } + err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + btf__free(btf); + if (err <= 0) + pr_warn("%s is not found in prog's BTF\n", name); + + return err; +} + +static int xdp_multiprog__link_prog(struct xdp_multiprog *mp, + struct xdp_program *prog) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct xdp_program *new_prog, *p; + bool was_loaded = false; + char buf[PATH_MAX]; + int err, lfd = -1; + char *attach_func; + __s32 btf_id; + + if (IS_ERR_OR_NULL(mp) || IS_ERR_OR_NULL(prog) || !mp->is_loaded || + mp->num_links >= mp->config.num_progs_enabled) + return -EINVAL; + + err = libxdp_check_kern_compat(); + if (err) + return err; + + if (!prog->btf) { + pr_warn("Program %s has no BTF information, so we can't load it as multiprog\n", + xdp_program__name(prog)); + return -EOPNOTSUPP; + } + + pr_debug("Linking prog %s as multiprog entry %zu\n", + xdp_program__name(prog), mp->num_links); + + err = try_snprintf(buf, sizeof(buf), "prog%zu", mp->num_links); + if (err) + goto err; + + + if (mp->config.num_progs_enabled == 1) + attach_func = "xdp_dispatcher"; + else + attach_func = buf; + + btf_id = find_prog_btf_id(attach_func, mp->main_prog->prog_fd); + if (btf_id <= 0) { + err = btf_id; + pr_debug("Couldn't find BTF ID for %s: %d\n", attach_func, err); + goto err; + } + + if (prog->prog_fd < 0) { + err = bpf_program__set_attach_target(prog->bpf_prog, + 
mp->main_prog->prog_fd, + attach_func); + if (err) { + pr_debug("Failed to set attach target: %s\n", strerror(-err)); + goto err; + } + + bpf_program__set_type(prog->bpf_prog, BPF_PROG_TYPE_EXT); + bpf_program__set_expected_attach_type(prog->bpf_prog, 0); + err = xdp_program__load(prog); + if (err) { + if (err == -E2BIG) { + pr_debug("Got 'argument list too long' error while " + "loading component program.\n"); + err = -EOPNOTSUPP; + } else { + char buf[100] = {}; + libxdp_strerror(err, buf, sizeof(buf)); + pr_debug("Failed to load program %s: %s\n", + xdp_program__name(prog), buf); + } + goto err; + } + + was_loaded = true; + } + + /* clone the xdp_program ref so we can keep it */ + new_prog = xdp_program__clone(prog, 0); + if (IS_ERR(new_prog)) { + err = PTR_ERR(new_prog); + pr_warn("Failed to clone xdp_program: %s\n", strerror(-err)); + goto err; + } + + opts.target_btf_id = btf_id; + + /* The attach will disappear once this fd is closed */ + lfd = bpf_link_create(new_prog->prog_fd, mp->main_prog->prog_fd, 0, &opts); + if (lfd < 0) { + err = -errno; + if (err == -EINVAL) { + if (!was_loaded) { + pr_debug("Kernel doesn't support re-attaching " + "freplace programs.\n"); + err = -EOPNOTSUPP; + } else { + pr_debug("Got EINVAL, retrying " + "raw_tracepoint_open() without target\n"); + /* we just loaded the program, so should be able + * to attach the old way */ + lfd = bpf_raw_tracepoint_open(NULL, new_prog->prog_fd); + if (lfd < 0) + err = -errno; + else + goto attach_ok; + } + } + if (err == -EPERM) { + pr_debug("Got 'permission denied' error while " + "attaching program to dispatcher.\n%s\n", + dispatcher_feature_err); + err = -EOPNOTSUPP; + } else { + pr_warn("Failed to attach program %s to dispatcher: %s\n", + xdp_program__name(new_prog), strerror(-err)); + } + goto err_free; + } + +attach_ok: + new_prog->attach_name = strdup(buf); + if (!new_prog->attach_name) { + err = -ENOMEM; + goto err_free; + } + + pr_debug( + "Attached prog '%s' with priority %d in 
dispatcher entry '%s' with fd %d\n", + xdp_program__name(new_prog), xdp_program__run_prio(new_prog), + new_prog->attach_name, lfd); + new_prog->link_fd = lfd; + + if (!mp->first_prog) { + mp->first_prog = new_prog; + } else { + p = mp->first_prog; + while (p->next) + p = p->next; + p->next = new_prog; + } + + mp->num_links++; + return 0; + +err_free: + if (lfd >= 0) + close(lfd); + xdp_program__close(new_prog); +err: + return err; +} + +/* + * xdp_multiprog__generate - generate a new multiprog dispatcher + * + * This generates a new multiprog dispatcher for the programs in progs. If + * old_mp is set, the progs will either be added to or removed from the existing + * set of programs in the dispatcher represented by old_mp, depending on the + * value of remove_progs. If old_mp is not set, a new dispatcher will be created + * just holding the programs in progs. In both cases, the full set of programs + * will be sorted according to their run order (see cmp_xdp_programs). + * + * When called with remove_progs set, the caller is responsible for checking + * that all the programs in progs are actually present in old_mp. + */ +static struct xdp_multiprog *xdp_multiprog__generate(struct xdp_program **progs, + size_t num_progs, + int ifindex, + struct xdp_multiprog *old_mp, + bool remove_progs) +{ + size_t num_new_progs = old_mp ? old_mp->num_links : 0; + struct xdp_program **new_progs = NULL; + struct xdp_program *dispatcher; + struct xdp_multiprog *mp; + struct bpf_map *map; + size_t i; + int err; + + if (!progs || !num_progs || (!old_mp && remove_progs)) + return ERR_PTR(-EINVAL); + + num_new_progs += remove_progs ? 
-num_progs : num_progs; + + if (num_new_progs > MAX_DISPATCHER_ACTIONS) + return ERR_PTR(-E2BIG); + + pr_debug("Generating multi-prog dispatcher for %zu programs\n", + num_new_progs); + + mp = xdp_multiprog__new(ifindex); + if (IS_ERR(mp)) + return mp; + + err = xdp_multiprog__check_kernel_frags_support(mp); + if (err) + goto err; + + if (old_mp) { + struct xdp_program *prog; + size_t j; + + if (xdp_multiprog__is_legacy(old_mp)) { + pr_warn("Existing program is not using a dispatcher, can't replace; unload first\n"); + err = -EBUSY; + goto err; + } + + if (old_mp->version < mp->version) { + pr_warn("Existing dispatcher version %u is older than our version %u. " + "Refusing transparent upgrade, unload first\n", + old_mp->version, mp->version); + err = -EBUSY; + goto err; + } + + new_progs = calloc(num_new_progs, sizeof(*new_progs)); + if (!new_progs) { + err = -ENOMEM; + goto err; + } + + for (i = 0, prog = old_mp->first_prog; prog; prog = prog->next) { + if (remove_progs) { + /* remove_new means new_progs is an array of + * programs we should remove from old_mp instead + * of adding them. + */ + bool found = false; + + for (j = 0; j < num_progs; j++) + if (progs[j]->prog_id == prog->prog_id) + found = true; + if (found) + continue; + + /* Sanity check: caller should ensure all + * programs to remove actually exist; check here + * anyway to ensure we don't overrun the array + * if this is not done correctly. 
+ */ + if (i >= num_new_progs) { + pr_warn("Not all programs to remove were found\n"); + err = -EINVAL; + goto err; + } + } + new_progs[i++] = prog; + } + if (!remove_progs) + for (j = 0; i < num_new_progs; i++, j++) + new_progs[i] = progs[j]; + + } else { + new_progs = progs; + } + + if (num_new_progs > 1) + qsort(new_progs, num_new_progs, sizeof(*new_progs), cmp_xdp_programs); + + dispatcher = __xdp_program__find_file("xdp-dispatcher.o", + NULL, "xdp_dispatcher", NULL); + if (IS_ERR(dispatcher)) { + err = PTR_ERR(dispatcher); + pr_warn("Couldn't open BPF file 'xdp-dispatcher.o'\n"); + goto err; + } + + mp->main_prog = dispatcher; + + map = bpf_object__next_map(mp->main_prog->bpf_obj, NULL); + if (!map) { + pr_warn("Couldn't find rodata map in object file 'xdp-dispatcher.o'\n"); + err = -ENOENT; + goto err; + } + + mp->config.magic = XDP_DISPATCHER_MAGIC; + mp->config.dispatcher_version = mp->version; + mp->config.num_progs_enabled = num_new_progs; + mp->config.is_xdp_frags = mp->kernel_frags_support; + for (i = 0; i < num_new_progs; i++) { + mp->config.chain_call_actions[i] = + (new_progs[i]->chain_call_actions | + (1U << XDP_DISPATCHER_RETVAL)); + mp->config.run_prios[i] = new_progs[i]->run_prio; + + if (xdp_program__xdp_frags_support(new_progs[i])) + mp->config.program_flags[i] = BPF_F_XDP_HAS_FRAGS; + else + mp->config.is_xdp_frags = false; + } + + if (mp->kernel_frags_support) { + if (!mp->config.is_xdp_frags) + pr_debug("At least one attached program doesn't " + "support frags, disabling it for the " + "dispatcher\n"); + else + pr_debug("All attached programs support frags, " + "enabling it for the dispatcher\n"); + } + + err = bpf_map__set_initial_value(map, &mp->config, sizeof(mp->config)); + if (err) { + pr_warn("Failed to set rodata for object file 'xdp-dispatcher.o'\n"); + goto err; + } + + err = xdp_multiprog__load(mp); + if (err) + goto err; + + for (i = 0; i < num_new_progs; i++) { + err = xdp_multiprog__link_prog(mp, new_progs[i]); + if (err) + goto 
err; + } + + if (old_mp) + free(new_progs); + + return mp; + +err: + if (old_mp) + free(new_progs); + xdp_multiprog__close(mp); + return ERR_PTR(err); +} + +static int xdp_multiprog__pin(struct xdp_multiprog *mp) +{ + char pin_path[PATH_MAX], buf[PATH_MAX]; + struct xdp_program *prog; + const char *bpffs_dir; + int err = 0, lock_fd; + + if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) + return -EINVAL; + + bpffs_dir = get_bpffs_dir(); + if (IS_ERR(bpffs_dir)) + return PTR_ERR(bpffs_dir); + + err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", + bpffs_dir, mp->ifindex, mp->main_prog->prog_id); + if (err) + return err; + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) + return lock_fd; + + pr_debug("Pinning multiprog fd %d beneath %s\n", + mp->main_prog->prog_fd, pin_path); + + err = mkdir(pin_path, S_IRWXU); + if (err && errno != EEXIST) { + err = -errno; + goto out; + } + + for (prog = mp->first_prog; prog; prog = prog->next) { + if (prog->link_fd < 0) { + err = -EINVAL; + pr_warn("Prog %s not linked\n", prog->prog_name); + goto err_unpin; + } + + err = try_snprintf(buf, sizeof(buf), "%s/%s-link", + pin_path, prog->attach_name); + if (err) + goto err_unpin; + + err = bpf_obj_pin(prog->link_fd, buf); + if (err) { + err = -errno; + pr_warn("Couldn't pin link FD at %s: %s\n", buf, strerror(-err)); + goto err_unpin; + } + pr_debug("Pinned link for prog %s at %s\n", prog->prog_name, buf); + + err = try_snprintf(buf, sizeof(buf), "%s/%s-prog", + pin_path, prog->attach_name); + if (err) + goto err_unpin; + + err = bpf_obj_pin(prog->prog_fd, buf); + if (err) { + err = -errno; + pr_warn("Couldn't pin prog FD at %s: %s\n", buf, strerror(-err)); + goto err_unpin; + } + + pr_debug("Pinned prog %s at %s\n", prog->prog_name, buf); + } +out: + xdp_lock_release(lock_fd); + return err; + +err_unpin: + for (prog = mp->first_prog; prog; prog = prog->next) { + if (!try_snprintf(buf, sizeof(buf), "%s/%s-link", + pin_path, prog->attach_name)) + unlink(buf); + 
if (!try_snprintf(buf, sizeof(buf), "%s/%s-prog", + pin_path, prog->attach_name)) + unlink(buf); + } + rmdir(pin_path); + goto out; +} + +static int xdp_multiprog__unpin(struct xdp_multiprog *mp) +{ + char pin_path[PATH_MAX], buf[PATH_MAX]; + struct xdp_program *prog; + const char *bpffs_dir; + int err = 0, lock_fd; + + if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) + return -EINVAL; + + bpffs_dir = get_bpffs_dir(); + if (IS_ERR(bpffs_dir)) + return PTR_ERR(bpffs_dir); + + err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", + bpffs_dir, mp->ifindex, mp->main_prog->prog_id); + if (err) + return err; + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) + return lock_fd; + + pr_debug("Unpinning multiprog fd %d beneath %s\n", + mp->main_prog->prog_fd, pin_path); + + for (prog = mp->first_prog; prog; prog = prog->next) { + err = try_snprintf(buf, sizeof(buf), "%s/%s-link", + pin_path, prog->attach_name); + if (err) + goto out; + + err = unlink(buf); + if (err) { + err = -errno; + pr_warn("Couldn't unlink file %s: %s\n", + buf, strerror(-err)); + goto out; + } + pr_debug("Unpinned link for prog %s from %s\n", + prog->prog_name, buf); + + err = try_snprintf(buf, sizeof(buf), "%s/%s-prog", + pin_path, prog->attach_name); + if (err) + goto out; + + err = unlink(buf); + if (err) { + err = -errno; + pr_warn("Couldn't unlink file %s: %s\n", + buf, strerror(-err)); + goto out; + } + + pr_debug("Unpinned prog %s from %s\n", prog->prog_name, buf); + } + + err = rmdir(pin_path); + if (err) + err = -errno; + pr_debug("Removed pin directory %s\n", pin_path); +out: + xdp_lock_release(lock_fd); + return err; +} + +static int xdp_multiprog__attach(struct xdp_multiprog *old_mp, + struct xdp_multiprog *mp, + enum xdp_attach_mode mode) +{ + int err = 0, prog_fd = -1, old_fd = -1, ifindex = -1; + + if (IS_ERR_OR_NULL(mp) && !old_mp) + return -EINVAL; + + if (mode == XDP_MODE_HW) + return -EINVAL; + + if (mp) { + prog_fd = xdp_multiprog__main_fd(mp); + if (prog_fd < 
0) + return -EINVAL; + ifindex = mp->ifindex; + } + + if (old_mp) { + old_fd = xdp_multiprog__main_fd(old_mp); + if (old_fd < 0) + return -EINVAL; + if (ifindex > -1 && ifindex != old_mp->ifindex) + return -EINVAL; + ifindex = old_mp->ifindex; + } + + + err = xdp_attach_fd(prog_fd, old_fd, ifindex, mode); + if (err < 0) + goto err; + + if (mp) + pr_debug("Loaded %zu programs on ifindex %d%s\n", + mp->num_links, ifindex, + mode == XDP_MODE_SKB ? " in skb mode" : ""); + else + pr_debug("Detached %s on ifindex %d%s\n", + xdp_multiprog__is_legacy(old_mp) ? "program" : "multiprog", + ifindex, + mode == XDP_MODE_SKB ? " in skb mode" : ""); + + return 0; +err: + return err; +} + +int xdp_multiprog__detach(struct xdp_multiprog *mp) +{ + int err = 0; + + if (IS_ERR_OR_NULL(mp) || !mp->is_loaded) + return libxdp_err(-EINVAL); + + if (mp->hw_prog) { + err = xdp_multiprog__detach_hw(mp); + if (err) + return libxdp_err(err); + } + + if (mp->main_prog) { + err = xdp_multiprog__attach(mp, NULL, mp->attach_mode); + if (err) + return libxdp_err(err); + + if (!xdp_multiprog__is_legacy(mp)) + err = xdp_multiprog__unpin(mp); + } + return libxdp_err(err); +} + +struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, + const struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) + return libxdp_err_ptr(0, true); + + if (prog) + return prog->next; + + return mp->first_prog; +} + +struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return libxdp_err_ptr(0, true); + + return mp->hw_prog; +} + +enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return XDP_MODE_UNSPEC; + + return mp->attach_mode; +} + +struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return libxdp_err_ptr(0, true); + + return mp->main_prog; +} + +bool xdp_multiprog__is_legacy(const struct xdp_multiprog 
*mp) +{ + if (IS_ERR_OR_NULL(mp)) + return false; + + return mp->is_legacy; +} + +int xdp_multiprog__program_count(const struct xdp_multiprog *mp) +{ + if (IS_ERR_OR_NULL(mp)) + return libxdp_err(-EINVAL); + + return mp->num_links; +} + +bool xdp_multiprog__xdp_frags_support(const struct xdp_multiprog *mp) +{ + return !xdp_multiprog__is_legacy(mp) && mp->config.is_xdp_frags; +} + +static int remove_pin_dir(const char *subdir) +{ + char prog_path[PATH_MAX], pin_path[PATH_MAX]; + int err; + DIR *d; + + const char *dir = get_bpffs_dir(); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + err = try_snprintf(pin_path, sizeof(pin_path), "%s/%s", dir, subdir); + if (err) + return err; + + d = opendir(pin_path); + if (!d) { + err = -errno; + pr_warn("Failed to open pin directory: %s\n", strerror(-err)); + return err; + } + + for (struct dirent *dent = readdir(d); dent; dent = readdir(d)) { + /* skip . and .. */ + if (dent->d_type == DT_DIR) + continue; + + err = try_snprintf(prog_path, sizeof(prog_path), "%s/%s", + pin_path, dent->d_name); + if (err) + goto err; + + err = unlink(prog_path); + if (err) { + err = -errno; + pr_warn("Couldn't unlink file %s/%s: %s\n", subdir, + dent->d_name, strerror(-err)); + goto err; + } + } + err = rmdir(pin_path); + if (err) { + err = -errno; + pr_warn("Failed to remove pin directory %s: %s\n", pin_path, + strerror(-err)); + } +err: + closedir(d); + return err; +} + +int libxdp_clean_references(int ifindex) +{ + int err = 0, lock_fd, path_ifindex; + __u32 dir_prog_id, prog_id = 0; + DIR *d; + + const char *dir = get_bpffs_dir(); + if (IS_ERR(dir)) + return libxdp_err(PTR_ERR(dir)); + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) + return libxdp_err(lock_fd); + + d = opendir(dir); + if (!d) { + err = -errno; + pr_debug("Failed to open bpffs directory: %s\n", + strerror(-err)); + goto out; + } + + for (struct dirent *dent = readdir(d); dent; dent = readdir(d)) { + if (dent->d_type != DT_DIR) + continue; + + if (sscanf(dent->d_name, 
"dispatch-%d-%"PRIu32"", + &path_ifindex, &dir_prog_id) != 2) + continue; + + /* If ifindex is set, skip this dir if it doesn't match */ + if (ifindex && path_ifindex != ifindex) + continue; + + xdp_get_ifindex_prog_id(path_ifindex, &prog_id, NULL, NULL); + if (!prog_id || prog_id != dir_prog_id) { + pr_info("Prog id %"PRIu32" no longer attached on ifindex %d, removing pin directory %s\n", + dir_prog_id, path_ifindex, dent->d_name); + err = remove_pin_dir(dent->d_name); + if (err) + break; + } + } + closedir(d); +out: + xdp_lock_release(lock_fd); + return libxdp_err(err); +} diff --git a/lib/libxdp/libxdp.map b/lib/libxdp/libxdp.map new file mode 100644 index 0000000..9242794 --- /dev/null +++ b/lib/libxdp/libxdp.map @@ -0,0 +1,78 @@ +LIBXDP_1.0.0 { + global: + libxdp_get_error; + libxdp_set_print; + libxdp_strerror; + xdp_multiprog__attach_mode; + xdp_multiprog__close; + xdp_multiprog__detach; + xdp_multiprog__dispatcher; + xdp_multiprog__get_from_ifindex; + xdp_multiprog__is_legacy; + xdp_multiprog__next_prog; + xdp_multiprog__main_prog; + xdp_multiprog__hw_prog; + xdp_program__attach; + xdp_program__attach_multi; + xdp_program__bpf_obj; + xdp_program__btf; + xdp_program__chain_call_enabled; + xdp_program__close; + xdp_program__detach; + xdp_program__detach_multi; + xdp_program__find_file; + xdp_program__from_bpf_obj; + xdp_program__from_fd; + xdp_program__from_id; + xdp_program__from_pin; + xdp_program__fd; + xdp_program__id; + xdp_program__is_attached; + xdp_program__name; + xdp_program__open_file; + xdp_program__pin; + xdp_program__print_chain_call_actions; + xdp_program__run_prio; + xdp_program__set_chain_call_enabled; + xdp_program__set_run_prio; + xdp_program__tag; +}; + +LIBXDP_1.2.0 { + libxdp_clean_references; + xdp_multiprog__program_count; + xsk_setup_xdp_prog; + xsk_socket__create; + xsk_socket__create_shared; + xsk_socket__delete; + xsk_socket__fd; + xsk_socket__update_xskmap; + xsk_umem__create; + xsk_umem__delete; + xsk_umem__fd; + 
xsk_cons_nb_avail; + xsk_prod_nb_free; + xsk_ring_cons__cancel; + xsk_ring_cons__comp_addr; + xsk_ring_cons__peek; + xsk_ring_cons__release; + xsk_ring_cons__rx_desc; + xsk_ring_prod__fill_addr; + xsk_ring_prod__needs_wakeup; + xsk_ring_prod__reserve; + xsk_ring_prod__submit; + xsk_ring_prod__tx_desc; + xsk_umem__add_offset_to_addr; + xsk_umem__extract_addr; + xsk_umem__extract_offset; + xsk_umem__get_data; +} LIBXDP_1.0.0; + +LIBXDP_1.3.0 { + xdp_multiprog__xdp_frags_support; + xdp_program__clone; + xdp_program__create; + xdp_program__set_xdp_frags_support; + xdp_program__test_run; + xdp_program__xdp_frags_support; +} LIBXDP_1.2.0; diff --git a/lib/libxdp/libxdp.mk b/lib/libxdp/libxdp.mk new file mode 100644 index 0000000..18b60e5 --- /dev/null +++ b/lib/libxdp/libxdp.mk @@ -0,0 +1,3 @@ +LIBXDP_VERSION := $(shell sed -ne "/LIBXDP_[0-9\.]\+ {/ {s/LIBXDP_\([0-9\.]\+\) {/\1/;p}" $(LIB_DIR)/libxdp/libxdp.map | tail -n 1) +LIBXDP_MAJOR_VERSION := $(shell echo $(LIBXDP_VERSION) | sed 's/\..*//') + diff --git a/lib/libxdp/libxdp.pc.template b/lib/libxdp/libxdp.pc.template new file mode 100644 index 0000000..30b10d4 --- /dev/null +++ b/lib/libxdp/libxdp.pc.template @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +prefix=@PREFIX@ +libdir=@LIBDIR@ +includedir=${prefix}/include + +Name: libxdp +Description: XDP library +Version: @VERSION@ +Libs: -L${libdir} -lxdp +Requires.private: libbpf +Cflags: -I${includedir} diff --git a/lib/libxdp/libxdp_internal.h b/lib/libxdp/libxdp_internal.h new file mode 100644 index 0000000..605735c --- /dev/null +++ b/lib/libxdp/libxdp_internal.h @@ -0,0 +1,146 @@ +#ifndef __LIBXDP_LIBXDP_INTERNAL_H +#define __LIBXDP_LIBXDP_INTERNAL_H + +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <linux/err.h> +#include <xdp/libxdp.h> + +#define LIBXDP_HIDE_SYMBOL __attribute__((visibility("hidden"))) +#define __unused __attribute__((unused)) + +#define __printf(a, b) 
__attribute__((format(printf, a, b))) + +static inline int try_snprintf(char *buf, size_t buf_len, const char *format, ...) +{ + va_list args; + int len; + + va_start(args, format); + len = vsnprintf(buf, buf_len, format, args); + va_end(args); + + if (len < 0) + return -EINVAL; + else if ((size_t)len >= buf_len) + return -ENAMETOOLONG; + + return 0; +} + +LIBXDP_HIDE_SYMBOL __printf(2, 3) void libxdp_print(enum libxdp_print_level level, + const char *format, ...); +#define __pr(level, fmt, ...) \ + do { \ + libxdp_print(level, "libxdp: " fmt, ##__VA_ARGS__); \ + } while (0) + +#define pr_warn(fmt, ...) __pr(LIBXDP_WARN, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(LIBXDP_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(LIBXDP_DEBUG, fmt, ##__VA_ARGS__) + +LIBXDP_HIDE_SYMBOL int check_xdp_prog_version(const struct btf *btf, const char *name, + __u32 *version); + +LIBXDP_HIDE_SYMBOL int libxdp_check_kern_compat(void); + +#define min(x, y) ((x) < (y) ? x : y) +#define max(x, y) ((x) > (y) ? 
x : y) + +#ifndef offsetof +#define offsetof(type, member) ((size_t) & ((type *)0)->member) +#endif + +#ifndef offsetofend +#define offsetofend(TYPE, FIELD) (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) +#endif + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +/* OPTS macros, from libbpf_internal.h */ + +static inline bool libxdp_is_mem_zeroed(const char *obj, + size_t off_start, size_t off_end) +{ + const char *p; + + for (p = obj + off_start; p < obj + off_end; p++) { + if (*p) + return false; + } + return true; +} + +static inline bool libxdp_validate_opts(const char *opts, + size_t opts_sz, size_t user_sz, + const char *type_name) +{ + if (user_sz < sizeof(size_t)) { + pr_warn("%s size (%zu) is too small\n", type_name, user_sz); + return false; + } + if (!libxdp_is_mem_zeroed(opts, opts_sz, user_sz)) { + pr_warn("%s has non-zero extra bytes\n", type_name); + return false; + } + return true; +} + +#define OPTS_VALID(opts, type) \ + (!(opts) || libxdp_validate_opts((const char *)opts, \ + offsetofend(struct type, \ + type##__last_field), \ + (opts)->sz, #type)) +#define OPTS_HAS(opts, field) \ + ((opts) && opts->sz >= offsetofend(typeof(*(opts)), field)) +#define OPTS_GET(opts, field, fallback_value) \ + (OPTS_HAS(opts, field) ? 
(opts)->field : fallback_value) +#define OPTS_SET(opts, field, value) \ + do { \ + if (OPTS_HAS(opts, field)) \ + (opts)->field = value; \ + } while (0) + +#define OPTS_ZEROED(opts, last_nonzero_field) \ + (!(opts) || libxdp_is_mem_zeroed((const void *)opts, \ + offsetofend(typeof(*(opts)), \ + last_nonzero_field), \ + (opts)->sz)) + +/* handle direct returned errors */ +static inline int libxdp_err(int ret) +{ + if (ret < 0) + errno = -ret; + return ret; +} + +/* handle error for pointer-returning APIs, err is assumed to be < 0 always */ +static inline void *libxdp_err_ptr(int err, bool ret_null) +{ + /* set errno on error, this doesn't break anything */ + errno = -err; + + if (ret_null) + return NULL; + /* legacy: encode err as ptr */ + return ERR_PTR(err); +} + +LIBXDP_HIDE_SYMBOL int xdp_lock_acquire(void); +LIBXDP_HIDE_SYMBOL int xdp_lock_release(int lock_fd); +LIBXDP_HIDE_SYMBOL int xdp_attach_fd(int prog_fd, int old_fd, int ifindex, + enum xdp_attach_mode mode); + +#endif /* __LIBXDP_LIBXDP_INTERNAL_H */ diff --git a/lib/libxdp/protocol.org b/lib/libxdp/protocol.org new file mode 100644 index 0000000..2adaf6a --- /dev/null +++ b/lib/libxdp/protocol.org @@ -0,0 +1,473 @@ +#+OPTIONS: ^:nil + +* Protocol for atomic loading of multi-prog dispatchers + +With the support for the =freplace= program type, it is possible to load +multiple XDP programs on a single interface by building a /dispatcher/ program +which will run on the interface, and which will call the component XDP programs +as functions using the =freplace= type. + +For this to work in an interoperable way, applications need to agree on how to +attach their XDP programs using this mechanism. This document outlines the +protocol implemented by =libxdp=, serving as both documentation and a blueprint +for anyone else who wants to implement the same protocol and interoperate. 
+ +** Generating a dispatcher +The dispatcher is simply an XDP program that will call each of a number of stub +functions in turn, and depending on their return code either continue on to the +next function or return immediately. These stub functions are then replaced at +load time with the user XDP programs, using the =freplace= functionality. + +*** Dispatcher format +The dispatcher XDP program contains the main function containing the dispatcher +logic, 10 stub functions that can be replaced by component BPF programs, and a +configuration structure that is used by the dispatcher logic. + +In =libxdp=, this dispatcher is generated by [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/xdp-dispatcher.c.in][an M4 macro file]] which expands to +the following: + +#+begin_src C +#define XDP_METADATA_SECTION "xdp_metadata" +#define XDP_DISPATCHER_VERSION 2 +#define XDP_DISPATCHER_MAGIC 236 +#define XDP_DISPATCHER_RETVAL 31 +#define MAX_DISPATCHER_ACTIONS 10 + +struct xdp_dispatcher_config { + __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ + __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ + __u8 num_progs_enabled; /* Number of active program slots */ + __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ + __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; + __u32 run_prios[MAX_DISPATCHER_ACTIONS]; + __u32 program_flags[MAX_DISPATCHER_ACTIONS]; +}; + +/* While 'const volatile' sounds a little like an oxymoron, there's reason + * behind the madness: + * + * - const places the data in rodata, where libbpf will mark it as read-only and + * frozen on program load, letting the kernel do dead code elimination based + * on the values. + * + * - volatile prevents the compiler from optimising away the checks based on the + * compile-time value of the variables, which is important since we will be + * changing the values before loading the program into the kernel. 
+ */ +static volatile const struct xdp_dispatcher_config conf = {}; + +/* The volatile return value prevents the compiler from assuming it knows the + * return value and optimising based on that. + */ +__attribute__ ((noinline)) +int prog0(struct xdp_md *ctx) { + volatile int ret = XDP_DISPATCHER_RETVAL; + + if (!ctx) + return XDP_ABORTED; + return ret; +} +/* the above is repeated as prog1...prog9 */ + +SEC("xdp") +int xdp_dispatcher(struct xdp_md *ctx) +{ + __u8 num_progs_enabled = conf.num_progs_enabled; + int ret; + + if (num_progs_enabled < 1) + goto out; + ret = prog0(ctx); + if (!((1U << ret) & conf.chain_call_actions[0])) + return ret; + + /* the above is repeated for prog1...prog9 */ + +out: + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; +__uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION); +#+end_src + +The dispatcher program is pre-compiled and distributed with =libxdp=. Because +the configuration struct is marked as =const= in the source file, it will be put +into the =rodata=, which libbpf will turn into a read-only (frozen) map on load. +This allows the kernel verifier to perform dead code elimination based on the +values in the map. This is also the reason for the =num_progs_enabled= member of +the config struct: together with the checks in the main dispatcher function the +verifier will effectively remove all the stub function calls not being used, +without having to rely on dynamic compilation. + +When generating a dispatcher, this BPF object file is opened and the +configuration struct is populated before the object is loaded. As a forward +compatibility measure, =libxdp= will also check for the presence of the +=dispatcher_version= field in the =xdp_metadata= section (encoded like the +program metadata described in "Processing program metadata" below), and if it +doesn't match the expected version (currently version 2), will abort any action. 
+ + +*** Populating the dispatcher configuration map +On loading, the dispatcher configuration map is populated as follows: + +- The =magic= field is set to the =XDP_DISPATCHER_MAGIC= value (236). This field + is here to make it possible to check if a program is a dispatcher without + looking at the program BTF in the future. + +- The =dispatcher_version= field is set to the current dispatcher version (2). + This is redundant with the BTF-encoded version in the metadata field, but must + be checked so that the BTF metadata version can be removed in the future. See + the section on old dispatcher versions below. + +- The =num_progs_enabled= member is simply set to the number of active programs + that will be attached to this dispatcher. + +- The =is_xdp_frags= variable is set to 1 if the dispatcher is loaded with XDP frags + support (see section below), or 0 otherwise. + +The remaining fields contain per-component program metadata, which is read from +the component programs as explained in the "Processing program metadata" section +below. + +- The =chain_call_actions= array is populated with a bitmap signifying which XDP + actions (return codes) of each component program should be interpreted as a + signal to continue execution of the next XDP program. For instance, a packet + filtering program might designate that an =XDP_PASS= action should make + execution continue, while other return codes should immediately end the call + chain and return. The special =XDP_DISPATCHER_RETVAL= (which is set to 31 + corresponding to the topmost bit in the bitmap) is always included in each + program's =chain_call_actions=; this value is returned by the stub functions, + which ensures that should a component program become detached, processing + will always continue past the stub function. + +- The =run_prios= array contains the effective run priority of each component + program when it was installed.
This is also read as program metadata, but + because it can be overridden at load time, the effective value is stored in + the configuration array so it can be carried forward when the dispatcher is + replaced. Component programs are expected to be sorted in order of their run + priority (as explained below in "Loading and attaching component programs"). + +- The =program_flags= array is used to store the flags that an XDP program was loaded + with. This is populated with the value of the =BPF_F_XDP_HAS_FRAGS= flag if + the component program in this slot had that flag set (see the section on XDP + frags support below), and is 0 otherwise. + +**** Processing program metadata +As explained above, each component program must specify one or more chain call +actions and a run priority on attach. When loading a user program, =libxdp= will +attempt to read this metadata from the object file as explained in the +following; if no values are found in the object file, a default run priority of +50 will be applied, and =XDP_PASS= will be the only chain call action. + +The metadata is read from the object file by looking for BTF-encoded metadata in +the =.xdp_run_config= object section, encoded similarly to the BTF-defined maps +used by libbpf (in the =.maps= section). Here, =libxdp= will look for a struct +definition with the XDP program function name prefixed by an underscore (e.g., +if the main XDP function is called =xdp_main=, libxdp will look for a struct +definition called =_xdp_main=). In this struct, a member =priority= encodes the +run priority, and each XDP action can be set as a chain call action by setting a +struct member with the action name.
+ +The =xdp_helpers.h= header file included with XDP exposes helper macros that can +be used with the existing helpers in =bpf_helpers.h= (from libbpf), so a full +run configuration metadata section can be defined as follows: + +#+begin_src C +#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +struct { + __uint(priority, 10); + __uint(XDP_PASS, 1); + __uint(XDP_DROP, 1); +} XDP_RUN_CONFIG(my_xdp_func); +#+end_src + +This example sets priority 10 with chain call actions =XDP_PASS= and =XDP_DROP= +for the XDP program starting at =my_xdp_func()=. + +This turns into the following BTF information (as shown by =bpftool btf dump=): + +#+begin_src +[12] STRUCT '(anon)' size=24 vlen=3 + 'priority' type_id=13 bits_offset=0 + 'XDP_PASS' type_id=15 bits_offset=64 + 'XDP_DROP' type_id=15 bits_offset=128 +[13] PTR '(anon)' type_id=14 +[14] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=10 +[15] PTR '(anon)' type_id=16 +[16] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=1 +[17] VAR '_my_xdp_func' type_id=12, linkage=global-alloc +[18] DATASEC '.xdp_run_config' size=0 vlen=1 + type_id=17 offset=0 size=24 +#+end_src + +The parser will look for the =.xdp_run_config= DATASEC, then follow the types +recursively, extracting the field values from the =nr_elems= in the anonymous +arrays in type IDs 14 and 16. + +While =libxdp= will automatically load any metadata specified as above in the +program BTF, the application using =libxdp= can override these values at +runtime. These overridden values will be the ones used when determining program +order, and will be preserved in the dispatcher configuration map for subsequent +operations. + +*** Old versions of the XDP dispatcher +This document currently describes version 2 of the dispatcher and protocol. This +differs from version 1 in the following respects: + +- The dispatcher configuration map has gained the =magic= and + =dispatcher_version= fields for identifying the dispatcher and its version.
+ +- The protocol now supports propagating the value of the =BPF_F_XDP_HAS_FRAGS= + flag for supporting XDP frags programs for higher MTU. The dispatcher + configuration map has gained the =is_xdp_frags= and =program_flags= fields for + use with this feature. The protocol for propagating the frags flag is + described below, and an implementation of this protocol that recognises + version 2 of the dispatcher MUST implement this protocol. + +Older versions of libxdp will check the dispatcher version field of any +dispatcher loaded in the kernel, and refuse to operate on a dispatcher with a +higher version than the library version implements. This means that if a newer +dispatcher is loaded, old versions of the library will be locked out of +modifying that dispatcher. This is by design: old library versions don't +recognise the semantics of new features added in subsequent versions, and so +would introduce bugs if they attempted to operate on newer versions. + +Newer versions of libxdp will, however, recognise older dispatcher versions. If +a newer version of libxdp loads a new program and finds an old dispatcher +version already loaded on an interface, it will display the programs attached to +it, but will refuse to replace it with a newer version so as not to lock out the +application that loaded the program(s) already attached. Manually unloading the +loaded programs will be required to load a new dispatcher version on the +interface. + +*** Loading and attaching component programs +When loading one or more XDP programs onto an interface (assuming no existing +program is found on the interface; for adding programs, see below), =libxdp= +first prepares a dispatcher program with the right number of slots, by +populating the configuration struct as described above. Then, this dispatcher +program is loaded into the kernel, with the =BPF_F_XDP_HAS_FRAGS= flag set if +all component programs have that flag set (see the section on supporting XDP +frags below).
+ +Having loaded the dispatcher program, =libxdp= then loads each of the component +programs. To do this, first the list of component programs is sorted by their +run priority, forming the final run sequence. Should several programs have the +same run priority, ties are broken in the following arbitrary, but +deterministic, order (see =cmp_xdp_programs()= [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/libxdp.c][in libxdp.c]]): + +- By XDP function name (=bpf_program__name()= from libbpf) +- By sorting already-loaded programs before not-yet-loaded ones +- For unloaded programs, by program size +- For loaded programs, by BPF tag value (using =memcmp()=) +- By load time + +Before loading, each component program type is reset to =BPF_PROG_TYPE_EXT= with +an expected attach type of 0, and the =BPF_F_XDP_HAS_FRAGS= flag is unset (see the +section on supporting frags below). Then, the attachment target is set to the +dispatcher file descriptor and the BTF ID of the stub function to replace (i.e., +the first component program has =prog0()= as its target, and so on). Then the +program is loaded, at which point the kernel will verify the component program's +compatibility with the attach point. + +Having loaded the component program, it is attached to the dispatcher by way of +=bpf_link_create()=, specifying the same target file descriptor and BTF ID used +when loading the program. This will return a link fd, which will be pinned to +prevent the attachment from unravelling when the fd is closed (see "Locking and +pinning" below). + +*** Locking and pinning +To prevent the kernel from detaching any =freplace= program when its last file +descriptor is closed, the programs must be pinned in =bpffs=. This is done in +the =xdp= subdirectory of =bpffs=, which by default means =/sys/fs/bpf/xdp=. If +the =LIBXDP_BPFFS= environment variable is set, this will override the location +of the top-level =bpffs=, and the =xdp= subdirectory will be created beneath +this path.
+ +The pathnames generated for pinning are the following: + +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference +- etc, up to ten component programs + +This means that several pin operations have to be performed for each dispatcher +program. Semantically, these are all atomic, so to make sure every consumer of +the hierarchy of pinned files gets a consistent view, locking is needed. This is +implemented by opening the parent directory =/sys/fs/bpf/xdp= with the +=O_DIRECTORY= flag, and obtaining a lock on the resulting file descriptor using +=flock(lock_fd, LOCK_EX)=. + +When creating a new dispatcher program, it will first be fully populated, with +all component programs attached. Then, the programs will be linked in =bpffs= as +specified above, and once this succeeds, the program will be attached to the +interface. If attaching the program fails, the programs will be unpinned again, +and the error returned to the caller. This order ensures atomic attachment to +the interface, without any risk that component programs will be automatically +detached due to a badly timed application crash. + +When loading the initial dispatcher program, the =XDP_FLAGS_UPDATE_IF_NOEXIST= +flag is set to prevent accidentally overriding any concurrent modifications. If +this fails, the whole operation starts over, turning the load into a +modification as described below. 
+ +*** Supporting XDP programs with frags support (BPF_F_XDP_HAS_FRAGS flag) +Linux kernel 5.18 added support for a new API that allows XDP programs to access +packet data that spans more than a single page, allowing XDP programs to be +loaded on interfaces with bigger MTUs. Such packets will not have all their +packet data accessible by the traditional "direct packet access"; instead, only +the first fragment will be available this way, and the rest of the packet data +has to be accessed via the new =bpf_xdp_load_bytes()= helper. + +Existing XDP programs are written with the assumption that they can see the +whole packet data using direct packet access, which means they can subtly +malfunction if some of the packet data is suddenly invisible (for instance, +counting packet lengths is no longer accurate). Whether a given XDP program +supports the frags API or not is a semantic issue, and it's not possible for the +kernel to auto-detect this. For this reason, programs have to opt in to XDP +frags support at load time, by setting the =BPF_F_XDP_HAS_FRAGS= flag as they +are loaded into the kernel. Programs that are not loaded with this flag will be +rejected from attaching to network devices that use packet fragments (i.e., those +with a large MTU). + +This has implications for the XDP dispatcher, as its purpose is for multiple +programs to be loaded at the same time. Since the =BPF_F_XDP_HAS_FRAGS= flag cannot +be set for individual component programs, it has to be set for the dispatcher as +a whole. However, as described above, programs can subtly malfunction if they +are exposed to packets with fragments without being ready to handle them. This means +that it's only safe to set the =BPF_F_XDP_HAS_FRAGS= flag on the dispatcher itself if +*all* component programs have the flag set.
+ +To properly propagate the flags even when adding new programs to an existing +dispatcher, the dispatcher itself needs to keep track of which of its component +programs had the =BPF_F_XDP_HAS_FRAGS= flag set when they were added. The +dispatcher configuration map uses the =program_flags= array for this: for each +component program, this field is set to the value of the =BPF_F_XDP_HAS_FRAGS= +flag if that component program has the flag set, and to 0 otherwise. An +additional field, =is_xdp_frags=, is set if the dispatcher itself is loaded with +the frags flag set (which may not be the case if the kernel doesn't support the +flag). + +When generating a dispatcher for a set of programs, libxdp simply tracks if all +component programs support the =BPF_F_XDP_HAS_FRAGS= flag, and if they do, the +dispatcher is loaded with this flag set. If any program attached to the +dispatcher does not support the flag, the dispatcher is loaded without this flag +set (and the =is_xdp_frags= field in the dispatcher configuration is set +accordingly). If libxdp determines that the running kernel does not support the +=BPF_F_XDP_HAS_FRAGS= flag, the dispatcher is loaded without the flag regardless of +the flag values of the component programs. + +When adding a program to an existing dispatcher, this may result in a +"downgrade", i.e., loading a new dispatcher without the frags flag to replace an +existing dispatcher that does have the flag set. This will result in the +replacement dispatcher being rejected by the kernel at attach time, but only if +the interface being attached to actually requires the frags flag (i.e., if it +has a large MTU). If the attachment is rejected, the old dispatcher will stay in +place, leading to no loss of functionality. + +** Adding or removing programs from an existing dispatcher +The sections above explain how to generate a dispatcher and attach it to an +interface, assuming no existing program is attached.
When one or more programs +are already attached, a couple of extra steps are required to ensure that the +switch is made atomically. + +Briefly, changing the programs attached to an interface entails the following +steps: + +- Reading the existing dispatcher program and obtaining references to the + component programs. + +- Generating a new dispatcher containing the new set of programs (adding or + removing the programs needed). + +- Atomically swapping out the XDP program attachment on the interface so the new + dispatcher takes over from the old one. + +- Unpinning and dismantling the old dispatcher. + +These operations are each described in turn in the following sections. + +*** Reading list of existing programs from the kernel +The first step is to obtain the ID of the currently loaded XDP program using +=bpf_get_link_xdp_info()=. A file descriptor to the dispatcher is obtained using +=bpf_prog_get_fd_by_id()=, and the BTF information attached to the program is +obtained from the kernel. This is checked for the presence of the dispatcher +version field (as explained above), and the operation is aborted if this is not +present, or doesn't match what the library expects. + +Having thus established that the program loaded on the interface is indeed a +compatible dispatcher, the map ID of the map containing the configuration struct +is obtained from the kernel, and the configuration data is loaded from the map +(after checking that the map value size matches the expected configuration +struct). + +Then, the file lock on the directory in =bpffs= is obtained as explained in +the "Locking and pinning" section above, and, while holding this lock, file +descriptors to each of the component programs and =bpf_link= objects are +obtained. The end result is a reference to the full dispatcher structure (and +its component programs), corresponding to that generated on load. 
When +populating the component program structure in memory, the chain call actions and +run priority from the dispatcher configuration map are used instead of parsing +the BTF metadata of each program: this ensures that any modified values +specified at load time will be retained instead of being reverted to the +values compiled into the BTF metadata. Similarly, the =program_flags= array of +the in-kernel dispatcher is used to determine which of the existing component +programs support the =BPF_F_XDP_HAS_FRAGS= flag (see the section on frags +support above). + +*** Generating a new dispatcher +Having obtained a reference to the existing dispatcher, =libxdp= takes that and +the list of programs to add to or remove from the interface, and simply +generates a new dispatcher with the new set of programs. When adding programs, +the whole list of programs is sorted according to their run priorities (as +explained above), resulting in new programs being inserted in the right place in +the existing sequence according to their priority. + +Generating this secondary dispatcher relies on the support for multiple +attachments for =freplace= programs, which was added in kernel 5.10. This allows +the =bpf_link_create()= operation to specify an attachment target in the new +dispatcher. In other words, the component programs will briefly be attached to +both the old and new dispatcher, but only one of those will be attached to the +interface. + +After completion of the new dispatcher, its component programs are pinned in +=bpffs= as described above. + +*** Atomic replace and retry +At this point, =libxdp= has references to both the old dispatcher, already +attached to the interface, and the new one with the modified set of component +programs. The new dispatcher is then atomically swapped in for the old one, +using the =XDP_FLAGS_REPLACE= flag to the netlink operation (and the +accompanying =IFLA_XDP_EXPECTED_FD= attribute). 
+ +Once the atomic replace operation succeeds, the old dispatcher is unpinned from +=bpffs= and the in-memory references to both the old and new dispatchers are +released (since the new dispatcher was already pinned, preventing it from being +detached from the interface). + +Should this atomic replace instead *fail* because the program attached to the +interface changed while the new dispatcher was being built, the whole operation +is simply started over from the beginning. That is, the new dispatcher is +unpinned from =bpffs=, and the in-memory references to both dispatchers are +released (but no unpinning of the old dispatcher is performed!). Then, the +program ID attached to the interface is again read from the kernel, and the +operation proceeds from "Reading list of existing programs from the kernel". + + +** Compatibility with older kernels +The full functionality described above can only be attained with kernel versions +5.10 or newer, because this is the version that introduced support for +re-attaching an freplace program in a secondary attachment point. However, the +freplace functionality itself was introduced in kernel 5.7, so for kernel +versions 5.7 to 5.9, multiple programs can be attached as long as they are all +attached to the dispatcher immediately as they are loaded. This is achieved by +using =bpf_raw_tracepoint_open()= in place of =bpf_link_create()= when attaching +the component programs to the dispatcher. The =bpf_raw_tracepoint_open()= +function doesn't take an attach target as a parameter; instead, it simply +attaches the freplace program to the target that was specified at load time +(which is why it only works when all component programs are loaded together with +the dispatcher). 
diff --git a/lib/libxdp/tests/.gitignore b/lib/libxdp/tests/.gitignore new file mode 100644 index 0000000..cc3a114 --- /dev/null +++ b/lib/libxdp/tests/.gitignore @@ -0,0 +1,4 @@ +test_xsk_refcnt +check_kern_compat +test_xdp_frags +test_dispatcher_versions diff --git a/lib/libxdp/tests/Makefile b/lib/libxdp/tests/Makefile new file mode 100644 index 0000000..3c22901 --- /dev/null +++ b/lib/libxdp/tests/Makefile @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +USER_TARGETS := test_xsk_refcnt check_kern_compat test_xdp_frags test_dispatcher_versions +BPF_TARGETS := xdp_dispatcher_v1 xdp_pass +USER_LIBS := -lpthread + +EXTRA_DEPS += xdp_dispatcher_v1.h +EXTRA_USER_DEPS += test_utils.h + +TEST_FILE := ./test-libxdp.sh +TEST_RUNNER := ./test_runner.sh + +USER_C := ${USER_TARGETS:=.c} +USER_OBJ := ${USER_C:.c=.o} +BPF_OBJS := $(BPF_TARGETS:=.o) + +LIB_DIR := ../.. +LDLIBS += $(USER_LIBS) + +include $(LIB_DIR)/defines.mk + +LDFLAGS+=-L$(LIBXDP_DIR) +ifeq ($(DYNAMIC_LIBXDP),1) + LDLIBS:=-lxdp $(LDLIBS) + OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.so.$(LIBXDP_VERSION) +else + LDLIBS:=-l:libxdp.a $(LDLIBS) + OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.a +endif + +# Detect submodule libbpf source file changes +ifeq ($(SYSTEM_LIBBPF),n) + LIBBPF_SOURCES := $(wildcard $(LIBBPF_DIR)/src/*.[ch]) +endif + +LIBXDP_SOURCES := $(wildcard $(LIBXDP_DIR)/*.[ch] $(LIBXDP_DIR)/*.in) + +CFLAGS += -I$(HEADER_DIR) + +BPF_HEADERS := $(wildcard $(HEADER_DIR)/bpf/*.h) $(wildcard $(HEADER_DIR)/xdp/*.h) + +all: $(USER_TARGETS) $(BPF_OBJS) + +.PHONY: all clean run # none of these name a file; a stray file called "all" or "run" must not mask them +clean:: # also drop the BPF objects and the intermediate .ll files produced by the $(BPF_OBJS) rule + $(Q)rm -f $(USER_TARGETS) $(USER_OBJ) $(BPF_OBJS) $(BPF_OBJS:.o=.ll) + +$(OBJECT_LIBBPF): $(LIBBPF_SOURCES) + $(Q)$(MAKE) -C $(LIB_DIR) libbpf + +$(OBJECT_LIBXDP): $(LIBXDP_SOURCES) + $(Q)$(MAKE) -C $(LIBXDP_DIR) + +# Create expansions for dependencies +LIB_H := ${LIB_OBJS:.o=.h} + +# Detect if any of common obj changed and create dependency on .h-files +$(LIB_OBJS): %.o: %.c %.h $(LIB_H) + $(Q)$(MAKE) -C $(dir $@) $(notdir $@) + 
+ALL_EXEC_TARGETS=$(USER_TARGETS) +$(ALL_EXEC_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) + $(QUIET_CC)$(CC) -Wall $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \ + $< $(LDLIBS) + +$(BPF_OBJS): %.o: %.c $(BPF_HEADERS) $(LIBMK) $(EXTRA_DEPS) + $(QUIET_CLANG)$(CLANG) -S \ + -target $(BPF_TARGET) \ + -D __BPF_TRACING__ \ + $(BPF_CFLAGS) \ + -Wall \ + -Wno-unused-value \ + -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ + -Werror \ + -O2 -emit-llvm -c -g -o ${@:.o=.ll} $< + $(QUIET_LLC)$(LLC) -march=$(BPF_TARGET) -filetype=obj -o $@ ${@:.o=.ll} + +run: all + $(Q)env CC="$(CC)" CFLAGS="$(CFLAGS) $(LDFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDLIBS="$(LDLIBS)" V=$(V) $(TEST_RUNNER) $(TEST_FILE) $(RUN_TESTS) diff --git a/lib/libxdp/tests/check_kern_compat.c b/lib/libxdp/tests/check_kern_compat.c new file mode 100644 index 0000000..8fb8991 --- /dev/null +++ b/lib/libxdp/tests/check_kern_compat.c @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "test_utils.h" +#include "../libxdp_internal.h" + +int main(__unused int argc, __unused char** argv) +{ + silence_libbpf_logging(); + return libxdp_check_kern_compat(); +} diff --git a/lib/libxdp/tests/test-libxdp.sh b/lib/libxdp/tests/test-libxdp.sh new file mode 100644 index 0000000..90fc44c --- /dev/null +++ b/lib/libxdp/tests/test-libxdp.sh @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +ALL_TESTS="test_link_so test_link_a test_old_dispatcher test_xdp_frags test_xsk_prog_refcnt_bpffs test_xsk_prog_refcnt_legacy" + +TESTS_DIR=$(dirname "${BASH_SOURCE[0]}") + +test_link_so() +{ + TMPDIR=$(mktemp --tmpdir -d libxdp-test.XXXXXX) + cat >$TMPDIR/libxdptest.c <<EOF +#include <xdp/libxdp.h> +int main(int argc, char **argv) { + (void) argc; (void) argv; + (void) xdp_program__open_file("filename", "section_name", NULL); + return 0; +} +EOF + $CC -o $TMPDIR/libxdptest $TMPDIR/libxdptest.c $CFLAGS $CPPFLAGS -lxdp $LDLIBS 2>&1 + 
retval=$? + rm -rf "$TMPDIR" + return $retval +} + +test_link_a() +{ + TMPDIR=$(mktemp --tmpdir -d libxdp-test.XXXXXX) + cat >$TMPDIR/libxdptest.c <<EOF +#include <xdp/libxdp.h> +int main(int argc, char **argv) { + (void) argc; (void) argv; + (void) xdp_program__open_file("filename", "section_name", NULL); + return 0; +} +EOF + $CC -o $TMPDIR/libxdptest $TMPDIR/libxdptest.c $CFLAGS $CPPFLAGS -l:libxdp.a $LDLIBS 2>&1 + retval=$? + rm -rf "$TMPDIR" + return $retval +} + +test_refcnt_once() +{ + # We need multiple queues for this test + NUM_QUEUES_REQUIRED=3 + ip link add xsk_veth0 numrxqueues $NUM_QUEUES_REQUIRED type veth peer name xsk_veth1 + check_run $TESTS_DIR/test_xsk_refcnt xsk_veth0 2>&1 + ip link delete xsk_veth0 +} + +check_mount_bpffs() +{ + mount | grep -q /sys/fs/bpf || mount -t bpf bpf /sys/fs/bpf/ || echo "Unable to mount /sys/fs/bpf" + mount | grep -q /sys/fs/bpf +} + +check_unmount_bpffs() +{ + mount | grep -q /sys/fs/bpf && umount /sys/fs/bpf/ || echo "Unable to unmount /sys/fs/bpf" + ! 
mount | grep -q /sys/fs/bpf +} + +test_xsk_prog_refcnt_bpffs() +{ + check_mount_bpffs && test_refcnt_once "$@" +} + +test_xsk_prog_refcnt_legacy() +{ + check_unmount_bpffs && test_refcnt_once "$@" +} + +test_xdp_frags() +{ + skip_if_missing_libxdp_compat + + check_mount_bpffs || return 1 + ip link add xdp_veth_big0 mtu 5000 type veth peer name xdp_veth_big1 mtu 5000 + ip link add xdp_veth_small0 type veth peer name xdp_veth_small1 + check_run $TESTS_DIR/test_xdp_frags xdp_veth_big0 xdp_veth_small0 2>&1 + ip link delete xdp_veth_big0 + ip link delete xdp_veth_small0 +} + +test_old_dispatcher() +{ + skip_if_missing_libxdp_compat + + check_mount_bpffs || return 1 + ip link add xdp_veth0 type veth peer name xdp_veth1 + check_run $TESTS_DIR/test_dispatcher_versions xdp_veth0 + ip link delete xdp_veth0 +} + +cleanup_tests() +{ + ip link del dev xdp_veth_big0 >/dev/null 2>&1 + ip link del dev xdp_veth_small0 >/dev/null 2>&1 + ip link del dev xsk_veth0 >/dev/null 2>&1 +} diff --git a/lib/libxdp/tests/test_dispatcher_versions.c b/lib/libxdp/tests/test_dispatcher_versions.c new file mode 100644 index 0000000..14a8ba8 --- /dev/null +++ b/lib/libxdp/tests/test_dispatcher_versions.c @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE + +#include <errno.h> +#include <linux/err.h> +#include <net/if.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "test_utils.h" +#include "../libxdp_internal.h" +#include "xdp_dispatcher_v1.h" + +#include <xdp/libxdp.h> +#include <bpf/libbpf.h> +#include <bpf/btf.h> + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#define BPFFS_DIR "/sys/fs/bpf/xdp" + +#define PROG_RUN_PRIO 42 +#define PROG_CHAIN_CALL_ACTIONS (1 << XDP_DROP) + +int get_prog_id(int prog_fd) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + int err; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); + if 
(err) + return -errno; + + return info.id; +} + +int load_dispatcher_v1(int ifindex) +{ + struct xdp_dispatcher_config_v1 dispatcher_config = {}; + struct bpf_object *obj_dispatcher, *obj_prog = NULL; + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct bpf_program *dispatcher_prog, *xdp_prog; + int ret, btf_id, lfd = -1, dispatcher_id; + char pin_path[PATH_MAX], buf[PATH_MAX]; + const char *attach_func = "prog0"; + struct bpf_map *map; + + if (!ifindex) + return -ENOENT; + + obj_dispatcher = bpf_object__open("xdp_dispatcher_v1.o"); + if (!obj_dispatcher) + return -errno; + + btf_id = btf__find_by_name_kind(bpf_object__btf(obj_dispatcher), + attach_func, BTF_KIND_FUNC); + if (btf_id <= 0) { + ret = -ENOENT; + goto out; + } + opts.target_btf_id = btf_id; + + map = bpf_object__next_map(obj_dispatcher, NULL); + if (!map) { + ret = -ENOENT; + goto out; + } + + dispatcher_prog = bpf_object__find_program_by_name(obj_dispatcher, + "xdp_dispatcher"); + if (!dispatcher_prog) { + ret = -errno; + goto out; + } + + dispatcher_config.num_progs_enabled = 1; + dispatcher_config.chain_call_actions[0] = PROG_CHAIN_CALL_ACTIONS; + dispatcher_config.run_prios[0] = PROG_RUN_PRIO; + + ret = bpf_map__set_initial_value(map, &dispatcher_config, + sizeof(dispatcher_config)); + if (ret) + goto out; + + + ret = bpf_object__load(obj_dispatcher); + if (ret) + goto out; + + dispatcher_id = get_prog_id(bpf_program__fd(dispatcher_prog)); + if (dispatcher_id < 0) { + ret = dispatcher_id; + goto out; + } + + obj_prog = bpf_object__open("xdp_pass.o"); + if (!obj_prog) { + ret = -errno; + goto out; + } + + xdp_prog = bpf_object__find_program_by_name(obj_prog, "xdp_pass"); + if (!xdp_prog) { + ret = -errno; + goto out; + } + + ret = bpf_program__set_attach_target(xdp_prog, + bpf_program__fd(dispatcher_prog), + attach_func); + if (ret) + goto out; + + bpf_program__set_type(xdp_prog, BPF_PROG_TYPE_EXT); + bpf_program__set_expected_attach_type(xdp_prog, 0); + + ret = bpf_object__load(obj_prog); + 
if (ret) + goto out; + + lfd = bpf_link_create(bpf_program__fd(xdp_prog), + bpf_program__fd(dispatcher_prog), 0, &opts); + if (lfd < 0) { + ret = -errno; + goto out; + } + + ret = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", + BPFFS_DIR, ifindex, dispatcher_id); + if (ret) + goto out; + + ret = mkdir(BPFFS_DIR, S_IRWXU); + if (ret && errno != EEXIST) { + ret = -errno; + printf("mkdir err (%s): %s\n", BPFFS_DIR, strerror(-ret)); + goto out; + } + + ret = mkdir(pin_path, S_IRWXU); + if (ret) { + ret = -errno; + printf("mkdir err (%s): %s\n", pin_path, strerror(-ret)); + goto out; + } + + ret = try_snprintf(buf, sizeof(buf), "%s/prog0-link", pin_path); + if (ret) + goto err_unpin; + + ret = bpf_obj_pin(lfd, buf); + if (ret) + goto err_unpin; + + ret = try_snprintf(buf, sizeof(buf), "%s/prog0-prog", pin_path); + if (ret) + goto err_unpin; + + ret = bpf_obj_pin(bpf_program__fd(xdp_prog), buf); + if (ret) + goto err_unpin; + + ret = xdp_attach_fd(bpf_program__fd(dispatcher_prog), -1, ifindex, + XDP_MODE_NATIVE); + if (ret) + goto err_unpin; + +out: + if (lfd >= 0) + close(lfd); + bpf_object__close(obj_dispatcher); + bpf_object__close(obj_prog); + return ret; + +err_unpin: + if (!try_snprintf(buf, sizeof(buf), "%s/prog0-link", pin_path)) + unlink(buf); + if (!try_snprintf(buf, sizeof(buf), "%s/prog0-prog", pin_path)) + unlink(buf); + rmdir(pin_path); + goto out; +} + +int check_old_dispatcher(int ifindex) +{ + struct xdp_multiprog *mp = NULL; + struct xdp_program *xdp_prog; + char buf[100]; + int ret; + + ret = load_dispatcher_v1(ifindex); + if (ret) + goto out; + + mp = xdp_multiprog__get_from_ifindex(ifindex); + ret = libxdp_get_error(mp); + if (ret) + goto out; + + if (xdp_multiprog__is_legacy(mp)) { + printf("Got unexpected legacy multiprog\n"); + ret = -EINVAL; + goto out; + } + + if (xdp_multiprog__program_count(mp) != 1) { + printf("Expected 1 attached program, got %d\n", + xdp_multiprog__program_count(mp)); + ret = -EINVAL; + goto out; + } + + 
xdp_prog = xdp_multiprog__next_prog(NULL, mp); + if (!xdp_prog) { + ret = -errno; + goto out; + } + + if (strcmp(xdp_program__name(xdp_prog), "xdp_pass")) { + printf("Expected xdp_pass program, got %s\n", + xdp_program__name(xdp_prog)); + ret = -EINVAL; + goto out; + } + + if (xdp_program__run_prio(xdp_prog) != PROG_RUN_PRIO) { + printf("Expected run prio %d got %d\n", PROG_RUN_PRIO, + xdp_program__run_prio(xdp_prog)); + ret = -EINVAL; + goto out; + } + + ret = xdp_program__print_chain_call_actions(xdp_prog, buf, sizeof(buf)); + if (ret) + goto out; + + if (strcmp(buf, "XDP_DROP")) { /* must match PROG_CHAIN_CALL_ACTIONS, i.e. (1 << XDP_DROP) */ + printf("Expected actions XDP_DROP, got %s\n", buf); + ret = -EINVAL; + goto out; + } + + xdp_prog = xdp_program__open_file("xdp_pass.o", "xdp", NULL); + ret = libxdp_get_error(xdp_prog); + if (ret) + goto out; + + ret = xdp_program__attach(xdp_prog, ifindex, XDP_MODE_NATIVE, 0); + xdp_program__close(xdp_prog); + if (!ret) { + printf("Shouldn't have been able to attach a new program to ifindex!\n"); + ret = -EINVAL; + goto out; + } + ret = 0; + +out: + if (mp) + xdp_multiprog__detach(mp); + xdp_multiprog__close(mp); + return ret; +} + +static void usage(char *progname) +{ + fprintf(stderr, "Usage: %s <ifname>\n", progname); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + int ifindex, ret; + char *envval; + + envval = secure_getenv("VERBOSE_TESTS"); + + silence_libbpf_logging(); + if (envval && envval[0] == '1') + verbose_libxdp_logging(); + else + silence_libxdp_logging(); + + if (argc != 2) + usage(argv[0]); + + ifindex = if_nametoindex(argv[1]); + + ret = check_old_dispatcher(ifindex); + + return ret; +} diff --git a/lib/libxdp/tests/test_runner.sh b/lib/libxdp/tests/test_runner.sh new file mode 100755 index 0000000..eb043a1 --- /dev/null +++ b/lib/libxdp/tests/test_runner.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Script to setup and manage tests for xdp-tools. +# Based on the test-env script from xdp-tutorial. 
+# +# Author: Toke Høiland-Jørgensen (toke@redhat.com) +# Date: 26 May 2020 +# Copyright (c) 2020 Red Hat + +set -o errexit +set -o nounset +umask 077 + +TEST_PROG_DIR="${TEST_PROG_DIR:-$(dirname "${BASH_SOURCE[0]}")}" +ALL_TESTS="" +VERBOSE_TESTS=${V:-0} + +export VERBOSE_TESTS + +# Odd return value for skipping, as only 0-255 is valid. +SKIPPED_TEST=249 + +skip_if_missing_libxdp_compat() +{ + if ! $TEST_PROG_DIR/check_kern_compat; then + exit "$SKIPPED_TEST" + fi +} + +is_func() +{ + type "$1" 2>/dev/null | grep -q 'is a function' +} + +check_run() +{ + local ret + + [ "$VERBOSE_TESTS" -eq "1" ] && echo "$@" + "$@" + ret=$? + if [ "$ret" -ne "0" ]; then + exit $ret + fi +} + +exec_test() +{ + local testn="$1" + local output + local ret + + printf " %-30s" "[$testn]" + if ! is_func "$testn"; then + echo "INVALID" + return 1 + fi + + output=$($testn 2>&1) + ret=$? + if [ "$ret" -eq "0" ]; then + echo "PASS" + elif [ "$ret" -eq "$SKIPPED_TEST" ]; then + echo "SKIPPED" + ret=0 + else + echo "FAIL" + fi + if [ "$ret" -ne "0" ] || [ "$VERBOSE_TESTS" -eq "1" ]; then + echo "$output" | sed 's/^/\t/' + fi + return $ret +} + +run_tests() +{ + local TESTS="$*" + local ret=0 + [ -z "$TESTS" ] && TESTS="$ALL_TESTS" + + echo " Running tests from $TEST_DEFINITIONS" + + for testn in $TESTS; do + exec_test $testn || ret=1 + if is_func cleanup_tests; then + cleanup_tests || true + fi + done + + return $ret +} + +usage() +{ + echo "Usage: $0 <test_definition_file> [test names]" >&2 + exit 1 +} + +if [ "$EUID" -ne "0" ]; then + if command -v sudo >/dev/null 2>&1; then + exec sudo env CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDLIBS="$LDLIBS" V=${VERBOSE_TESTS} "$0" "$@" + else + echo "Tests must be run as root" >&2; exit 1 # this script defines no die() helper, so report and exit explicitly + fi +else + if [ "${DID_UNSHARE:-0}" -ne "1" ]; then + echo "Executing tests in separate net- and mount namespaces" >&2 + exec env DID_UNSHARE=1 unshare -n -m "$0" "$@" + fi +fi + +TEST_DEFINITIONS="${1:-}" +[ -f "$TEST_DEFINITIONS" ] || usage +source "$TEST_DEFINITIONS" 
+ +shift +run_tests "$@" diff --git a/lib/libxdp/tests/test_utils.h b/lib/libxdp/tests/test_utils.h new file mode 100644 index 0000000..1642c12 --- /dev/null +++ b/lib/libxdp/tests/test_utils.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TEST_UTILS_H +#define __TEST_UTILS_H + +#include <bpf/libbpf.h> +#include <xdp/libxdp.h> + +#define __unused __attribute__((unused)) + +static int libbpf_silent_func(__unused enum libbpf_print_level level, + __unused const char *format, + __unused va_list args) +{ + return 0; +} + +static inline void silence_libbpf_logging(void) +{ + libbpf_set_print(libbpf_silent_func); +} + +static int libxdp_silent_func(__unused enum libxdp_print_level level, + __unused const char *format, + __unused va_list args) +{ + return 0; +} + +static int libxdp_verbose_func(__unused enum libxdp_print_level level, + __unused const char *format, + __unused va_list args) +{ + fprintf(stderr, " "); + vfprintf(stderr, format, args); + return 0; +} + +static inline void silence_libxdp_logging(void) +{ + libxdp_set_print(libxdp_silent_func); +} + +static inline void verbose_libxdp_logging(void) +{ + libxdp_set_print(libxdp_verbose_func); +} + +#endif diff --git a/lib/libxdp/tests/test_xdp_frags.c b/lib/libxdp/tests/test_xdp_frags.c new file mode 100644 index 0000000..d70e802 --- /dev/null +++ b/lib/libxdp/tests/test_xdp_frags.c @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE + +#include <errno.h> +#include <linux/err.h> +#include <net/if.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <unistd.h> + +#include "test_utils.h" + +#include <xdp/libxdp.h> +#include <bpf/libbpf.h> + +# define ARRAY_SIZE(_x) (sizeof(_x) / sizeof((_x)[0])) + +static bool kern_compat; + + +static struct xdp_program *load_prog(void) +{ + DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, + .prog_name = "xdp_pass", + .find_filename = "xdp-dispatcher.o", + ); + 
return xdp_program__create(&opts); +} + +static int check_attached_progs(int ifindex, int count, bool frags) +{ + struct xdp_multiprog *mp; + int ret; + + /* If the kernel does not support frags, we always expect + * frags support to be disabled on a returned dispatcher + */ + if (!kern_compat) + frags = false; + + mp = xdp_multiprog__get_from_ifindex(ifindex); + ret = libxdp_get_error(mp); + if (ret) { + fprintf(stderr, "Couldn't get multiprog on ifindex %d: %s\n", + ifindex, strerror(-ret)); + return ret; + } + + ret = -EINVAL; + + if (xdp_multiprog__is_legacy(mp)) { + fprintf(stderr, "Found legacy prog on ifindex %d\n", ifindex); + goto out; + } + + if (xdp_multiprog__program_count(mp) != count) { + fprintf(stderr, "Expected %d programs loaded on ifindex %d, found %d\n", + count, ifindex, xdp_multiprog__program_count(mp)); + goto out; + } + + if (xdp_multiprog__xdp_frags_support(mp) != frags) { + fprintf(stderr, + "Multiprog on ifindex %d %s frags, expected %s\n", + ifindex, + xdp_multiprog__xdp_frags_support(mp) ? + "supports" : + "does not support", + frags ? "support" : "no support"); + goto out; + } + + ret = 0; + +out: + xdp_multiprog__close(mp); + return ret; +} + +static void print_test_result(const char *func, int ret) +{ + fflush(stderr); + fprintf(stderr, "%s:\t%s\n", func, ret ? 
"FAILED" : "PASSED"); + fflush(stdout); +} + +static int load_attach_prog(struct xdp_program **prog, int ifindex, bool frags) +{ + int ret; + + *prog = load_prog(); + if (!*prog) { + ret = -errno; + fprintf(stderr, "Couldn't load program: %s\n", strerror(-ret)); + return ret; + } + + ret = xdp_program__set_xdp_frags_support(*prog, frags); + if (ret) + return ret; + + return xdp_program__attach(*prog, ifindex, XDP_MODE_NATIVE, 0); +} + +static int _check_load(int ifindex, bool frags, bool should_succeed) +{ + struct xdp_program *prog = NULL; + bool attached; + int ret; + + ret = load_attach_prog(&prog, ifindex, frags); + attached = !ret; + + if (attached != should_succeed) { + ret = -EINVAL; + goto out; + } + + if (should_succeed) + ret = check_attached_progs(ifindex, 1, frags); + else + ret = 0; + +out: + if (attached) + xdp_program__detach(prog, ifindex, XDP_MODE_NATIVE, 0); + xdp_program__close(prog); + return ret; +} + +static int check_load_frags(int ifindex_bigmtu, int ifindex_smallmtu) +{ + int ret = _check_load(ifindex_smallmtu, true, true); + if (!ret && ifindex_bigmtu) /* ifindex_bigmtu == 0 means: skip the big-MTU check */ + ret = _check_load(ifindex_bigmtu, true, true); /* keep the result; it used to be discarded */ + print_test_result(__func__, ret); + return ret; +} + +static int check_load_nofrags_success(int ifindex) +{ + int ret = _check_load(ifindex, false, true); + print_test_result(__func__, ret); + return ret; +} + +static int check_load_nofrags_fail(int ifindex) +{ + int ret = _check_load(ifindex, false, false); + print_test_result(__func__, ret); + return ret; +} +static int check_load_frags_multi(int ifindex) +{ + struct xdp_program *prog1 = NULL, *prog2 = NULL; + int ret; + + ret = load_attach_prog(&prog1, ifindex, true); + if (ret) + goto out; + + ret = load_attach_prog(&prog2, ifindex, true); + if (ret) + goto out_prog1; + + ret = check_attached_progs(ifindex, 2, true); + + xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); +out_prog1: + xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); +out: + xdp_program__close(prog2); + 
xdp_program__close(prog1); + print_test_result(__func__, ret); + return ret; +} + +static int check_load_mix_small(int ifindex) +{ + struct xdp_program *prog1 = NULL, *prog2 = NULL; + int ret; + + ret = load_attach_prog(&prog1, ifindex, true); + if (ret) + goto out; + + /* First program attached, dispatcher supports frags */ + ret = check_attached_progs(ifindex, 1, true); + if (ret) + goto out_prog1; /* prog1 is attached at this point: detach it on the way out */ + + ret = load_attach_prog(&prog2, ifindex, false); + if (ret) + goto out_prog1; + + /* Mixed program attachment, dispatcher should not support frags */ + ret = check_attached_progs(ifindex, 2, false); + + ret = xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0) || ret; + if (ret) + goto out_prog1; + + /* Second program removed, back to frags-only */ + ret = check_attached_progs(ifindex, 1, true) || ret; + +out_prog1: + xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); + +out: + xdp_program__close(prog2); + xdp_program__close(prog1); + print_test_result(__func__, ret); + return ret; +} + +static int check_load_mix_big(int ifindex) +{ + struct xdp_program *prog1 = NULL, *prog2 = NULL; + int ret; + + ret = load_attach_prog(&prog1, ifindex, true); + if (ret) + goto out; + + /* First program attached, dispatcher supports frags */ + ret = check_attached_progs(ifindex, 1, true); + if (ret) + goto out_prog1; /* prog1 is attached at this point: detach it on the way out */ + + /* Second non-frags program should fail on big-MTU device */ + ret = load_attach_prog(&prog2, ifindex, false); + if (!ret) { + xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); + ret = -EINVAL; + goto out_prog1; + } + + /* Still only a single program loaded, with frags support */ + ret = check_attached_progs(ifindex, 1, true); + +out_prog1: + xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); + +out: + xdp_program__close(prog2); + xdp_program__close(prog1); + print_test_result(__func__, ret); + return ret; +} + + +static bool check_frags_compat(void) +{ + struct xdp_program *test_prog; + struct bpf_program *prog; + struct bpf_object *obj; + bool ret = 
false; + int err; + + test_prog = load_prog(); + if (!test_prog) + return false; + + obj = xdp_program__bpf_obj(test_prog); + if (!obj) + goto out; + + prog = bpf_object__find_program_by_name(obj, "xdp_pass"); + if (!prog) + goto out; + + bpf_program__set_flags(prog, BPF_F_XDP_HAS_FRAGS); + err = bpf_object__load(obj); + if (!err) { + printf("Kernel supports XDP programs with frags\n"); + ret = true; + } else { + printf("Kernel DOES NOT support XDP programs with frags\n"); + } + fflush(stdout); + +out: + xdp_program__close(test_prog); + return ret; +} + +static void usage(char *progname) +{ + fprintf(stderr, "Usage: %s <ifname_bigmtu> <ifname_smallmtu>\n", progname); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int ifindex_bigmtu, ifindex_smallmtu, ret; + char *envval; + + envval = secure_getenv("VERBOSE_TESTS"); + + silence_libbpf_logging(); + if (envval && envval[0] == '1') + verbose_libxdp_logging(); + else + silence_libxdp_logging(); + + kern_compat = check_frags_compat(); + + if (argc != 3) + usage(argv[0]); + + ifindex_bigmtu = if_nametoindex(argv[1]); + ifindex_smallmtu = if_nametoindex(argv[2]); + if (!ifindex_bigmtu || !ifindex_smallmtu) { + fprintf(stderr, "Interface '%s' or '%s' not found.\n", argv[1], argv[2]); + usage(argv[0]); + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = check_load_frags(kern_compat ? 
ifindex_bigmtu : 0, ifindex_smallmtu); + ret = check_load_nofrags_success(ifindex_smallmtu) || ret; + if (kern_compat) { + ret = check_load_nofrags_fail(ifindex_bigmtu) || ret; + ret = check_load_frags_multi(ifindex_bigmtu) || ret; + ret = check_load_mix_big(ifindex_bigmtu) || ret; + } + ret = check_load_mix_small(ifindex_smallmtu) || ret; + + return ret; +} diff --git a/lib/libxdp/tests/test_xsk_refcnt.c b/lib/libxdp/tests/test_xsk_refcnt.c new file mode 100644 index 0000000..bdd22da --- /dev/null +++ b/lib/libxdp/tests/test_xsk_refcnt.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#include <errno.h> +#include <linux/err.h> +#include <net/if.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <unistd.h> + +#include "test_utils.h" + +#include <xdp/libxdp.h> +#include <xdp/xsk.h> + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +#define MAX_EVENTS 10 +#define MAX_NUM_QUEUES 4 +#define TEST_NAME_LENGTH 128 + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; +}; + +/* Event holds socket operations that are run concurrently + * and in theory can produce a race condition + */ +struct xsk_test_event { + u32 num_create; + u32 num_delete; + u32 create_qids[MAX_NUM_QUEUES]; /* QIDs for sockets being created in this event */ + u32 delete_qids[MAX_NUM_QUEUES]; /* QIDs for sockets being deleted in this event */ +}; + +struct xsk_test { + char name[TEST_NAME_LENGTH]; + u32 num_events; + struct xsk_test_event events[MAX_EVENTS]; +}; + +/* Tests that use less queues must come first, + * so we can run all possible tests on VMs with + * small number of CPUs + */ +static struct xsk_test all_tests[] = { + { "Single socket created and deleted", + .num_events = 2, + 
.events = {{ .num_create = 1, .create_qids = {0} }, + { .num_delete = 1, .delete_qids = {0} } + }}, + { "2 sockets, created and deleted sequentially", + .num_events = 4, + .events = {{ .num_create = 1, .create_qids = {0} }, + { .num_create = 1, .create_qids = {1} }, + { .num_delete = 1, .delete_qids = {0} }, + { .num_delete = 1, .delete_qids = {1} } + }}, + { "2 sockets, created sequentially and deleted asynchronously", + .num_events = 3, + .events = {{ .num_create = 1, .create_qids = {0} }, + { .num_create = 1, .create_qids = {1} }, + { .num_delete = 2, .delete_qids = {0, 1} } + }}, + { "2 sockets, asynchronously delete and create", + .num_events = 3, + .events = {{ .num_create = 1, .create_qids = {0} }, + { .num_create = 1, .create_qids = {1}, + .num_delete = 1, .delete_qids = {0} }, + { .num_delete = 1, .delete_qids = {1} } + }}, + { "3 sockets, created and deleted sequentially", + .num_events = 6, + .events = {{ .num_create = 1, .create_qids = {0} }, + { .num_create = 1, .create_qids = {1} }, + { .num_create = 1, .create_qids = {2} }, + { .num_delete = 1, .delete_qids = {1} }, + { .num_delete = 1, .delete_qids = {2} }, + { .num_delete = 1, .delete_qids = {0} } + }}, +}; + +# define ARRAY_SIZE(_x) (sizeof(_x) / sizeof((_x)[0])) + +static const char *opt_if; +static const u8 num_tests = ARRAY_SIZE(all_tests); + +static struct xsk_socket_info *xsks[MAX_NUM_QUEUES]; + +#define FRAME_SIZE 64 +#define NUM_FRAMES (XSK_RING_CONS__DEFAULT_NUM_DESCS * 2) + +static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) +{ + struct xsk_umem_info *umem; + int ret; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + exit(EXIT_FAILURE); + + ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, + NULL); + if (ret) + exit(ret); + + umem->buffer = buffer; + return umem; +} + +static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, + unsigned int qid) +{ + struct xsk_socket_config cfg = {}; + struct xsk_socket_info *xsk; + 
struct xsk_ring_cons *rxr; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + exit(EXIT_FAILURE); + + xsk->umem = umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + + rxr = &xsk->rx; + xsk_socket__create(&xsk->xsk, opt_if, qid, umem->umem, + rxr, NULL, &cfg); + + return xsk; +} + +static void *create_socket(void *args) +{ + struct xsk_umem_info *umem; + u32 qid = *(u32 *)args; + void *buffs; + + if (posix_memalign(&buffs, + getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * FRAME_SIZE)) { + fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + umem = xsk_configure_umem(buffs, NUM_FRAMES * FRAME_SIZE); + xsks[qid] = xsk_configure_socket(umem, qid); + + return NULL; +} + +static void *delete_socket(void *args) +{ + u32 qid = *(u32 *)args; + struct xsk_umem *umem; + void *buff; + + buff = xsks[qid]->umem->buffer; + umem = xsks[qid]->umem->umem; + xsk_socket__delete(xsks[qid]->xsk); + free(buff); + (void)xsk_umem__delete(umem); + + return NULL; +} + +static bool xsk_prog_attached(void) +{ + char xsk_prog_name[] = "xsk_def_prog"; + int ifindex = if_nametoindex(opt_if); + struct xdp_program *xsk_prog; + struct xdp_multiprog *mp; + bool answer = false; + + mp = xdp_multiprog__get_from_ifindex(ifindex); + if (IS_ERR_OR_NULL(mp)) + return false; + + xsk_prog = xdp_multiprog__is_legacy(mp) ? 
xdp_multiprog__main_prog(mp) : + xdp_multiprog__next_prog(NULL, mp); + + if (IS_ERR_OR_NULL(xsk_prog)) + goto free_mp; + + answer = !strncmp(xsk_prog_name, xdp_program__name(xsk_prog), + sizeof(xsk_prog_name)); +free_mp: + xdp_multiprog__close(mp); + return answer; +} + +static void update_reference_refcnt(struct xsk_test_event *event, int *refcnt) +{ + *refcnt += event->num_create; + *refcnt -= event->num_delete; +} + +static bool check_run_event(struct xsk_test_event *event, int *refcnt) +{ + pthread_t threads[MAX_NUM_QUEUES]; + bool prog_attached, prog_needed; + u8 thread_num = 0, i; + int ret; + + update_reference_refcnt(event, refcnt); + + for (i = 0; i < event->num_create; i++) { + ret = pthread_create(&threads[thread_num++], NULL, + &create_socket, &event->create_qids[i]); + if (ret) + exit(ret); + } + + for (i = 0; i < event->num_delete; i++) { + ret = pthread_create(&threads[thread_num++], NULL, + &delete_socket, &event->delete_qids[i]); + if (ret) + exit(ret); + } + + for (i = 0; i < thread_num; i++) + pthread_join(threads[i], NULL); + + prog_attached = xsk_prog_attached(); + prog_needed = *refcnt > 0; + + if (prog_needed != prog_attached) { + printf("Program is referenced by %d sockets, but is %s attached\n", + *refcnt, prog_attached ? "still" : "not"); + return false; + } + + return true; +} + +static bool check_run_test(struct xsk_test *test) +{ + bool test_ok = false; + int refcnt = 0; + u8 i = 0; + + for (i = 0; i < test->num_events; i++) { + if (!check_run_event(&test->events[i], &refcnt)) { + printf("Event %u failed\n", i); + goto print_result; + } + } + + /* Do not let tests interfere with each other */ + sleep(1); + + test_ok = true; + +print_result: + printf("%s: %s\n", test->name, test_ok ? 
"PASSED" : "FAILED"); + return test_ok; +} + +static int read_args(int argc, char **argv) +{ + if (argc != 2) + return -1; + + opt_if = argv[1]; + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + u8 i = 0; + + if (read_args(argc, argv)) + return -1; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + silence_libbpf_logging(); + + for (i = 0; i < num_tests; i++) { + if (!check_run_test(&all_tests[i])) + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/lib/libxdp/tests/xdp_dispatcher_v1.c b/lib/libxdp/tests/xdp_dispatcher_v1.c new file mode 100644 index 0000000..00bb426 --- /dev/null +++ b/lib/libxdp/tests/xdp_dispatcher_v1.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include "xdp_dispatcher_v1.h" + +#define XDP_METADATA_SECTION "xdp_metadata" +#define XDP_DISPATCHER_VERSION_V1 1 +#define XDP_DISPATCHER_RETVAL 31 + + +static volatile const struct xdp_dispatcher_config_v1 conf = {}; + +__attribute__ ((noinline)) +int prog0(struct xdp_md *ctx) { + volatile int ret = XDP_DISPATCHER_RETVAL; + + if (!ctx) + return XDP_ABORTED; + return ret; +} +__attribute__ ((noinline)) + +SEC("xdp") +int xdp_dispatcher(struct xdp_md *ctx) +{ + __u8 num_progs_enabled = conf.num_progs_enabled; + int ret; + + if (num_progs_enabled < 1) + goto out; + ret = prog0(ctx); + if (!((1U << ret) & conf.chain_call_actions[0])) + return ret; + +out: + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; +__uint(dispatcher_version, XDP_DISPATCHER_VERSION_V1) SEC(XDP_METADATA_SECTION); diff --git a/lib/libxdp/tests/xdp_dispatcher_v1.h b/lib/libxdp/tests/xdp_dispatcher_v1.h new file mode 100644 index 0000000..55dac37 --- /dev/null +++ b/lib/libxdp/tests/xdp_dispatcher_v1.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + 
#ifndef __XDP_DISPATCHER_V1_H
#define __XDP_DISPATCHER_V1_H

/* Size of the per-slot arrays below; a definition supplied before this
 * header is included takes precedence over the default of 10.
 */
#ifndef MAX_DISPATCHER_ACTIONS
#define MAX_DISPATCHER_ACTIONS 10
#endif

/* Configuration block read by the version 1 test dispatcher program. */
struct xdp_dispatcher_config_v1 {
	/* Number of program slots the dispatcher should run. */
	__u8 num_progs_enabled;
	/* Per-slot bitmask; the dispatcher tests bit (1U << return value). */
	__u32 chain_call_actions[MAX_DISPATCHER_ACTIONS];
	/* NOTE(review): presumably per-slot run priorities; no reader is
	 * visible in this part of the tree -- confirm against libxdp.
	 */
	__u32 run_prios[MAX_DISPATCHER_ACTIONS];
};

#endif
diff --git a/lib/libxdp/tests/xdp_pass.c b/lib/libxdp/tests/xdp_pass.c
new file mode 100644
index 0000000..6b61a00
--- /dev/null
+++ b/lib/libxdp/tests/xdp_pass.c
@@ -0,0 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Minimal XDP program: returns XDP_PASS for every packet. */
SEC("xdp")
int xdp_pass(struct xdp_md *ctx)
{
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
diff --git a/lib/libxdp/xdp-dispatcher.c.in b/lib/libxdp/xdp-dispatcher.c.in
new file mode 100644
index 0000000..6214d78
--- /dev/null
+++ b/lib/libxdp/xdp-dispatcher.c.in
@@ -0,0 +1,82 @@
/* SPDX-License-Identifier: GPL-2.0 */
divert(-1)
#forloop definition taken from example in the M4 manual
define(`forloop', `pushdef(`$1', `$2')_forloop($@)popdef(`$1')')
define(`_forloop',`$4`'ifelse($1, decr(`$3'), `', `define(`$1', incr($1))$0($@)')')
# NUM_PROGS takes the value of MAX_DISPATCHER_ACTIONS when that macro is defined, otherwise 10
define(`NUM_PROGS',ifdef(`MAX_DISPATCHER_ACTIONS', MAX_DISPATCHER_ACTIONS, `10'))
divert(0)dnl

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include <xdp/prog_dispatcher.h>

/* While 'const volatile' sounds a little like an oxymoron, there's reason
 * behind the madness:
 *
 * - const places the data in rodata, where libbpf will mark it as read-only and
 *   frozen on program load, letting the kernel do dead code elimination based
 *   on the values.
 *
 * - volatile prevents the compiler from optimising away the checks based on the
 *   compile-time value of the variables, which is important since we will be
 *   changing the values before loading the program into the kernel.
 */
static volatile const struct xdp_dispatcher_config conf = {};

/* The volatile return value prevents the compiler from assuming it knows the
 * return value and optimising based on that.
 */
forloop(`i', `0', NUM_PROGS,
`__attribute__ ((noinline))
int format(`prog%d', i)(struct xdp_md *ctx) {
	volatile int ret = XDP_DISPATCHER_RETVAL;

	if (!ctx)
		return XDP_ABORTED;
	return ret;
}
')

/* Stub with the same body as the generated program slots above; the
 * dispatcher below calls it after the last slot.
 */
__attribute__ ((noinline))
int compat_test(struct xdp_md *ctx) {
	volatile int ret = XDP_DISPATCHER_RETVAL;

	if (!ctx)
		return XDP_ABORTED;
	return ret;
}


/* Entry point. Runs the enabled slots in order and stops at the first slot
 * whose return value has no matching bit set in the chain-call mask of that
 * slot, returning that value; when every enabled slot chains, the result is
 * XDP_PASS.
 */
SEC("xdp")
int xdp_dispatcher(struct xdp_md *ctx)
{
	__u8 num_progs_enabled = conf.num_progs_enabled;
	int ret;
forloop(`i', `0', NUM_PROGS,
`
	if (num_progs_enabled < incr(i))
		goto out;
	ret = format(`prog%d', i)(ctx);
	if (!((1U << ret) & conf.chain_call_actions[i]))
		return ret;
')
	/* keep a reference to the compat_test() function so we can use it
	 * as an freplace target in xdp_multiprog__check_compat() in libxdp
	 */
	if (num_progs_enabled < incr(NUM_PROGS))
		goto out;
	ret = compat_test(ctx);
out:
	return XDP_PASS;
}

/* Second program in the same object: passes every packet unconditionally. */
SEC("xdp")
int xdp_pass(struct xdp_md *ctx)
{
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
__uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION);
diff --git a/lib/libxdp/xsk.c b/lib/libxdp/xsk.c
new file mode 100644
index 0000000..c6c201b
--- /dev/null
+++ b/lib/libxdp/xsk.c
@@ -0,0 +1,1299 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2021 Intel Corporation.
+ * + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <arpa/inet.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <dirent.h> +#include <linux/err.h> +#include <linux/ethtool.h> +#include <linux/filter.h> +#include <linux/if_ether.h> +#include <linux/if_link.h> +#include <linux/if_packet.h> +#include <linux/if_xdp.h> +#include <linux/list.h> +#include <linux/sockios.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <xdp/xsk.h> + +#include "libxdp_internal.h" +#include "xsk_def_xdp_prog.h" +#include "bpf_instr.h" + +#ifndef SOL_XDP + #define SOL_XDP 283 +#endif + +#ifndef AF_XDP + #define AF_XDP 44 +#endif + +#ifndef PF_XDP + #define PF_XDP AF_XDP +#endif + +#ifndef SO_NETNS_COOKIE + #define SO_NETNS_COOKIE 71 +#endif + +#define INIT_NS 1 + +struct xsk_umem { + struct xsk_ring_prod *fill_save; + struct xsk_ring_cons *comp_save; + char *umem_area; + struct xsk_umem_config config; + int fd; + int refcount; + struct list_head ctx_list; + bool rx_ring_setup_done; + bool tx_ring_setup_done; +}; + +struct xsk_ctx { + struct xsk_ring_prod *fill; + struct xsk_ring_cons *comp; + struct xsk_umem *umem; + __u32 queue_id; + int refcount; + int ifindex; + __u64 netns_cookie; + int xsks_map_fd; + struct list_head list; + struct xdp_program *xdp_prog; + int refcnt_map_fd; + char ifname[IFNAMSIZ]; +}; + +struct xsk_socket { + struct xsk_ring_cons *rx; + struct xsk_ring_prod *tx; + struct xsk_ctx *ctx; + struct xsk_socket_config config; + int fd; +}; + +struct xsk_nl_info { + int ifindex; + int fd; + bool xdp_prog_attached; +}; + +/* Up until and including Linux 5.3 */ +struct xdp_ring_offset_v1 { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +/* Up until and including Linux 5.3 */ +struct xdp_mmap_offsets_v1 { + struct xdp_ring_offset_v1 rx; + struct xdp_ring_offset_v1 tx; + 
struct xdp_ring_offset_v1 fr; + struct xdp_ring_offset_v1 cr; +}; + +/* Export all inline helpers as symbols for use by language bindings. */ +extern inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, + __u32 idx); +extern inline const __u64 * +xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); +extern inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, + __u32 idx); +extern inline const struct xdp_desc * +xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); +extern inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); +extern inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb); +extern inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb); +extern inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, + __u32 *idx); +extern inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); +extern inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, + __u32 *idx); +extern inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); +extern inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); +extern inline void *xsk_umem__get_data(void *umem_area, __u64 addr); +extern inline __u64 xsk_umem__extract_addr(__u64 addr); +extern inline __u64 xsk_umem__extract_offset(__u64 addr); +extern inline __u64 xsk_umem__add_offset_to_addr(__u64 addr); + +int xsk_umem__fd(const struct xsk_umem *umem) +{ + return umem ? umem->fd : -EINVAL; +} + +int xsk_socket__fd(const struct xsk_socket *xsk) +{ + return xsk ? 
xsk->fd : -EINVAL; +} + +static bool xsk_page_aligned(void *buffer) +{ + unsigned long addr = (unsigned long)buffer; + + return !(addr & (getpagesize() - 1)); +} + +static void xsk_set_umem_config(struct xsk_umem_config *cfg, + const struct xsk_umem_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + cfg->flags = XSK_UMEM__DEFAULT_FLAGS; + return; + } + + cfg->fill_size = usr_cfg->fill_size; + cfg->comp_size = usr_cfg->comp_size; + cfg->frame_size = usr_cfg->frame_size; + cfg->frame_headroom = usr_cfg->frame_headroom; + cfg->flags = usr_cfg->flags; +} + +static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, + const struct xsk_socket_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->libbpf_flags = 0; + cfg->xdp_flags = 0; + cfg->bind_flags = 0; + return 0; + } + + if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD) + return -EINVAL; + + cfg->rx_size = usr_cfg->rx_size; + cfg->tx_size = usr_cfg->tx_size; + cfg->libbpf_flags = usr_cfg->libbpf_flags; + cfg->xdp_flags = usr_cfg->xdp_flags; + cfg->bind_flags = usr_cfg->bind_flags; + + return 0; +} + +static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off) +{ + struct xdp_mmap_offsets_v1 off_v1; + + /* getsockopt on a kernel <= 5.3 has no flags fields. + * Copy over the offsets to the correct places in the >=5.4 format + * and put the flags where they would have been on that kernel. 
+ */ + memcpy(&off_v1, off, sizeof(off_v1)); + + off->rx.producer = off_v1.rx.producer; + off->rx.consumer = off_v1.rx.consumer; + off->rx.desc = off_v1.rx.desc; + off->rx.flags = off_v1.rx.consumer + sizeof(__u32); + + off->tx.producer = off_v1.tx.producer; + off->tx.consumer = off_v1.tx.consumer; + off->tx.desc = off_v1.tx.desc; + off->tx.flags = off_v1.tx.consumer + sizeof(__u32); + + off->fr.producer = off_v1.fr.producer; + off->fr.consumer = off_v1.fr.consumer; + off->fr.desc = off_v1.fr.desc; + off->fr.flags = off_v1.fr.consumer + sizeof(__u32); + + off->cr.producer = off_v1.cr.producer; + off->cr.consumer = off_v1.cr.consumer; + off->cr.desc = off_v1.cr.desc; + off->cr.flags = off_v1.cr.consumer + sizeof(__u32); +} + +static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off) +{ + socklen_t optlen; + int err; + + optlen = sizeof(*off); + err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen); + if (err) + return err; + + if (optlen == sizeof(*off)) + return 0; + + if (optlen == sizeof(struct xdp_mmap_offsets_v1)) { + xsk_mmap_offsets_v1(off); + return 0; + } + + return -EINVAL; +} + +static int xsk_create_umem_rings(struct xsk_umem *umem, int fd, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp) +{ + struct xdp_mmap_offsets off; + void *map; + int err; + + err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, + &umem->config.fill_size, + sizeof(umem->config.fill_size)); + if (err) + return -errno; + + err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, + &umem->config.comp_size, + sizeof(umem->config.comp_size)); + if (err) + return -errno; + + err = xsk_get_mmap_offsets(fd, &off); + if (err) + return -errno; + + map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + XDP_UMEM_PGOFF_FILL_RING); + if (map == MAP_FAILED) + return -errno; + + fill->mask = umem->config.fill_size - 1; + fill->size = umem->config.fill_size; + fill->producer = map + 
off.fr.producer; + fill->consumer = map + off.fr.consumer; + fill->flags = map + off.fr.flags; + fill->ring = map + off.fr.desc; + fill->cached_cons = umem->config.fill_size; + + map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + XDP_UMEM_PGOFF_COMPLETION_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap; + } + + comp->mask = umem->config.comp_size - 1; + comp->size = umem->config.comp_size; + comp->producer = map + off.cr.producer; + comp->consumer = map + off.cr.consumer; + comp->flags = map + off.cr.flags; + comp->ring = map + off.cr.desc; + + return 0; + +out_mmap: + munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64)); + return err; +} + +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, + __u64 size, struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *usr_config) +{ + struct xdp_umem_reg mr; + struct xsk_umem *umem; + int err; + + if (!umem_area || !umem_ptr || !fill || !comp) + return -EFAULT; + if (!size && !xsk_page_aligned(umem_area)) + return -EINVAL; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + return -ENOMEM; + + umem->fd = socket(AF_XDP, SOCK_RAW, 0); + if (umem->fd < 0) { + err = -errno; + goto out_umem_alloc; + } + + umem->umem_area = umem_area; + INIT_LIST_HEAD(&umem->ctx_list); + xsk_set_umem_config(&umem->config, usr_config); + + memset(&mr, 0, sizeof(mr)); + mr.addr = (uintptr_t)umem_area; + mr.len = size; + mr.chunk_size = umem->config.frame_size; + mr.headroom = umem->config.frame_headroom; + mr.flags = umem->config.flags; + + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); + if (err) { + err = -errno; + goto out_socket; + } + + err = xsk_create_umem_rings(umem, umem->fd, fill, comp); + if (err) + goto out_socket; + + umem->fill_save = fill; + umem->comp_save = comp; + *umem_ptr = umem; + return 0; + +out_socket: + close(umem->fd); +out_umem_alloc: + 
free(umem); + return err; +} + +static int xsk_init_xsk_struct(struct xsk_socket *xsk, int ifindex) +{ + char ifname[IFNAMSIZ]; + struct xsk_ctx *ctx; + char *interface; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + interface = if_indextoname(ifindex, &ifname[0]); + if (!interface) { + free(ctx); + return -errno; + } + + ctx->ifindex = ifindex; + memcpy(ctx->ifname, ifname, IFNAMSIZ -1); + ctx->ifname[IFNAMSIZ - 1] = 0; + + xsk->ctx = ctx; + + return 0; +} + +static enum xdp_attach_mode xsk_convert_xdp_flags(__u32 xdp_flags) +{ + if (xdp_flags & ~XDP_FLAGS_MASK) + pr_warn("XDP flag: 0x%x contains flags not supported by libxdp.\n", xdp_flags); + + if (xdp_flags & XDP_FLAGS_SKB_MODE) + return XDP_MODE_SKB; + if (xdp_flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_NATIVE; + if (xdp_flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + + return XDP_MODE_NATIVE; +} + +#define MAX_DEV_QUEUE_PATH_LEN 64 + +static void xsk_get_queues_from_sysfs(const char* ifname, __u32 *rx, __u32 *tx) { + char buf[MAX_DEV_QUEUE_PATH_LEN]; + struct dirent *entry; + DIR *dir; + int err; + + *rx = *tx = 0; + + err = try_snprintf(buf, MAX_DEV_QUEUE_PATH_LEN, + "/sys/class/net/%s/queues/", ifname); + if (err) + return; + + dir = opendir(buf); + if(dir == NULL) + return; + + while((entry = readdir(dir))) { + if (0 == strncmp(entry->d_name, "rx", 2)) + ++*rx; + + if (0 == strncmp(entry->d_name, "tx", 2)) + ++*tx; + } + + closedir(dir); +} + +static int xsk_get_max_queues(char *ifname) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + struct ifreq ifr = {}; + int fd, err, ret; + + fd = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + ifr.ifr_data = (void *)&channels; + memcpy(ifr.ifr_name, ifname, IFNAMSIZ - 1); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + err = ioctl(fd, SIOCETHTOOL, &ifr); + if (err && errno != EOPNOTSUPP) { + ret = -errno; + goto out; + } + + if (err) { + /* If the device says it has no channels, + * try to get rx tx from 
sysfs, otherwise all traffic + * is sent to a single stream, so max queues = 1. + */ + __u32 rx, tx; + xsk_get_queues_from_sysfs(ifr.ifr_name, &rx, &tx); + ret = max(max(rx, tx), 1); + } else { + /* Take the max of rx, tx, combined. Drivers return + * the number of channels in different ways. + */ + ret = max(channels.max_rx, channels.max_tx); + ret = max(ret, (int)channels.max_combined); + } + +out: + close(fd); + return ret; +} + +static int xsk_size_map(struct xdp_program *xdp_prog, char *ifname) +{ + struct bpf_object *bpf_obj = xdp_program__bpf_obj(xdp_prog); + struct bpf_map *map; + int max_queues; + int err; + + max_queues = xsk_get_max_queues(ifname); + if (max_queues < 0) + return max_queues; + + map = bpf_object__find_map_by_name(bpf_obj, "xsks_map"); + if (!map) + return -ENOENT; + + err = bpf_map__set_max_entries(map, max_queues); + if (err) + return err; + + return 0; +} + +static void xsk_delete_map_entry(int xsks_map_fd, __u32 queue_id) +{ + bpf_map_delete_elem(xsks_map_fd, &queue_id); + close(xsks_map_fd); +} + +static int xsk_lookup_map_by_filter(int prog_fd, + bool (*map_info_filter)(struct bpf_map_info *map_info)) +{ + __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); + __u32 map_len = sizeof(struct bpf_map_info); + struct bpf_prog_info prog_info = {}; + int fd, err, xsks_map_fd = -ENOENT; + struct bpf_map_info map_info; + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); + if (err) + return err; + + num_maps = prog_info.nr_map_ids; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); + if (!map_ids) + return -ENOMEM; + + memset(&prog_info, 0, prog_len); + prog_info.nr_map_ids = num_maps; + prog_info.map_ids = (__u64)(unsigned long)map_ids; + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); + if (err) { + free(map_ids); + return err; + } + + for (i = 0; i < prog_info.nr_map_ids; i++) { + fd = bpf_map_get_fd_by_id(map_ids[i]); + if (fd < 0) + continue; + + memset(&map_info, 0, map_len); + 
err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); + if (err) { + close(fd); + continue; + } + + if (map_info_filter(&map_info)) { + xsks_map_fd = fd; + break; + } + + close(fd); + } + + free(map_ids); + return xsks_map_fd; +} + +static bool xsk_map_is_socket_map(struct bpf_map_info *map_info) +{ + return !strncmp(map_info->name, "xsks_map", sizeof(map_info->name)) && + map_info->key_size == 4 && map_info->value_size == 4; +} + +static bool xsk_map_is_refcnt_map(struct bpf_map_info *map_info) +{ + /* In order to avoid confusing users with multiple identically named + * maps, libbpf names non-custom internal maps (.data, .bss, etc.) + * in an unexpected way, namely the first 8 characters of a bpf object + * name + a suffix signifying the internal map type, + * ex. "xdp_def_" + ".data". + */ + return !strncmp(map_info->name, "xsk_def_.data", + sizeof(map_info->name)) && + map_info->value_size >= sizeof(int); +} + +static int xsk_lookup_bpf_map(int prog_fd) +{ + return xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_socket_map); +} + +static int xsk_lookup_refcnt_map(int prog_fd, const char *xdp_filename) +{ + int map_fd = xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_refcnt_map); + + if (map_fd >= 0) + goto out; + + if (map_fd != -ENOENT) { + pr_debug("Error getting refcount map: %s\n", strerror(-map_fd)); + goto out; + } + + if (xdp_filename) + pr_warn("Refcount was not found in %s or kernel does not support required features, so automatic program removal on unload is disabled\n", + xdp_filename); + else + pr_warn("Another XSK socket was created by a version of libxdp that doesn't support program refcnt, so automatic program removal on unload is disabled.\n"); +out: + return map_fd; +} + +#ifdef HAVE_LIBBPF_BPF_MAP_CREATE +/* bpf_map_create() and the new bpf_prog_create() were added at the same time - + * however there's a naming conflict with another bpf_prog_load() function in + * older versions of libbpf; to avoid hitting that we create our own wrapper + * 
function for this one even with new libbpf versions. + */ +static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt) +{ + return bpf_prog_load(BPF_PROG_TYPE_XDP, "testprog", + "GPL", insns, insns_cnt, NULL); +} +#else +static int bpf_map_create(enum bpf_map_type map_type, + __unused const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + __unused void *opts) +{ + struct bpf_create_map_attr map_attr; + + memset(&map_attr, 0, sizeof(map_attr)); + map_attr.map_type = map_type; + map_attr.key_size = key_size; + map_attr.value_size = value_size; + map_attr.max_entries = max_entries; + + return bpf_create_map_xattr(&map_attr); +} + +static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt) +{ + struct bpf_load_program_attr prog_attr; + + memset(&prog_attr, 0, sizeof(prog_attr)); + prog_attr.prog_type = BPF_PROG_TYPE_XDP; + prog_attr.insns = insns; + prog_attr.insns_cnt = insns_cnt; + prog_attr.license = "GPL"; + + return bpf_load_program_xattr(&prog_attr, NULL, 0); +} +#endif + +static bool xsk_check_redirect_flags(void) +{ + char data_in = 0, data_out; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data_in, + .data_out = &data_out, + .data_size_in = 1); + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd, ret; + bool detected = false; + + map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xskmap", + sizeof(int), sizeof(int), 1, NULL); + if (map_fd < 0) + return detected; + + insns[0].imm = map_fd; + + prog_fd = xsk_check_create_prog(insns, ARRAY_SIZE(insns)); + if (prog_fd < 0) { + close(map_fd); + return detected; + } + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + if (!ret && opts.retval == XDP_PASS) + detected = true; + close(prog_fd); + close(map_fd); + return detected; +} + +static struct xdp_program *xsk_lookup_program(int 
ifindex) +{ + const char *version_name = "xsk_prog_version"; + const char *prog_name = "xsk_def_prog"; + struct xdp_multiprog *multi_prog; + struct xdp_program *prog = NULL; + __u32 version; + int err; + + multi_prog = xdp_multiprog__get_from_ifindex(ifindex); + if (IS_ERR(multi_prog)) + return NULL; + + if (xdp_multiprog__is_legacy(multi_prog)) { + prog = xdp_multiprog__main_prog(multi_prog); + prog = strcmp(xdp_program__name(prog), prog_name) ? NULL : prog; + goto check; + } + + while ((prog = xdp_multiprog__next_prog(prog, multi_prog))) + if (!strcmp(xdp_program__name(prog), prog_name)) + break; + +check: + if (!prog) + goto out; + + err = check_xdp_prog_version(xdp_program__btf(prog), version_name, &version); + if (err) { + prog = ERR_PTR(err); + goto out; + } + if (version > XSK_PROG_VERSION) { + pr_warn("XSK default program version %d higher than supported %d\n", version, + XSK_PROG_VERSION); + prog = ERR_PTR(-EOPNOTSUPP); + } + +out: + if (!IS_ERR_OR_NULL(prog)) + prog = xdp_program__clone(prog, 0); + + xdp_multiprog__close(multi_prog); + return prog; +} + +static int xsk_update_prog_refcnt(int refcnt_map_fd, int delta) +{ + struct bpf_map_info map_info = {}; + __u32 info_len = sizeof(map_info); + int *value_data = NULL; + int lock_fd, ret; + __u32 key = 0; + + ret = bpf_obj_get_info_by_fd(refcnt_map_fd, &map_info, &info_len); + if (ret) + return ret; + + value_data = calloc(1, map_info.value_size); + if (!value_data) + return -ENOMEM; + + lock_fd = xdp_lock_acquire(); + if (lock_fd < 0) { + ret = lock_fd; + goto out; + } + + /* Note, if other global variables are added before the refcnt, + * this changes map's value type, not number of elements, + * so additional offset must be applied to value_data, + * when reading refcount, but map key always stays zero + */ + ret = bpf_map_lookup_elem(refcnt_map_fd, &key, value_data); + if (ret) + goto unlock; + + /* If refcount is 0, program is awaiting detach and can't be used */ + if (*value_data) { + *value_data += 
delta; + ret = bpf_map_update_elem(refcnt_map_fd, &key, value_data, 0); + if (ret) + goto unlock; + } + + ret = *value_data; +unlock: + xdp_lock_release(lock_fd); +out: + free(value_data); + return ret; +} + +static int xsk_incr_prog_refcnt(int refcnt_map_fd) +{ + return xsk_update_prog_refcnt(refcnt_map_fd, 1); +} + +static int xsk_decr_prog_refcnt(int refcnt_map_fd) +{ + return xsk_update_prog_refcnt(refcnt_map_fd, -1); +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *xsk, int *xsks_map_fd) +{ + const char *fallback_prog = "xsk_def_xdp_prog_5.3.o"; + const char *default_prog = "xsk_def_xdp_prog.o"; + struct xsk_ctx *ctx = xsk->ctx; + const char *file_name = NULL; + bool attached = false; + int err; + + ctx->xdp_prog = xsk_lookup_program(ctx->ifindex); + if (IS_ERR(ctx->xdp_prog)) + return PTR_ERR(ctx->xdp_prog); + + ctx->refcnt_map_fd = -ENOENT; + + if (ctx->xdp_prog) { + int refcnt; + + ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog), NULL); + if (ctx->refcnt_map_fd == -ENOENT) + goto map_lookup; + + if (ctx->refcnt_map_fd < 0) { + err = ctx->refcnt_map_fd; + goto err_prog_load; + } + + refcnt = xsk_incr_prog_refcnt(ctx->refcnt_map_fd); + if (refcnt < 0) { + err = refcnt; + pr_debug("Error occurred when incrementing xsk XDP prog refcount: %s\n", + strerror(-err)); + goto err_prog_load; + } + + if (!refcnt) { + pr_warn("Current program is being detached, falling back on creating a new program\n"); + close(ctx->refcnt_map_fd); + ctx->refcnt_map_fd = -ENOENT; + xdp_program__close(ctx->xdp_prog); + ctx->xdp_prog = NULL; + } + } + + if (!ctx->xdp_prog) { + file_name = xsk_check_redirect_flags() ? 
default_prog : fallback_prog; + ctx->xdp_prog = xdp_program__find_file(file_name, NULL, NULL); + if (IS_ERR(ctx->xdp_prog)) + return PTR_ERR(ctx->xdp_prog); + + err = xsk_size_map(ctx->xdp_prog, ctx->ifname); + if (err) + goto err_prog_load; + + err = xdp_program__attach(ctx->xdp_prog, ctx->ifindex, + xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); + if (err) + goto err_prog_load; + + attached = true; + } + + if (ctx->refcnt_map_fd < 0) { + ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog), + file_name); + if (ctx->refcnt_map_fd < 0 && ctx->refcnt_map_fd != -ENOENT) { + err = ctx->refcnt_map_fd; + goto err_prog_load; + } + } +map_lookup: + ctx->xsks_map_fd = xsk_lookup_bpf_map(xdp_program__fd(ctx->xdp_prog)); + if (ctx->xsks_map_fd < 0) { + err = ctx->xsks_map_fd; + goto err_lookup; + } + + if (xsk->rx) { + err = bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, &xsk->fd, 0); + if (err) + goto err_lookup; + } + if (xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; + + return 0; + +err_lookup: + if (attached) + xdp_program__detach(ctx->xdp_prog, ctx->ifindex, + xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); +err_prog_load: + if (ctx->refcnt_map_fd >= 0) + close(ctx->refcnt_map_fd); + ctx->refcnt_map_fd = -ENOENT; + xdp_program__close(ctx->xdp_prog); + ctx->xdp_prog = NULL; + return err; +} +/* Find the context on umem's list matching (netns_cookie, ifindex, queue_id) and take a reference on it; NULL if there is none. */ +static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, __u64 netns_cookie, int ifindex, __u32 queue_id) +{ + struct xsk_ctx *ctx; + + if (list_empty(&umem->ctx_list)) + return NULL; + + list_for_each_entry(ctx, &umem->ctx_list, list) { + if (ctx->netns_cookie == netns_cookie && ctx->ifindex == ifindex && ctx->queue_id == queue_id) { + ctx->refcount++; + return ctx; + } + } + + return NULL; +} +/* Drop one reference on ctx. On the last one, munmap the fill/completion rings when unmap is set and the mmap offsets can be read, then unlink and free ctx. */ +static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) +{ + struct xsk_umem *umem = ctx->umem; + struct xdp_mmap_offsets off; + int err; + + if (--ctx->refcount) + return; + + if (!unmap) + goto out_free; + + err = xsk_get_mmap_offsets(umem->fd, 
&off); + if (err) + goto out_free; + + munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * + sizeof(__u64)); + munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * + sizeof(__u64)); + +out_free: + list_del(&ctx->list); + free(ctx); +} +/* Allocate a context for (netns_cookie, ifindex, queue_id), create or copy the fill/completion rings, and link it onto umem's list with refcount 1. Returns NULL on failure. */ +static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, + struct xsk_umem *umem, __u64 netns_cookie, int ifindex, + const char *ifname, __u32 queue_id, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp) +{ + struct xsk_ctx *ctx; + int err; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return NULL; + + if (!umem->fill_save) { + err = xsk_create_umem_rings(umem, xsk->fd, fill, comp); + if (err) { + free(ctx); + return NULL; + } + } else if (umem->fill_save != fill || umem->comp_save != comp) { + /* Copy over rings to new structs. */ + memcpy(fill, umem->fill_save, sizeof(*fill)); + memcpy(comp, umem->comp_save, sizeof(*comp)); + } + + ctx->netns_cookie = netns_cookie; + ctx->ifindex = ifindex; + ctx->refcount = 1; + ctx->umem = umem; + ctx->queue_id = queue_id; + /* ifname may be shorter than IFNAMSIZ - 1 bytes: copy it as a string so we never read past its terminator. */ + strncpy(ctx->ifname, ifname, IFNAMSIZ - 1); + ctx->ifname[IFNAMSIZ - 1] = '\0'; + + ctx->fill = fill; + ctx->comp = comp; + list_add(&ctx->list, &umem->ctx_list); + return ctx; +} +/* Free a socket struct together with the context it points to. */ +static void xsk_destroy_xsk_struct(struct xsk_socket *xsk) +{ + free(xsk->ctx); + free(xsk); +} +/* Record fd as the context's xsks map and insert this socket's fd into it at the context's queue id. */ +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) +{ + struct xsk_ctx *ctx = xsk->ctx; + + ctx->xsks_map_fd = fd; + return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, &xsk->fd, 0); +} +/* Run __xsk_setup_xdp_prog() for ifindex on a temporary socket struct, passing xsks_map_fd through; returns -ENOMEM, -EINVAL if the struct cannot be initialised, or the setup result. */ +int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) +{ + struct xsk_socket *xsk; + int res; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + res = xsk_init_xsk_struct(xsk, ifindex); + if (res) { + free(xsk); + return -EINVAL; + } + + res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); + + xsk_destroy_xsk_struct(xsk); + + return res; +} +/* Create an AF_XDP socket on ifname/queue_id that uses umem. + * The first socket on a umem reuses umem->fd; later ones open a new + * AF_XDP socket and bind with XDP_SHARED_UMEM. At least one of rx and + * tx must be given; fill and comp are required when no context exists + * yet for this netns/ifindex/queue. Returns 0 and stores the socket in + * *xsk_ptr on success, or a negative error code. + */ +int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, + const char 
*ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_socket_config *usr_config) +{ + bool rx_setup_done = false, tx_setup_done = false; + void *rx_map = NULL, *tx_map = NULL; + struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; + struct xsk_socket *xsk; + struct xsk_ctx *ctx; + int err, ifindex; + __u64 netns_cookie; + socklen_t optlen; + bool unmap; + + /* ifname is handed to if_nametoindex() below, so reject NULL like the other required pointers. */ + if (!umem || !xsk_ptr || !ifname || !(rx || tx)) + return -EFAULT; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + err = xsk_set_xdp_socket_config(&xsk->config, usr_config); + if (err) + goto out_xsk_alloc; + + ifindex = if_nametoindex(ifname); + if (!ifindex) { + err = -errno; + goto out_xsk_alloc; + } + + if (umem->refcount++ > 0) { + xsk->fd = socket(AF_XDP, SOCK_RAW, 0); + if (xsk->fd < 0) { + err = -errno; + /* Undo the reference taken above; out_xsk_alloc does not drop it. */ + umem->refcount--; + goto out_xsk_alloc; + } + } else { + xsk->fd = umem->fd; + rx_setup_done = umem->rx_ring_setup_done; + tx_setup_done = umem->tx_ring_setup_done; + } + + optlen = sizeof(netns_cookie); + err = getsockopt(xsk->fd, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + if (err) { + if (errno != ENOPROTOOPT) { + err = -errno; + goto out_socket; + } + netns_cookie = INIT_NS; + } + + ctx = xsk_get_ctx(umem, netns_cookie, ifindex, queue_id); + if (!ctx) { + if (!fill || !comp) { + err = -EFAULT; + goto out_socket; + } + + ctx = xsk_create_ctx(xsk, umem, netns_cookie, ifindex, ifname, queue_id, + fill, comp); + if (!ctx) { + err = -ENOMEM; + goto out_socket; + } + } + xsk->ctx = ctx; + + if (rx && !rx_setup_done) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, + &xsk->config.rx_size, + sizeof(xsk->config.rx_size)); + if (err) { + err = -errno; + goto out_put_ctx; + } + if (xsk->fd == umem->fd) + umem->rx_ring_setup_done = true; + + } + if (tx && !tx_setup_done) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, + &xsk->config.tx_size, + 
sizeof(xsk->config.tx_size)); + if (err) { + err = -errno; + goto out_put_ctx; + } + if (xsk->fd == umem->fd) + umem->tx_ring_setup_done = true; + } + + err = xsk_get_mmap_offsets(xsk->fd, &off); + if (err) { + err = -errno; + goto out_put_ctx; + } + + if (rx) { + rx_map = mmap(NULL, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_RX_RING); + if (rx_map == MAP_FAILED) { + err = -errno; + goto out_put_ctx; + } + + rx->mask = xsk->config.rx_size - 1; + rx->size = xsk->config.rx_size; + rx->producer = rx_map + off.rx.producer; + rx->consumer = rx_map + off.rx.consumer; + rx->flags = rx_map + off.rx.flags; + rx->ring = rx_map + off.rx.desc; + rx->cached_prod = *rx->producer; + rx->cached_cons = *rx->consumer; + } + xsk->rx = rx; + + if (tx) { + tx_map = mmap(NULL, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_TX_RING); + if (tx_map == MAP_FAILED) { + err = -errno; + goto out_mmap_rx; + } + + tx->mask = xsk->config.tx_size - 1; + tx->size = xsk->config.tx_size; + tx->producer = tx_map + off.tx.producer; + tx->consumer = tx_map + off.tx.consumer; + tx->flags = tx_map + off.tx.flags; + tx->ring = tx_map + off.tx.desc; + tx->cached_prod = *tx->producer; + /* cached_cons is r->size bigger than the real consumer pointer + * See xsk_prod_nb_free + */ + tx->cached_cons = *tx->consumer + xsk->config.tx_size; + } + xsk->tx = tx; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = ctx->ifindex; + sxdp.sxdp_queue_id = ctx->queue_id; + if (umem->refcount > 1) { + sxdp.sxdp_flags |= XDP_SHARED_UMEM; + sxdp.sxdp_shared_umem_fd = umem->fd; + } else { + sxdp.sxdp_flags = xsk->config.bind_flags; + } + + err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); + if (err) { + err = -errno; + goto out_mmap_tx; + } + + if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { + err = 
__xsk_setup_xdp_prog(xsk, NULL); + if (err) + goto out_mmap_tx; + } + + *xsk_ptr = xsk; + umem->fill_save = NULL; + umem->comp_save = NULL; + return 0; + +out_mmap_tx: + if (tx) + munmap(tx_map, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); +out_mmap_rx: + if (rx) + munmap(rx_map, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); +out_put_ctx: + unmap = umem->fill_save != fill; + xsk_put_ctx(ctx, unmap); +out_socket: + if (--umem->refcount) + close(xsk->fd); +out_xsk_alloc: + free(xsk); + return err; +} +/* Wrapper around xsk_socket__create_shared() that passes the umem's saved fill and completion rings. */ +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, + const struct xsk_socket_config *usr_config) +{ + if (!umem) + return -EFAULT; + + return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, + rx, tx, umem->fill_save, + umem->comp_save, usr_config); +} +/* Destroy umem: a NULL umem is a no-op, -EBUSY is returned while its refcount is non-zero; otherwise any saved rings are unmapped, the fd is closed and the struct freed. */ +int xsk_umem__delete(struct xsk_umem *umem) +{ + struct xdp_mmap_offsets off; + int err; + + if (!umem) + return 0; + + if (umem->refcount) + return -EBUSY; + + err = xsk_get_mmap_offsets(umem->fd, &off); + if (!err && umem->fill_save && umem->comp_save) { + munmap(umem->fill_save->ring - off.fr.desc, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); + munmap(umem->comp_save->ring - off.cr.desc, + off.cr.desc + umem->config.comp_size * sizeof(__u64)); + } + + close(umem->fd); + free(umem); + + return 0; +} +/* If a refcount map is present, decrement the program refcount and detach the program once it reaches zero; the program handle is closed in every case. */ +static void xsk_release_xdp_prog(struct xsk_socket *xsk) +{ + struct xsk_ctx *ctx = xsk->ctx; + int value; + + if (xsk->ctx->refcnt_map_fd < 0) + goto out; + + value = xsk_decr_prog_refcnt(ctx->refcnt_map_fd); + if (value < 0) + pr_warn("Error occurred when decrementing xsk XDP prog refcount: %s, please detach program yourself\n", + strerror(-value)); + if (value) + goto out; + + xdp_program__detach(ctx->xdp_prog, ctx->ifindex, + xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); +out: + xdp_program__close(ctx->xdp_prog); +} +/* Tear down xsk: drop its xsks map entry and release the XDP program if one is set, unmap the rx/tx rings, put the context and umem references, and close the fd unless it is the umem's own. */ +void 
xsk_socket__delete(struct xsk_socket *xsk) +{ + size_t desc_sz = sizeof(struct xdp_desc); + struct xdp_mmap_offsets off; + struct xsk_umem *umem; + struct xsk_ctx *ctx; + int err; + + if (!xsk) + return; + + ctx = xsk->ctx; + umem = ctx->umem; + if (ctx->xdp_prog) { + xsk_delete_map_entry(ctx->xsks_map_fd, ctx->queue_id); + xsk_release_xdp_prog(xsk); + } + + err = xsk_get_mmap_offsets(xsk->fd, &off); + if (!err) { + if (xsk->rx) { + munmap(xsk->rx->ring - off.rx.desc, + off.rx.desc + xsk->config.rx_size * desc_sz); + } + if (xsk->tx) { + munmap(xsk->tx->ring - off.tx.desc, + off.tx.desc + xsk->config.tx_size * desc_sz); + } + } + + xsk_put_ctx(ctx, true); + + umem->refcount--; + /* Do not close an fd that also has an associated umem connected + * to it. + */ + if (xsk->fd != umem->fd) + close(xsk->fd); + free(xsk); +} diff --git a/lib/libxdp/xsk_def_xdp_prog.c b/lib/libxdp/xsk_def_xdp_prog.c new file mode 100644 index 0000000..801ad12 --- /dev/null +++ b/lib/libxdp/xsk_def_xdp_prog.c @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +#include "xsk_def_xdp_prog.h" + +#define DEFAULT_QUEUE_IDS 64 + +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, DEFAULT_QUEUE_IDS); +} xsks_map SEC(".maps"); + +struct { + __uint(priority, 20); + __uint(XDP_PASS, 1); +} XDP_RUN_CONFIG(xsk_def_prog); + +/* Program refcount, in order to work properly, + * must be declared before any other global variables + * and initialized with '1'. + */ +volatile int refcnt = 1; + +/* This is the program for post 5.3 kernels. */ +SEC("xdp") +int xsk_def_prog(struct xdp_md *ctx) +{ + /* Make sure refcount is referenced by the program */ + if (!refcnt) + return XDP_PASS; + + /* A set entry here means that the corresponding queue_id + * has an active AF_XDP socket bound to it. 
+ */ + return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS); +} + +char _license[] SEC("license") = "GPL"; +__uint(xsk_prog_version, XSK_PROG_VERSION) SEC(XDP_METADATA_SECTION); diff --git a/lib/libxdp/xsk_def_xdp_prog.h b/lib/libxdp/xsk_def_xdp_prog.h new file mode 100644 index 0000000..b51883d --- /dev/null +++ b/lib/libxdp/xsk_def_xdp_prog.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +#ifndef __LIBXDP_XSK_DEF_XDP_PROG_H +#define __LIBXDP_XSK_DEF_XDP_PROG_H + +#define XDP_METADATA_SECTION "xdp_metadata" +#define XSK_PROG_VERSION 1 + +#endif /* __LIBXDP_XSK_DEF_XDP_PROG_H */ diff --git a/lib/libxdp/xsk_def_xdp_prog_5.3.c b/lib/libxdp/xsk_def_xdp_prog_5.3.c new file mode 100644 index 0000000..7973477 --- /dev/null +++ b/lib/libxdp/xsk_def_xdp_prog_5.3.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +#include "xsk_def_xdp_prog.h" + +#define DEFAULT_QUEUE_IDS 64 + +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, DEFAULT_QUEUE_IDS); +} xsks_map SEC(".maps"); + +struct { + __uint(priority, 20); + __uint(XDP_PASS, 1); +} XDP_RUN_CONFIG(xsk_def_prog); + +/* Program refcount, in order to work properly, + * must be declared before any other global variables + * and initialized with '1'. + */ +volatile int refcnt = 1; + +/* This is the program for 5.3 kernels and older. */ +SEC("xdp") +int xsk_def_prog(struct xdp_md *ctx) +{ + int index = ctx->rx_queue_index; + + /* Make sure refcount is referenced by the program */ + if (!refcnt) + return XDP_PASS; + + /* A set entry here means that the corresponding queue_id + * has an active AF_XDP socket bound to it. 
+ */ + if (bpf_map_lookup_elem(&xsks_map, &index)) + return bpf_redirect_map(&xsks_map, index, 0); + return XDP_PASS; + +} + +char _license[] SEC("license") = "GPL"; +__uint(xsk_prog_version, XSK_PROG_VERSION) SEC(XDP_METADATA_SECTION); |