diff options
Diffstat (limited to 'lib/libxdp/protocol.org')
-rw-r--r-- | lib/libxdp/protocol.org | 473 |
1 files changed, 473 insertions, 0 deletions
diff --git a/lib/libxdp/protocol.org b/lib/libxdp/protocol.org new file mode 100644 index 0000000..2adaf6a --- /dev/null +++ b/lib/libxdp/protocol.org @@ -0,0 +1,473 @@ +#+OPTIONS: ^:nil + +* Protocol for atomic loading of multi-prog dispatchers + +With the support for the =freplace= program type, it is possible to load +multiple XDP programs on a single interface by building a /dispatcher/ program +which will run on the interface, and which will call the component XDP programs +as functions using the =freplace= type. + +For this to work in an interoperable way, applications need to agree on how to +attach their XDP programs using this mechanism. This document outlines the +protocol implemented by =libxdp=, serving as both documentation and a blueprint +for anyone else who wants to implement the same protocol and interoperate. + +** Generating a dispatcher +The dispatcher is simply an XDP program that will call each of a number of stub +functions in turn, and depending on their return code either continue on to the +next function or return immediately. These stub functions are then replaced at +load time with the user XDP programs, using the =freplace= functionality. + +*** Dispatcher format +The dispatcher XDP program contains the main function containing the dispatcher +logic, 10 stub functions that can be replaced by component BPF programs, and a +configuration structure that is used by the dispatcher logic. 
+ +In =libxdp=, this dispatcher is generated by [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/xdp-dispatcher.c.in][an M4 macro file]] which expands to +the following: + +#+begin_src C +#define XDP_METADATA_SECTION "xdp_metadata" +#define XDP_DISPATCHER_VERSION 2 +#define XDP_DISPATCHER_MAGIC 236 +#define XDP_DISPATCHER_RETVAL 31 +#define MAX_DISPATCHER_ACTIONS 10 + +struct xdp_dispatcher_config { + __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ + __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ + __u8 num_progs_enabled; /* Number of active program slots */ + __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ + __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; + __u32 run_prios[MAX_DISPATCHER_ACTIONS]; + __u32 program_flags[MAX_DISPATCHER_ACTIONS]; +}; + +/* While 'const volatile' sounds a little like an oxymoron, there's reason + * behind the madness: + * + * - const places the data in rodata, where libbpf will mark it as read-only and + * frozen on program load, letting the kernel do dead code elimination based + * on the values. + * + * - volatile prevents the compiler from optimising away the checks based on the + * compile-time value of the variables, which is important since we will be + * changing the values before loading the program into the kernel. + */ +static volatile const struct xdp_dispatcher_config conf = {}; + +/* The volatile return value prevents the compiler from assuming it knows the + * return value and optimising based on that. 
+ */ +__attribute__ ((noinline)) +int prog0(struct xdp_md *ctx) { + volatile int ret = XDP_DISPATCHER_RETVAL; + + if (!ctx) + return XDP_ABORTED; + return ret; +} +/* the above is repeated as prog1...prog9 */ + +SEC("xdp") +int xdp_dispatcher(struct xdp_md *ctx) +{ + __u8 num_progs_enabled = conf.num_progs_enabled; + int ret; + + if (num_progs_enabled < 1) + goto out; + ret = prog0(ctx); + if (!((1U << ret) & conf.chain_call_actions[0])) + return ret; + + /* the above is repeated for prog1...prog9 */ + +out: + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; +__uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION); +#+end_src + +The dispatcher program is pre-compiled and distributed with =libxdp=. Because +the configuration struct is marked as =const= in the source file, it will be put +into the =rodata=, which libbpf will turn into a read-only (frozen) map on load. +This allows the kernel verifier to perform dead code elimination based on the +values in the map. This is also the reason for the =num_progs_enabled= member of +the config struct: together with the checks in the main dispatcher function the +verifier will effectively remove all the stub function calls not being used, +without having to rely on dynamic compilation. + +When generating a dispatcher, this BPF object file is opened and the +configuration struct is populated before the object is loaded. As a forward +compatibility measure, =libxdp= will also check for the presence of the +=dispatcher_version= field in the =xdp_metadata= section (encoded like the +program metadata described in "Processing program metadata" below), and if it +doesn't match the expected version (currently version 2), will abort any action. + + +*** Populating the dispatcher configuration map +On loading, the dispatcher configuration map is populated as follows: + +- The =magic= field is set to the =XDP_DISPATCHER_MAGIC= value (236). 
This field + is here to make it possible to check if a program is a dispatcher without + looking at the program BTF in the future. + +- The =dispatcher_version= field is set to the current dispatcher version (2). + This is redundant with the BTF-encoded version in the metadata field, but must + be checked so that the BTF metadata version can be removed in the future. See + the section on old dispatcher versions below. + +- The =num_progs_enabled= member is simply set to the number of active programs + that will be attached to this dispatcher. + +- The =is_xdp_frags= variable is set to 1 if the dispatcher is loaded with XDP frags + support (see section below), or 0 otherwise. + +The two other fields contain per-component program metadata, which is read from +the component programs as explained in the "Processing program metadata" section +below. + +- The =chain_call_actions= array is populated with a bitmap signifying which XDP + actions (return codes) of each component program should be interpreted as a + signal to continue execution of the next XDP program. For instance, a packet + filtering program might designate that an =XDP_PASS= action should make + execution continue, while other return codes should immediately end the call + chain and return. The special =XDP_DISPATCHER_RETVAL= (which is set to 31 + corresponding to the topmost bit in the bitmap) is always included in each + program's =chain_call_actions=; this value is returned by the stub functions, + which ensures that should a component program become detached, processing + will always continue past the stub function. + +- The =run_prios= array contains the effective run priority of each component + program when it was installed. This is also read as program metadata, but + because it can be overridden at load time, the effective value is stored in + the configuration array so it can be carried forward when the dispatcher is + replaced. 
Component programs are expected to be sorted in order of their run + priority (as explained below in "Loading and attaching component programs"). + +- The =program_flags= array is used to store the flags that an XDP program was loaded + with. This is populated with the value of the =BPF_F_XDP_HAS_FRAGS= flag if + the component program in this slot had that flag set (see the section on XDP + frags support below), and is 0 otherwise. + +**** Processing program metadata +As explained above, each component program must specify one or more chain call +actions and a run priority on attach. When loading a user program, =libxdp= will +attempt to read this metadata from the object file as explained in the +following; if no values are found in the object file, a default run priority of +50 will be applied, and =XDP_PASS= will be the only chain call action. + +The metadata is read from the object file by looking for BTF-encoded metadata in +the =.xdp_run_config= object section, encoded similarly to the BTF-defined maps +used by libbpf (in the =.maps= section). Here, =libxdp= will look for a struct +definition with the XDP program function name prefixed by an underscore (e.g., +if the main XDP function is called =xdp_main=, libxdp will look for a struct +definition called =_xdp_main=). In this struct, a member =priority= encodes the +run priority, and each XDP action can be set as a chain call action by setting a +struct member with the action name. 
+ +The =xdp_helpers.h= header file included with XDP exposes helper macros that can +be used with the existing helpers in =bpf_helpers.h= (from libbpf), so a full +run configuration metadata section can be defined as follows: + +#+begin_src C +#include <bpf/bpf_helpers.h> +#include <xdp/xdp_helpers.h> + +struct { + __uint(priority, 10); + __uint(XDP_PASS, 1); + __uint(XDP_DROP, 1); +} XDP_RUN_CONFIG(my_xdp_func); +#+end_src + +This example sets priority 10 with chain call actions =XDP_PASS= and =XDP_DROP= +for the XDP program starting at =my_xdp_func()=. + +This turns into the following BTF information (as shown by =bpftool btf dump=): + +#+begin_src +[12] STRUCT '(anon)' size=24 vlen=3 + 'priority' type_id=13 bits_offset=0 + 'XDP_PASS' type_id=15 bits_offset=64 + 'XDP_DROP' type_id=15 bits_offset=128 +[13] PTR '(anon)' type_id=14 +[14] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=10 +[15] PTR '(anon)' type_id=16 +[16] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=1 +[17] VAR '_my_xdp_func' type_id=12, linkage=global-alloc +[18] DATASEC '.xdp_run_config' size=0 vlen=1 + type_id=17 offset=0 size=24 +#+end_src + +The parser will look for the =.xdp_run_config= DATASEC, then follow the types +recursively, extracting the field values from the =nr_elems= in the anonymous +arrays in type IDs 14 and 16. + +While =libxdp= will automatically load any metadata specified as above in the +program BTF, the application using =libxdp= can override these values at +runtime. These overridden values will be the ones used when determining program +order, and will be preserved in the dispatcher configuration map for subsequent +operations. + +*** Old versions of the XDP dispatcher +This document currently describes version 2 of the dispatcher and protocol. This +differs from version 1 in the following respects: + +- The dispatcher configuration map has gained the =magic= and + =dispatcher_version= fields for identifying the dispatcher and its version. 
+ +- The protocol now supports propagating the value of the =BPF_F_XDP_HAS_FRAGS= + field for supporting XDP frags programs for higher MTU. The dispatcher + configuration map has gained the =is_xdp_frags= and =program_flags= fields for + use with this feature. The protocol for propagating the frags field is + described below, and an implementation of this protocol that recognises + version 2 of the dispatcher MUST implement this protocol. + +Older versions of libxdp will check the dispatcher version field of any +dispatcher loaded in the kernel, and refuse to operate on a dispatcher with a +higher version than the library version implements. This means that if a newer +dispatcher is loaded, old versions of the library will be locked out of +modifying that dispatcher. This is by design: old library versions don't +recognise the semantics of new features added in subsequent versions, and so +would introduce bugs if they attempted to operate on newer versions. + +Newer versions of libxdp will, however, recognise older dispatcher versions. If +a newer version of libxdp loads a new program and finds an old dispatcher +version already loaded on an interface, it will display the programs attached to +it, but will refuse to replace it with a newer version so as not to lock out the +program that loaded the program(s) already attached. Manually unloading the +loaded programs will be required to load a new dispatcher version on the +interface. + +*** Loading and attaching component programs +When loading one or more XDP programs onto an interface (assuming no existing +program is found on the interface; for adding programs, see below), =libxdp= +first prepares a dispatcher program with the right number of slots, by +populating the configuration struct as described above. Then, this dispatcher +program is loaded into the kernel, with the =BPF_F_XDP_HAS_FRAGS= flag set if +all component programs have that flag set (see the section on supporting XDP +frags below). 
+ +Having loaded the dispatcher program, =libxdp= then loads each of the component +programs. To do this, first the list of component programs is sorted by their +run priority, forming the final run sequence. Should several programs have the +same run priority, ties are broken in the following arbitrary, but +deterministic, order (see =cmp_xdp_programs()= [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/libxdp.c][in libxdp.c]]): + +- By XDP function name (=bpf_program__name()= from libbpf) +- By sorting already-loaded programs before not-yet-loaded ones +- By unloaded programs by program size +- By loaded program bpf tag value (using =memcmp()=) +- By load time + +Before loading, each component program type is reset to =BPF_PROG_TYPE_EXT= with +an expected attach type of 0, and the =BPF_F_XDP_HAS_FRAGS= is unset (see the +section on supporting frags below). Then, the attachment target is set to the +dispatcher file descriptor and the BTF ID of the stub function to replace (i.e., +the first component program has =prog0()= as its target, and so on). Then the +program is loaded, at which point the kernel will verify the component program's +compatibility with the attach point. + +Having loaded the component program, it is attached to the dispatcher by way of +=bpf_link_create()=, specifying the same target file descriptor and BTF ID used +when loading the program. This will return a link fd, which will be pinned to +prevent the attachment from unravelling when the fd is closed (see "Locking and +pinning" below). + +*** Locking and pinning +To prevent the kernel from detaching any =freplace= program when its last file +description is closed, the programs must be pinned in =bpffs=. This is done in +the =xdp= subdirectory of =bpffs=, which by default means =/sys/fs/bpf/xdp=. If +the =LIBXDP_BPFFS= environment variable is set, this will override the location +of the top-level =bpffs=, and the =xdp= subdirectory will be created beneath +this path. 
+ +The pathnames generated for pinning are the following: + +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference +- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference +- etc, up to ten component programs + +This means that several pin operations have to be performed for each dispatcher +program. Semantically, these are all atomic, so to make sure every consumer of +the hierarchy of pinned files gets a consistent view, locking is needed. This is +implemented by opening the parent directory =/sys/fs/bpf/xdp= with the +=O_DIRECTORY= flag, and obtaining a lock on the resulting file descriptor using +=flock(lock_fd, LOCK_EX)=. + +When creating a new dispatcher program, it will first be fully populated, with +all component programs attached. Then, the programs will be linked in =bpffs= as +specified above, and once this succeeds, the program will be attached to the +interface. If attaching the program fails, the programs will be unpinned again, +and the error returned to the caller. This order ensures atomic attachment to +the interface, without any risk that component programs will be automatically +detached due to a badly timed application crash. + +When loading the initial dispatcher program, the =XDP_FLAGS_UPDATE_IF_NOEXIST= +flag is set to prevent accidentally overriding any concurrent modifications. If +this fails, the whole operation starts over, turning the load into a +modification as described below. 
+ +*** Supporting XDP programs with frags support (BPF_F_XDP_HAS_FRAGS flag) +Linux kernel 5.18 added support for a new API that allows XDP programs to access +packet data that spans more than a single page, allowing XDP programs to be +loaded on interfaces with bigger MTUs. Such packets will not have all their +packet data accessible by the traditional "direct packet access"; instead, only +the first fragment will be available this way, and the rest of the packet data +has to be accessed via the new =bpf_xdp_load_bytes()= helper. + +Existing XDP programs are written with the assumption that they can see the +whole packet data using direct packet access, which means they can subtly +malfunction if some of the packet data is suddenly invisible (for instance, +counting packet lengths is no longer accurate). Whether a given XDP program +supports the frags API or not is a semantic issue, and it's not possible for the +kernel to auto-detect this. For this reason, programs have to opt in to XDP +frags support at load time, by setting the =BPF_F_XDP_HAS_FRAGS= flag as they +are loaded into the kernel. Programs that are not loaded with this flag will be +rejected from attaching to network devices that use packet fragments (i.e., those +with a large MTU). + +This has implications for the XDP dispatcher, as its purpose is for multiple +programs to be loaded at the same time. Since the =BPF_F_XDP_HAS_FRAGS= cannot +be set for individual component programs, it has to be set for the dispatcher as +a whole. However, as described above, programs can subtly malfunction if they +are exposed to packets with fragments without being ready to do so. This means +that it's only safe to set the =BPF_F_XDP_HAS_FRAGS= on the dispatcher itself if +*all* component programs have the flag set. 
+ +To properly propagate the flags even when adding new programs to an existing +dispatcher, the dispatcher itself needs to keep track of which of its component +programs had the =BPF_F_XDP_HAS_FRAGS= flag set when they were added. The +dispatcher configuration map uses the =program_flags= array for this: for each +component program, this field is set to the value of the =BPF_F_XDP_HAS_FRAGS= +flag if that component program has the flag set, and to 0 otherwise. An +additional field, =is_xdp_frags=, is set if the dispatcher itself is loaded with +the frags field set (which may not be the case if the kernel doesn't support the +flag). + +When generating a dispatcher for a set of programs, libxdp simply tracks if all +component programs support the =BPF_F_XDP_HAS_FRAGS=, and if they do, the +dispatcher is loaded with this flag set. If any program attached to the +dispatcher does not support the flag, the dispatcher is loaded without this flag +set (and the =is_xdp_frags= field in the dispatcher configuration is set +accordingly). If libxdp determines that the running kernel does not support the +=BPF_F_XDP_HAS_FRAGS=, the dispatcher is loaded without the flag regardless of +the value of the component programs. + +When adding a program to an existing dispatcher, this may result in a +"downgrade", i.e., loading a new dispatcher without the frags flag to replace an +existing dispatcher that does have the flag set. This will result in the +replacement dispatcher being rejected by the kernel at attach time, but only if +the interface being attached to actually requires the frags flag (i.e., if it +has a large MTU). If the attachment is rejected, the old dispatcher will stay in +place, leading to no loss of functionality. + +** Adding or removing programs from an existing dispatcher +The sections above explain how to generate a dispatcher and attach it to an +interface, assuming no existing program is attached. 
When one or more programs +are already attached, a couple of extra steps are required to ensure that the +switch is made atomically. + +Briefly, changing the programs attached to an interface entails the following +steps: + +- Reading the existing dispatcher program and obtaining references to the + component programs. + +- Generating a new dispatcher containing the new set of programs (adding or + removing the programs needed). + +- Atomically swapping out the XDP program attachment on the interface so the new + dispatcher takes over from the old one. + +- Unpinning and dismantling the old dispatcher. + +These operations are each described in turn in the following sections. + +*** Reading list of existing programs from the kernel +The first step is to obtain the ID of the currently loaded XDP program using +=bpf_get_link_xdp_info()=. A file descriptor to the dispatcher is obtained using +=bpf_prog_get_fd_by_id()=, and the BTF information attached to the program is +obtained from the kernel. This is checked for the presence of the dispatcher +version field (as explained above), and the operation is aborted if this is not +present, or doesn't match what the library expects. + +Having thus established that the program loaded on the interface is indeed a +compatible dispatcher, the map ID of the map containing the configuration struct +is obtained from the kernel, and the configuration data is loaded from the map +(after checking that the map value size matches the expected configuration +struct). + +Then, the file lock on the directory in =bpffs= is obtained as explained in +the "Locking and pinning" section above, and, while holding this lock, file +descriptors to each of the component programs and =bpf_link= objects are +obtained. The end result is a reference to the full dispatcher structure (and +its component programs), corresponding to that generated on load. 
When +populating the component program structure in memory, the chain call actions and +run priority from the dispatcher configuration map are used instead of parsing +the BTF metadata of each program: This ensures that any modified values +specified at load time will be retained instead of being reverted to the +values compiled into the BTF metadata. Similarly, the =program_flags= array of +the in-kernel dispatcher is used to determine which of the existing component +programs support the =BPF_F_XDP_HAS_FRAGS= flag (see the section on frags +support above). + +*** Generating a new dispatcher +Having obtained a reference to the existing dispatcher, =libxdp= takes that and +the list of programs to add to or remove from the interface, and simply +generates a new dispatcher with the new set of programs. When adding programs, +the whole list of programs is sorted according to their run priorities (as +explained above), resulting in new programs being inserted in the right place in +the existing sequence according to their priority. + +Generating this secondary dispatcher relies on the support for multiple +attachments for =freplace= programs, which was added in kernel 5.10. This allows +the =bpf_link_create()= operation to specify an attachment target in the new +dispatcher. In other words, the component programs will briefly be attached to +both the old and new dispatcher, but only one of those will be attached to the +interface. + +After completion of the new dispatcher, its component programs are pinned in +=bpffs= as described above. + +*** Atomic replace and retry +At this point, =libxdp= has references to both the old dispatcher, already +attached to the interface, and the new one with the modified set of component +programs. The new dispatcher is then atomically swapped out with the old one, +using the =XDP_FLAGS_REPLACE= flag to the netlink operation (and the +accompanying =IFLA_XDP_EXPECTED_FD= attribute). 
+ +Once the atomic replace operation succeeds, the old dispatcher is unpinned from +=bpffs= and the in-memory references to both the old and new dispatchers are +released (since the new dispatcher was already pinned, preventing it from being +detached from the interface). + +Should this atomic replace instead *fail* because the program attached to the +interface changed while the new dispatcher was being built, the whole operation +is simply started over from the beginning. That is, the new dispatcher is +unpinned from =bpffs=, and the in-memory references to both dispatchers are +released (but no unpinning of the old dispatcher is performed!). Then, the +program ID attached to the interface is again read from the kernel, and the +operation proceeds from "Reading list of existing programs from the kernel". + + +** Compatibility with older kernels +The full functionality described above can only be attained with kernels version +5.10 or newer, because this is the version that introduced support for +re-attaching an freplace program in a secondary attachment point. However, the +freplace functionality itself was introduced in kernel 5.7, so for kernel +versions 5.7 to 5.9, multiple programs can be attached as long as they are all +attached to the dispatcher immediately as they are loaded. This is achieved by +using =bpf_raw_tracepoint_open()= in place of =bpf_link_create()= when attaching +the component programs to the dispatcher. The =bpf_raw_tracepoint_open()= +function doesn't take an attach target as a parameter; instead, it simply +attaches the freplace program to the target that was specified at load time +(which is why it only works when all component programs are loaded together with +the dispatcher). |