summaryrefslogtreecommitdiffstats
path: root/lib/libxdp/protocol.org
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libxdp/protocol.org')
-rw-r--r--lib/libxdp/protocol.org473
1 files changed, 473 insertions, 0 deletions
diff --git a/lib/libxdp/protocol.org b/lib/libxdp/protocol.org
new file mode 100644
index 0000000..2adaf6a
--- /dev/null
+++ b/lib/libxdp/protocol.org
@@ -0,0 +1,473 @@
+#+OPTIONS: ^:nil
+
+* Protocol for atomic loading of multi-prog dispatchers
+
+With the support for the =freplace= program type, it is possible to load
+multiple XDP programs on a single interface by building a /dispatcher/ program
+which will run on the interface, and which will call the component XDP programs
+as functions using the =freplace= type.
+
+For this to work in an interoperable way, applications need to agree on how to
+attach their XDP programs using this mechanism. This document outlines the
+protocol implemented by =libxdp=, serving as both documentation and a blueprint
+for anyone else who wants to implement the same protocol and interoperate.
+
+** Generating a dispatcher
+The dispatcher is simply an XDP program that will call each of a number of stub
+functions in turn, and depending on their return code either continue on to the
+next function or return immediately. These stub functions are then replaced at
+load time with the user XDP programs, using the =freplace= functionality.
+
+*** Dispatcher format
+The dispatcher XDP program contains the main function containing the dispatcher
+logic, 10 stub functions that can be replaced by component BPF programs, and a
+configuration structure that is used by the dispatcher logic.
+
+In =libxdp=, this dispatcher is generated by [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/xdp-dispatcher.c.in][an M4 macro file]] which expands to
+the following:
+
+#+begin_src C
+#define XDP_METADATA_SECTION "xdp_metadata"
+#define XDP_DISPATCHER_VERSION 2
+#define XDP_DISPATCHER_MAGIC 236
+#define XDP_DISPATCHER_RETVAL 31
+#define MAX_DISPATCHER_ACTIONS 10
+
+struct xdp_dispatcher_config {
+ __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */
+ __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */
+ __u8 num_progs_enabled; /* Number of active program slots */
+ __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */
+ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS];
+ __u32 run_prios[MAX_DISPATCHER_ACTIONS];
+ __u32 program_flags[MAX_DISPATCHER_ACTIONS];
+};
+
+/* While 'const volatile' sounds a little like an oxymoron, there's reason
+ * behind the madness:
+ *
+ * - const places the data in rodata, where libbpf will mark it as read-only and
+ * frozen on program load, letting the kernel do dead code elimination based
+ * on the values.
+ *
+ * - volatile prevents the compiler from optimising away the checks based on the
+ * compile-time value of the variables, which is important since we will be
+ * changing the values before loading the program into the kernel.
+ */
+static volatile const struct xdp_dispatcher_config conf = {};
+
+/* The volatile return value prevents the compiler from assuming it knows the
+ * return value and optimising based on that.
+ */
+__attribute__ ((noinline))
+int prog0(struct xdp_md *ctx) {
+ volatile int ret = XDP_DISPATCHER_RETVAL;
+
+ if (!ctx)
+ return XDP_ABORTED;
+ return ret;
+}
+/* the above is repeated as prog1...prog9 */
+
+SEC("xdp")
+int xdp_dispatcher(struct xdp_md *ctx)
+{
+ __u8 num_progs_enabled = conf.num_progs_enabled;
+ int ret;
+
+ if (num_progs_enabled < 1)
+ goto out;
+ ret = prog0(ctx);
+ if (!((1U << ret) & conf.chain_call_actions[0]))
+ return ret;
+
+ /* the above is repeated for prog1...prog9 */
+
+out:
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+__uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION);
+#+end_src
+
+The dispatcher program is pre-compiled and distributed with =libxdp=. Because
+the configuration struct is marked as =const= in the source file, it will be put
+into the =rodata=, which libbpf will turn into a read-only (frozen) map on load.
+This allows the kernel verifier to perform dead code elimination based on the
+values in the map. This is also the reason for the =num_progs_enabled= member of
+the config struct: together with the checks in the main dispatcher function the
+verifier will effectively remove all the stub function calls not being used,
+without having to rely on dynamic compilation.
+
+When generating a dispatcher, this BPF object file is opened and the
+configuration struct is populated before the object is loaded. As a forward
+compatibility measure, =libxdp= will also check for the presence of the
+=dispatcher_version= field in the =xdp_metadata= section (encoded like the
+program metadata described in "Processing program metadata" below), and if it
+doesn't match the expected version (currently version 2), will abort any action.
+
+
+*** Populating the dispatcher configuration map
+On loading, the dispatcher configuration map is populated as follows:
+
+- The =magic= field is set to the =XDP_DISPATCHER_MAGIC= value (236). This field
+ is here to make it possible to check if a program is a dispatcher without
+ looking at the program BTF in the future.
+
+- The =dispatcher_version= field is set to the current dispatcher version (2).
+ This is redundant with the BTF-encoded version in the metadata field, but must
+ be checked so that the BTF metadata version can be removed in the future. See
+ the section on old dispatcher versions below.
+
+- The =num_progs_enabled= member is simply set to the number of active programs
+ that will be attached to this dispatcher.
+
+- The =is_xdp_frags= field is set to 1 if the dispatcher is loaded with XDP frags
+ support (see section below), or 0 otherwise.
+
+The remaining fields contain per-component program metadata, which is read from
+the component programs as explained in the "Processing program metadata" section
+below.
+
+- The =chain_call_actions= array is populated with a bitmap signifying which XDP
+ actions (return codes) of each component program should be interpreted as a
+ signal to continue execution of the next XDP program. For instance, a packet
+ filtering program might designate that an =XDP_PASS= action should make
+ execution continue, while other return codes should immediately end the call
+ chain and return. The special =XDP_DISPATCHER_RETVAL= (which is set to 31
+ corresponding to the topmost bit in the bitmap) is always included in each
+ program's =chain_call_actions=; this value is returned by the stub functions,
+ which ensures that should a component program become detached, processing
+ will always continue past the stub function.
+
+- The =run_prios= array contains the effective run priority of each component
+ program when it was installed. This is also read as program metadata, but
+ because it can be overridden at load time, the effective value is stored in
+ the configuration array so it can be carried forward when the dispatcher is
+ replaced. Component programs are expected to be sorted in order of their run
+ priority (as explained below in "Loading and attaching component programs").
+
+- The =program_flags= is used to store the flags that an XDP program was loaded
+ with. This is populated with the value of the =BPF_F_XDP_HAS_FRAGS= flag if
+ the component program in this slot had that flag set (see the section on XDP
+ frags support below), and is 0 otherwise.
+
+**** Processing program metadata
+As explained above, each component program must specify one or more chain call
+actions and a run priority on attach. When loading a user program, =libxdp= will
+attempt to read this metadata from the object file as explained in the
+following; if no values are found in the object file, a default run priority of
+50 will be applied, and =XDP_PASS= will be the only chain call action.
+
+The metadata is read from the object file by looking for BTF-encoded metadata in
+the =.xdp_run_config= object section, encoded similar to the BTF-defined maps
+used by libbpf (in the =.maps= section). Here, =libxdp= will look for a struct
+definition with the XDP program function name prefixed by an underscore (e.g.,
+if the main XDP function is called =xdp_main=, libxdp will look for a struct
+definition called =_xdp_main=). In this struct, a member =priority= encodes the
+run priority, and each XDP action can be set as a chain call action by setting a
+struct member with the action name.
+
+The =xdp_helpers.h= header file included with XDP exposes helper macros that can
+be used with the existing helpers in =bpf_helpers.h= (from libbpf), so a full
+run configuration metadata section can be defined as follows:
+
+#+begin_src C
+#include <bpf/bpf_helpers.h>
+#include <xdp/xdp_helpers.h>
+
+struct {
+ __uint(priority, 10);
+ __uint(XDP_PASS, 1);
+ __uint(XDP_DROP, 1);
+} XDP_RUN_CONFIG(my_xdp_func);
+#+end_src
+
+This example sets priority 10 with chain call actions =XDP_PASS= and =XDP_DROP=
+for the XDP program starting at =my_xdp_func()=.
+
+This turns into the following BTF information (as shown by =bpftool btf dump=):
+
+#+begin_src
+[12] STRUCT '(anon)' size=24 vlen=3
+ 'priority' type_id=13 bits_offset=0
+ 'XDP_PASS' type_id=15 bits_offset=64
+ 'XDP_DROP' type_id=15 bits_offset=128
+[13] PTR '(anon)' type_id=14
+[14] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=10
+[15] PTR '(anon)' type_id=16
+[16] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=1
+[17] VAR '_my_xdp_func' type_id=12, linkage=global-alloc
+[18] DATASEC '.xdp_run_config' size=0 vlen=1
+ type_id=17 offset=0 size=24
+#+end_src
+
+The parser will look for the =.xdp_run_config= DATASEC, then follow the types
+recursively, extracting the field values from the =nr_elems= in the anonymous
+arrays in type IDs 14 and 16.
+
+While =libxdp= will automatically load any metadata specified as above in the
+program BTF, the application using =libxdp= can override these values at
+runtime. These overridden values will be the ones used when determining program
+order, and will be preserved in the dispatcher configuration map for subsequent
+operations.
+
+*** Old versions of the XDP dispatcher
+This document currently describes version 2 of the dispatcher and protocol. This
+differs from version 1 in the following respects:
+
+- The dispatcher configuration map has gained the =magic= and
+ =dispatcher_version= fields for identifying the dispatcher and its version.
+
+- The protocol now supports propagating the value of the =BPF_F_XDP_HAS_FRAGS=
+ flag for supporting XDP frags programs for higher MTU. The dispatcher
+ configuration map has gained the =is_xdp_frags= and =program_flags= fields for
+ use with this feature. The protocol for propagating the frags flag is
+ described below, and an implementation of this protocol that recognises
+ version 2 of the dispatcher MUST implement this protocol.
+
+Older versions of libxdp will check the dispatcher version field of any
+dispatcher loaded in the kernel, and refuse to operate on a dispatcher with a
+higher version than the library version implements. This means that if a newer
+dispatcher is loaded, old versions of the library will be locked out of
+modifying that dispatcher. This is by design: old library versions don't
+recognise the semantics of new features added in subsequent versions, and so
+would introduce bugs if they attempted to operate on newer versions.
+
+Newer versions of libxdp will, however, recognise older dispatcher versions. If
+a newer version of libxdp loads a new program and finds an old dispatcher
+version already loaded on an interface, it will display the programs attached to
+it, but will refuse to replace it with a newer version so as not to lock out the
+program that loaded the program(s) already attached. Manually unloading the
+loaded programs will be required to load a new dispatcher version on the
+interface.
+
+*** Loading and attaching component programs
+When loading one or more XDP programs onto an interface (assuming no existing
+program is found on the interface; for adding programs, see below), =libxdp=
+first prepares a dispatcher program with the right number of slots, by
+populating the configuration struct as described above. Then, this dispatcher
+program is loaded into the kernel, with the =BPF_F_XDP_HAS_FRAGS= flag set if
+all component programs have that flag set (see the section on supporting XDP
+frags below).
+
+Having loaded the dispatcher program, =libxdp= then loads each of the component
+programs. To do this, first the list of component programs is sorted by their
+run priority, forming the final run sequence. Should several programs have the
+same run priority, ties are broken in the following arbitrary, but
+deterministic, order (see =cmp_xdp_programs()= [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/libxdp.c][in libxdp.c]]):
+
+- By XDP function name (=bpf_program__name()= from libbpf)
+- By sorting already-loaded programs before not-yet-loaded ones
+- For not-yet-loaded programs, by program size
+- For already-loaded programs, by BPF tag value (using =memcmp()=)
+- By load time
+
+Before loading, each component program type is reset to =BPF_PROG_TYPE_EXT= with
+an expected attach type of 0, and the =BPF_F_XDP_HAS_FRAGS= is unset (see the
+section on supporting frags below). Then, the attachment target is set to the
+dispatcher file descriptor and the BTF ID of the stub function to replace (i.e.,
+the first component program has =prog0()= as its target, and so on). Then the
+program is loaded, at which point the kernel will verify the component program's
+compatibility with the attach point.
+
+Having loaded the component program, it is attached to the dispatcher by way of
+=bpf_link_create()=, specifying the same target file descriptor and BTF ID used
+when loading the program. This will return a link fd, which will be pinned to
+prevent the attachment from unravelling when the fd is closed (see "Locking and
+pinning" below).
+
+*** Locking and pinning
+To prevent the kernel from detaching any =freplace= program when its last file
+descriptor is closed, the programs must be pinned in =bpffs=. This is done in
+the =xdp= subdirectory of =bpffs=, which by default means =/sys/fs/bpf/xdp=. If
+the =LIBXDP_BPFFS= environment variable is set, this will override the location
+of the top-level =bpffs=, and the =xdp= subdirectory will be created beneath
+this path.
+
+The pathnames generated for pinning are the following:
+
+- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID
+- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference
+- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference
+- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference
+- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference
+- etc, up to ten component programs
+
+This means that several pin operations have to be performed for each dispatcher
+program. Semantically, these should form a single atomic operation, so to make
+sure every consumer of the hierarchy of pinned files gets a consistent view,
+locking is needed. This is
+implemented by opening the parent directory =/sys/fs/bpf/xdp= with the
+=O_DIRECTORY= flag, and obtaining a lock on the resulting file descriptor using
+=flock(lock_fd, LOCK_EX)=.
+
+When creating a new dispatcher program, it will first be fully populated, with
+all component programs attached. Then, the programs will be linked in =bpffs= as
+specified above, and once this succeeds, the program will be attached to the
+interface. If attaching the program fails, the programs will be unpinned again,
+and the error returned to the caller. This order ensures atomic attachment to
+the interface, without any risk that component programs will be automatically
+detached due to a badly timed application crash.
+
+When loading the initial dispatcher program, the =XDP_FLAGS_UPDATE_IF_NOEXIST=
+flag is set to prevent accidentally overriding any concurrent modifications. If
+this fails, the whole operation starts over, turning the load into a
+modification as described below.
+
+*** Supporting XDP programs with frags support (BPF_F_XDP_HAS_FRAGS flag)
+Linux kernel 5.18 added support for a new API that allows XDP programs to access
+packet data that spans more than a single page, allowing XDP programs to be
+loaded on interfaces with bigger MTUs. Such packets will not have all their
+packet data accessible by the traditional "direct packet access"; instead, only
+the first fragment will be available this way, and the rest of the packet data
+has to be accessed via the new =bpf_xdp_load_bytes()= helper.
+
+Existing XDP programs are written with the assumption that they can see the
+whole packet data using direct packet access, which means they can subtly
+malfunction if some of the packet data is suddenly invisible (for instance,
+counting packet lengths is no longer accurate). Whether a given XDP program
+supports the frags API or not is a semantic issue, and it's not possible for the
+kernel to auto-detect this. For this reason, programs have to opt in to XDP
+frags support at load time, by setting the =BPF_F_XDP_HAS_FRAGS= flag as they
+are loaded into the kernel. Programs that are not loaded with this flag will be
+rejected from attaching to network devices that use packet fragments (i.e., those
+with a large MTU).
+
+This has implications for the XDP dispatcher, as its purpose is for multiple
+programs to be loaded at the same time. Since the =BPF_F_XDP_HAS_FRAGS= cannot
+be set for individual component programs, it has to be set for the dispatcher as
+a whole. However, as described above, programs can subtly malfunction if they
+are exposed to packets with fragments without being ready to do so. This means
+that it's only safe to set the =BPF_F_XDP_HAS_FRAGS= on the dispatcher itself if
+*all* component programs have the flag set.
+
+To properly propagate the flags even when adding new programs to an existing
+dispatcher, the dispatcher itself needs to keep track of which of its component
+programs had the =BPF_F_XDP_HAS_FRAGS= flag set when they were added. The
+dispatcher configuration map uses the =program_flags= array for this: for each
+component program, this field is set to the value of the =BPF_F_XDP_HAS_FRAGS=
+flag if that component program has the flag set, and to 0 otherwise. An
+additional field, =is_xdp_frags=, is set if the dispatcher itself is loaded with
+the frags flag set (which may not be the case if the kernel doesn't support the
+flag).
+
+When generating a dispatcher for a set of programs, libxdp simply tracks if all
+component programs support the =BPF_F_XDP_HAS_FRAGS=, and if they do, the
+dispatcher is loaded with this flag set. If any program attached to the
+dispatcher does not support the flag, the dispatcher is loaded without this flag
+set (and the =is_xdp_frags= field in the dispatcher configuration is set
+accordingly). If libxdp determines that the running kernel does not support the
+=BPF_F_XDP_HAS_FRAGS=, the dispatcher is loaded without the flag regardless of
+the value of the component programs.
+
+When adding a program to an existing dispatcher, this may result in a
+"downgrade", i.e., loading a new dispatcher without the frags flag to replace an
+existing dispatcher that does have the flag set. This will result in the
+replacement dispatcher being rejected by the kernel at attach time, but only if
+the interface being attached to actually requires the frags flag (i.e., if it
+has a large MTU). If the attachment is rejected, the old dispatcher will stay in
+place, leading to no loss of functionality.
+
+** Adding or removing programs from an existing dispatcher
+The sections above explain how to generate a dispatcher and attach it to an
+interface, assuming no existing program is attached. When one or more programs
+are already attached, a couple of extra steps are required to ensure that the
+switch is made atomically.
+
+Briefly, changing the programs attached to an interface entails the following
+steps:
+
+- Reading the existing dispatcher program and obtaining references to the
+ component programs.
+
+- Generating a new dispatcher containing the new set of programs (adding or
+ removing the programs needed).
+
+- Atomically swapping out the XDP program attachment on the interface so the new
+ dispatcher takes over from the old one.
+
+- Unpinning and dismantling the old dispatcher.
+
+These operations are each described in turn in the following sections.
+
+*** Reading list of existing programs from the kernel
+The first step is to obtain the ID of the currently loaded XDP program using
+=bpf_get_link_xdp_info()=. A file descriptor to the dispatcher is obtained using
+=bpf_prog_get_fd_by_id()=, and the BTF information attached to the program is
+obtained from the kernel. This is checked for the presence of the dispatcher
+version field (as explained above), and the operation is aborted if this is not
+present, or doesn't match what the library expects.
+
+Having thus established that the program loaded on the interface is indeed a
+compatible dispatcher, the map ID of the map containing the configuration struct
+is obtained from the kernel, and the configuration data is loaded from the map
+(after checking that the map value size matches the expected configuration
+struct).
+
+Then, the file lock on the directory in =bpffs= is obtained as explained in
+the "Locking and pinning" section above, and, while holding this lock, file
+descriptors to each of the component programs and =bpf_link= objects are
+obtained. The end result is a reference to the full dispatcher structure (and
+its component programs), corresponding to that generated on load. When
+populating the component program structure in memory, the chain call actions and
+run priority from the dispatcher configuration map are used instead of parsing
+the BTF metadata of each program: This ensures that any modified values
+specified at load time will be retained instead of being reverted to the
+values compiled into the BTF metadata. Similarly, the =program_flags= array of
+the in-kernel dispatcher is used to determine which of the existing component
+programs support the =BPF_F_XDP_HAS_FRAGS= flag (see the section on frags
+support above).
+
+*** Generating a new dispatcher
+Having obtained a reference to the existing dispatcher, =libxdp= takes that and
+the list of programs to add to or remove from the interface, and simply
+generates a new dispatcher with the new set of programs. When adding programs,
+the whole list of programs is sorted according to their run priorities (as
+explained above), resulting in new programs being inserted in the right place in
+the existing sequence according to their priority.
+
+Generating this secondary dispatcher relies on the support for multiple
+attachments for =freplace= programs, which was added in kernel 5.10. This allows
+the =bpf_link_create()= operation to specify an attachment target in the new
+dispatcher. In other words, the component programs will briefly be attached to
+both the old and new dispatcher, but only one of those will be attached to the
+interface.
+
+After completion of the new dispatcher, its component programs are pinned in
+=bpffs= as described above.
+
+*** Atomic replace and retry
+At this point, =libxdp= has references to both the old dispatcher, already
+attached to the interface, and the new one with the modified set of component
+programs. The new dispatcher is then atomically swapped out with the old one,
+using the =XDP_FLAGS_REPLACE= flag to the netlink operation (and the
+accompanying =IFLA_XDP_EXPECTED_FD= attribute).
+
+Once the atomic replace operation succeeds, the old dispatcher is unpinned from
+=bpffs= and the in-memory references to both the old and new dispatchers are
+released (since the new dispatcher was already pinned, preventing it from being
+detached from the interface).
+
+Should this atomic replace instead *fail* because the program attached to the
+interface changed while the new dispatcher was being built, the whole operation
+is simply started over from the beginning. That is, the new dispatcher is
+unpinned from =bpffs=, and the in-memory references to both dispatchers are
+released (but no unpinning of the old dispatcher is performed!). Then, the
+program ID attached to the interface is again read from the kernel, and the
+operation proceeds from "Reading list of existing programs from the kernel".
+
+
+** Compatibility with older kernels
+The full functionality described above can only be attained with kernel version
+5.10 or newer, because this is the version that introduced support for
+re-attaching an freplace program in a secondary attachment point. However, the
+freplace functionality itself was introduced in kernel 5.7, so for kernel
+versions 5.7 to 5.9, multiple programs can be attached as long as they are all
+attached to the dispatcher immediately as they are loaded. This is achieved by
+using =bpf_raw_tracepoint_open()= in place of =bpf_link_create()= when attaching
+the component programs to the dispatcher. The =bpf_raw_tracepoint_open()=
+function doesn't take an attach target as a parameter; instead, it simply
+attaches the freplace program to the target that was specified at load time
+(which is why it only works when all component programs are loaded together with
+the dispatcher).