summaryrefslogtreecommitdiffstats
path: root/lib/libxdp/protocol.org
blob: 2adaf6a0866ae4d7bb74a59a778f6395cc7c80b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
#+OPTIONS: ^:nil

* Protocol for atomic loading of multi-prog dispatchers

With the support for the =freplace= program type, it is possible to load
multiple XDP programs on a single interface by building a /dispatcher/ program
which will run on the interface, and which will call the component XDP programs
as functions using the =freplace= type.

For this to work in an interoperable way, applications need to agree on how to
attach their XDP programs using this mechanism. This document outlines the
protocol implemented by =libxdp=, serving as both documentation and a blueprint
for anyone else who wants to implement the same protocol and interoperate.

** Generating a dispatcher
The dispatcher is simply an XDP program that will call each of a number of stub
functions in turn, and depending on their return code either continue on to the
next function or return immediately. These stub functions are then replaced at
load time with the user XDP programs, using the =freplace= functionality.

*** Dispatcher format
The dispatcher XDP program contains the main function containing the dispatcher
logic, 10 stub functions that can be replaced by component BPF programs, and a
configuration structure that is used by the dispatcher logic.

In =libxdp=, this dispatcher is generated by [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/xdp-dispatcher.c.in][an M4 macro file]] which expands to
the following:

#+begin_src C
#define XDP_METADATA_SECTION "xdp_metadata"
#define XDP_DISPATCHER_VERSION 2
#define XDP_DISPATCHER_MAGIC 236
#define XDP_DISPATCHER_RETVAL 31
#define MAX_DISPATCHER_ACTIONS 10

struct xdp_dispatcher_config {
	__u8 magic;                         /* Set to XDP_DISPATCHER_MAGIC */
	__u8 dispatcher_version;            /* Set to XDP_DISPATCHER_VERSION */
	__u8 num_progs_enabled;             /* Number of active program slots */
	__u8 is_xdp_frags;                  /* Whether this dispatcher is loaded with XDP frags support */
	__u32 chain_call_actions[MAX_DISPATCHER_ACTIONS];
	__u32 run_prios[MAX_DISPATCHER_ACTIONS];
	__u32 program_flags[MAX_DISPATCHER_ACTIONS];
};

/* While 'const volatile' sounds a little like an oxymoron, there's reason
 * behind the madness:
 *
 * - const places the data in rodata, where libbpf will mark it as read-only and
 *   frozen on program load, letting the kernel do dead code elimination based
 *   on the values.
 *
 * - volatile prevents the compiler from optimising away the checks based on the
 *   compile-time value of the variables, which is important since we will be
 *   changing the values before loading the program into the kernel.
 */
static volatile const struct xdp_dispatcher_config conf = {};

/* The volatile return value prevents the compiler from assuming it knows the
 * return value and optimising based on that.
 */
__attribute__ ((noinline))
int prog0(struct xdp_md *ctx) {
        volatile int ret = XDP_DISPATCHER_RETVAL;

        if (!ctx)
          return XDP_ABORTED;
        return ret;
}
/* the above is repeated as prog1...prog9 */

SEC("xdp")
int xdp_dispatcher(struct xdp_md *ctx)
{
        __u8 num_progs_enabled = conf.num_progs_enabled;
        int ret;

        if (num_progs_enabled < 1)
                goto out;
        ret = prog0(ctx);
        if (!((1U << ret) & conf.chain_call_actions[0]))
                return ret;

        /* the above is repeated for prog1...prog9 */

out:
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
__uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION);
#+end_src

The dispatcher program is pre-compiled and distributed with =libxdp=. Because
the configuration struct is marked as =const= in the source file, it will be put
into the =rodata=, which libbpf will turn into a read-only (frozen) map on load.
This allows the kernel verifier to perform dead code elimination based on the
values in the map. This is also the reason for the =num_progs_enabled= member of
the config struct: together with the checks in the main dispatcher function the
verifier will effectively remove all the stub function calls not being used,
without having to rely on dynamic compilation.

When generating a dispatcher, this BPF object file is opened and the
configuration struct is populated before the object is loaded. As a forward
compatibility measure, =libxdp= will also check for the presence of the
=dispatcher_version= field in the =xdp_metadata= section (encoded like the
program metadata described in "Processing program metadata" below), and if it
doesn't match the expected version (currently version 2), will abort any action.


*** Populating the dispatcher configuration map
On loading, the dispatcher configuration map is populated as follows:

- The =magic= field is set to the =XDP_DISPATCHER_MAGIC= value (236). This field
  is here to make it possible to check if a program is a dispatcher without
  looking at the program BTF in the future.

- The =dispatcher_version= field is set to the current dispatcher version (2).
  This is redundant with the BTF-encoded version in the metadata field, but must
  be checked so that the BTF metadata version can be removed in the future. See
  the section on old dispatcher versions below.

- The =num_progs_enabled= member is simply set to the number of active programs
  that will be attached to this dispatcher.

- The =is_xdp_frags= field is set to 1 if the dispatcher is loaded with XDP frags
  support (see the section on XDP frags support below), or 0 otherwise.

The remaining three fields hold per-component program information. The first two
are read from the component programs' metadata, as explained in the "Processing
program metadata" section below.

- The =chain_call_actions= array is populated with a bitmap signifying which XDP
  actions (return codes) of each component program should be interpreted as a
  signal to continue execution of the next XDP program. For instance, a packet
  filtering program might designate that an =XDP_PASS= action should make
  execution continue, while other return codes should immediately end the call
  chain and return. The special =XDP_DISPATCHER_RETVAL= (which is set to 31
  corresponding to the topmost bit in the bitmap) is always included in each
  program's =chain_call_actions=; this value is returned by the stub functions,
  which ensures that should a component program become detached, processing
  will always continue past the stub function.

- The =run_prios= array contains the effective run priority of each component
  program when it was installed. This is also read as program metadata, but
  because it can be overridden at load time, the effective value is stored in
  the configuration array so it can be carried forward when the dispatcher is
  replaced. Component programs are expected to be sorted in order of their run
  priority (as explained below in "Loading and attaching component programs").

- The =program_flags= is used to store the flags that an XDP program was loaded
  with. This is populated with the value of the =BPF_F_XDP_HAS_FRAGS= flag if
  the component program in this slot had that flag set (see the section on XDP
  frags support below), and is 0 otherwise.

**** Processing program metadata
As explained above, each component program must specify one or more chain call
actions and a run priority on attach. When loading a user program, =libxdp= will
attempt to read this metadata from the object file as explained in the
following; if no values are found in the object file, a default run priority of
50 will be applied, and =XDP_PASS= will be the only chain call action.

The metadata is read from the object file by looking for BTF-encoded metadata in
the =.xdp_run_config= object section, encoded similarly to the BTF-defined maps
used by libbpf (in the =.maps= section). Here, =libxdp= will look for a struct
definition with the XDP program function name prefixed by an underscore (e.g.,
if the main XDP function is called =xdp_main=, libxdp will look for a struct
definition called =_xdp_main=). In this struct, a member =priority= encodes the
run priority, and each XDP action can be set as a chain call action by setting a
struct member with the action name.

The =xdp_helpers.h= header file included with =libxdp= exposes helper macros that
can be used with the existing helpers in =bpf_helpers.h= (from libbpf), so a full
run configuration metadata section can be defined as follows:

#+begin_src C
#include <bpf/bpf_helpers.h>
#include <xdp/xdp_helpers.h>

struct {
	__uint(priority, 10);
	__uint(XDP_PASS, 1);
	__uint(XDP_DROP, 1);
} XDP_RUN_CONFIG(my_xdp_func);
#+end_src

This example sets priority 10 with chain call actions =XDP_PASS= and =XDP_DROP=
for the XDP program starting at =my_xdp_func()=.

This turns into the following BTF information (as shown by =bpftool btf dump=):

#+begin_src
[12] STRUCT '(anon)' size=24 vlen=3
	'priority' type_id=13 bits_offset=0
	'XDP_PASS' type_id=15 bits_offset=64
	'XDP_DROP' type_id=15 bits_offset=128
[13] PTR '(anon)' type_id=14
[14] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=10
[15] PTR '(anon)' type_id=16
[16] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=1
[17] VAR '_my_xdp_func' type_id=12, linkage=global-alloc
[18] DATASEC '.xdp_run_config' size=0 vlen=1
	type_id=17 offset=0 size=24
#+end_src

The parser will look for the =.xdp_run_config= DATASEC, then follow the types
recursively, extracting the field values from the =nr_elems= in the anonymous
arrays in type IDs 14 and 16.

While =libxdp= will automatically load any metadata specified as above in the
program BTF, the application using =libxdp= can override these values at
runtime. These overridden values will be the ones used when determining program
order, and will be preserved in the dispatcher configuration map for subsequent
operations.

*** Old versions of the XDP dispatcher
This document currently describes version 2 of the dispatcher and protocol. This
differs from version 1 in the following respects:

- The dispatcher configuration map has gained the =magic= and
  =dispatcher_version= fields for identifying the dispatcher and its version.

- The protocol now supports propagating the value of the =BPF_F_XDP_HAS_FRAGS=
  flag, to support XDP frags programs on interfaces with a higher MTU. The
  dispatcher configuration map has gained the =is_xdp_frags= and =program_flags=
  fields for use with this feature. The protocol for propagating the frags flag
  is described below, and an implementation of this protocol that recognises
  version 2 of the dispatcher MUST implement this protocol.

Older versions of libxdp will check the dispatcher version field of any
dispatcher loaded in the kernel, and refuse to operate on a dispatcher with a
higher version than the library version implements. This means that if a newer
dispatcher is loaded, old versions of the library will be locked out of
modifying that dispatcher. This is by design: old library versions don't
recognise the semantics of new features added in subsequent versions, and so
would introduce bugs if they attempted to operate on newer versions.

Newer versions of libxdp will, however, recognise older dispatcher versions. If
a newer version of libxdp loads a new program and finds an old dispatcher
version already loaded on an interface, it will display the programs attached to
it, but will refuse to replace it with a newer version so as not to lock out the
program that loaded the program(s) already attached. Manually unloading the
loaded programs will be required to load a new dispatcher version on the
interface.

*** Loading and attaching component programs
When loading one or more XDP programs onto an interface (assuming no existing
program is found on the interface; for adding programs, see below), =libxdp=
first prepares a dispatcher program with the right number of slots, by
populating the configuration struct as described above. Then, this dispatcher
program is loaded into the kernel, with the =BPF_F_XDP_HAS_FRAGS= flag set if
all component programs have that flag set (see the section on supporting XDP
frags below).

Having loaded the dispatcher program, =libxdp= then loads each of the component
programs. To do this, first the list of component programs is sorted by their
run priority, forming the final run sequence. Should several programs have the
same run priority, ties are broken in the following arbitrary, but
deterministic, order (see =cmp_xdp_programs()= [[https://github.com/xdp-project/xdp-tools/blob/master/lib/libxdp/libxdp.c][in libxdp.c]]):

- By XDP function name (=bpf_program__name()= from libbpf)
- By sorting already-loaded programs before not-yet-loaded ones
- For not-yet-loaded programs, by program size
- For already-loaded programs, by BPF tag value (using =memcmp()=)
- By load time

Before loading, each component program type is reset to =BPF_PROG_TYPE_EXT= with
an expected attach type of 0, and the =BPF_F_XDP_HAS_FRAGS= flag is unset (see the
section on supporting frags below). Then, the attachment target is set to the
dispatcher file descriptor and the BTF ID of the stub function to replace (i.e.,
the first component program has =prog0()= as its target, and so on). Then the
program is loaded, at which point the kernel will verify the component program's
compatibility with the attach point.

Having loaded the component program, it is attached to the dispatcher by way of
=bpf_link_create()=, specifying the same target file descriptor and BTF ID used
when loading the program. This will return a link fd, which will be pinned to
prevent the attachment from unravelling when the fd is closed (see "Locking and
pinning" below).

*** Locking and pinning
To prevent the kernel from detaching any =freplace= program when its last file
descriptor is closed, the programs must be pinned in =bpffs=. This is done in
the =xdp= subdirectory of =bpffs=, which by default means =/sys/fs/bpf/xdp=. If
the =LIBXDP_BPFFS= environment variable is set, this will override the location
of the top-level =bpffs=, and the =xdp= subdirectory will be created beneath
this path.

The pathnames generated for pinning are the following:

- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID
- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference
- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference
- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference
- /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference
- etc, up to ten component programs

This means that several pin operations have to be performed for each dispatcher
program. Semantically, these should appear as a single atomic operation, so to
make sure every consumer of the hierarchy of pinned files gets a consistent
view, locking is needed. This is
implemented by opening the parent directory =/sys/fs/bpf/xdp= with the
=O_DIRECTORY= flag, and obtaining a lock on the resulting file descriptor using
=flock(lock_fd, LOCK_EX)=.

When creating a new dispatcher program, it will first be fully populated, with
all component programs attached. Then, the programs will be linked in =bpffs= as
specified above, and once this succeeds, the program will be attached to the
interface. If attaching the program fails, the programs will be unpinned again,
and the error returned to the caller. This order ensures atomic attachment to
the interface, without any risk that component programs will be automatically
detached due to a badly timed application crash.

When loading the initial dispatcher program, the =XDP_FLAGS_UPDATE_IF_NOEXIST=
flag is set to prevent accidentally overriding any concurrent modifications. If
this fails, the whole operation starts over, turning the load into a
modification as described below.

*** Supporting XDP programs with frags support (BPF_F_XDP_HAS_FRAGS flag)
Linux kernel 5.18 added support for a new API that allows XDP programs to access
packet data that spans more than a single page, allowing XDP programs to be
loaded on interfaces with bigger MTUs. Such packets will not have all their
packet data accessible by the traditional "direct packet access"; instead, only
the first fragment will be available this way, and the rest of the packet data
has to be accessed via the new =bpf_xdp_load_bytes()= helper.

Existing XDP programs are written with the assumption that they can see the
whole packet data using direct packet access, which means they can subtly
malfunction if some of the packet data is suddenly invisible (for instance,
counting packet lengths is no longer accurate). Whether a given XDP program
supports the frags API or not is a semantic issue, and it's not possible for the
kernel to auto-detect this. For this reason, programs have to opt in to XDP
frags support at load time, by setting the =BPF_F_XDP_HAS_FRAGS= flag as they
are loaded into the kernel. Programs that are not loaded with this flag will be
rejected from attaching to network devices that use packet fragments (i.e., those
with a large MTU).

This has implications for the XDP dispatcher, as its purpose is for multiple
programs to be loaded at the same time. Since the =BPF_F_XDP_HAS_FRAGS= flag
cannot be set for individual component programs, it has to be set for the
dispatcher as a whole. However, as described above, programs can subtly
malfunction if they are exposed to packets with fragments without being ready
for them. This means that it's only safe to set the =BPF_F_XDP_HAS_FRAGS= flag
on the dispatcher itself if *all* component programs have the flag set.

To properly propagate the flags even when adding new programs to an existing
dispatcher, the dispatcher itself needs to keep track of which of its component
programs had the =BPF_F_XDP_HAS_FRAGS= flag set when they were added. The
dispatcher configuration map uses the =program_flags= array for this: for each
component program, this field is set to the value of the =BPF_F_XDP_HAS_FRAGS=
flag if that component program has the flag set, and to 0 otherwise. An
additional field, =is_xdp_frags=, is set if the dispatcher itself is loaded with
the frags field set (which may not be the case if the kernel doesn't support the
flag).

When generating a dispatcher for a set of programs, libxdp simply tracks if all
component programs support the =BPF_F_XDP_HAS_FRAGS= flag, and if they do, the
dispatcher is loaded with this flag set. If any program attached to the
dispatcher does not support the flag, the dispatcher is loaded without this flag
set (and the =is_xdp_frags= field in the dispatcher configuration is set
accordingly). If libxdp determines that the running kernel does not support the
=BPF_F_XDP_HAS_FRAGS= flag, the dispatcher is loaded without the flag regardless
of whether the component programs have it set.

When adding a program to an existing dispatcher, this may result in a
"downgrade", i.e., loading a new dispatcher without the frags flag to replace an
existing dispatcher that does have the flag set. This will result in the
replacement dispatcher being rejected by the kernel at attach time, but only if
the interface being attached to actually requires the frags flag (i.e., if it
has a large MTU). If the attachment is rejected, the old dispatcher will stay in
place, leading to no loss of functionality.

** Adding or removing programs from an existing dispatcher
The sections above explain how to generate a dispatcher and attach it to an
interface, assuming no existing program is attached. When one or more programs
are already attached, a couple of extra steps are required to ensure that the
switch is made atomically.

Briefly, changing the programs attached to an interface entails the following
steps:

- Reading the existing dispatcher program and obtaining references to the
  component programs.

- Generating a new dispatcher containing the new set of programs (adding or
  removing the programs needed).

- Atomically swapping out the XDP program attachment on the interface so the new
  dispatcher takes over from the old one.

- Unpinning and dismantling the old dispatcher.

These operations are each described in turn in the following sections.

*** Reading list of existing programs from the kernel
The first step is to obtain the ID of the currently loaded XDP program using
=bpf_get_link_xdp_info()=. A file descriptor to the dispatcher is obtained using
=bpf_prog_get_fd_by_id()=, and the BTF information attached to the program is
obtained from the kernel. This is checked for the presence of the dispatcher
version field (as explained above), and the operation is aborted if this is not
present, or doesn't match what the library expects.

Having thus established that the program loaded on the interface is indeed a
compatible dispatcher, the map ID of the map containing the configuration struct
is obtained from the kernel, and the configuration data is loaded from the map
(after checking that the map value size matches the expected configuration
struct).

Then, the file lock on the directory in =bpffs= is obtained as explained in
the "Locking and pinning" section above, and, while holding this lock, file
descriptors to each of the component programs and =bpf_link= objects are
obtained. The end result is a reference to the full dispatcher structure (and
its component programs), corresponding to that generated on load. When
populating the component program structure in memory, the chain call actions and
run priority from the dispatcher configuration map are used instead of parsing
the BTF metadata of each program: this ensures that any modified values
specified at load time will be retained instead of being reverted to the
values compiled into the BTF metadata. Similarly, the =program_flags= array of
the in-kernel dispatcher is used to determine which of the existing component
programs support the =BPF_F_XDP_HAS_FRAGS= flag (see the section on frags
support above).

*** Generating a new dispatcher
Having obtained a reference to the existing dispatcher, =libxdp= takes that and
the list of programs to add to or remove from the interface, and simply
generates a new dispatcher with the new set of programs. When adding programs,
the whole list of programs is sorted according to their run priorities (as
explained above), resulting in new programs being inserted in the right place in
the existing sequence according to their priority.

Generating this secondary dispatcher relies on the support for multiple
attachments for =freplace= programs, which was added in kernel 5.10. This allows
the =bpf_link_create()= operation to specify an attachment target in the new
dispatcher. In other words, the component programs will briefly be attached to
both the old and new dispatcher, but only one of those will be attached to the
interface.

After completion of the new dispatcher, its component programs are pinned in
=bpffs= as described above.

*** Atomic replace and retry
At this point, =libxdp= has references to both the old dispatcher, already
attached to the interface, and the new one with the modified set of component
programs. The new dispatcher is then atomically swapped in to replace the old one,
using the =XDP_FLAGS_REPLACE= flag to the netlink operation (and the
accompanying =IFLA_XDP_EXPECTED_FD= attribute).

Once the atomic replace operation succeeds, the old dispatcher is unpinned from
=bpffs= and the in-memory references to both the old and new dispatchers are
released (since the new dispatcher was already pinned, preventing it from being
detached from the interface).

Should this atomic replace instead *fail* because the program attached to the
interface changed while the new dispatcher was being built, the whole operation
is simply started over from the beginning. That is, the new dispatcher is
unpinned from =bpffs=, and the in-memory references to both dispatchers are
released (but no unpinning of the old dispatcher is performed!). Then, the
program ID attached to the interface is again read from the kernel, and the
operation proceeds from "Reading list of existing programs from the kernel".


** Compatibility with older kernels
The full functionality described above can only be attained with kernel version
5.10 or newer, because this is the version that introduced support for
re-attaching an freplace program in a secondary attachment point. However, the
freplace functionality itself was introduced in kernel 5.7, so for kernel
versions 5.7 to 5.9, multiple programs can be attached as long as they are all
attached to the dispatcher immediately as they are loaded. This is achieved by
using =bpf_raw_tracepoint_open()= in place of =bpf_link_create()= when attaching
the component programs to the dispatcher. The =bpf_raw_tracepoint_open()=
function doesn't take an attach target as a parameter; instead, it simply
attaches the freplace program to the target that was specified at load time
(which is why it only works when all component programs are loaded together with
the dispatcher).