summaryrefslogtreecommitdiffstats
path: root/man2/bpf.2
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
commit399644e47874bff147afb19c89228901ac39340e (patch)
tree1c4c0b733f4c16b5783b41bebb19194a9ef62ad1 /man2/bpf.2
parentInitial commit. (diff)
downloadmanpages-399644e47874bff147afb19c89228901ac39340e.tar.xz
manpages-399644e47874bff147afb19c89228901ac39340e.zip
Adding upstream version 6.05.01.upstream/6.05.01
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'man2/bpf.2')
-rw-r--r--man2/bpf.21273
1 files changed, 1273 insertions, 0 deletions
diff --git a/man2/bpf.2 b/man2/bpf.2
new file mode 100644
index 0000000..4df108d
--- /dev/null
+++ b/man2/bpf.2
@@ -0,0 +1,1273 @@
+.\" Copyright (C) 2015 Alexei Starovoitov <ast@kernel.org>
+.\" and Copyright (C) 2015 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH bpf 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+bpf \- perform a command on an extended BPF map or program
+.SH SYNOPSIS
+.nf
+.B #include <linux/bpf.h>
+.PP
+.BI "int bpf(int " cmd ", union bpf_attr *" attr ", unsigned int " size );
+.fi
+.SH DESCRIPTION
+The
+.BR bpf ()
+system call performs a range of operations related to extended
+Berkeley Packet Filters.
+Extended BPF (or eBPF) is similar to
+the original ("classic") BPF (cBPF) used to filter network packets.
+For both cBPF and eBPF programs,
+the kernel statically analyzes the programs before loading them,
+in order to ensure that they cannot harm the running system.
+.PP
+eBPF extends cBPF in multiple ways, including the ability to call
+a fixed set of in-kernel helper functions
+.\" See 'enum bpf_func_id' in include/uapi/linux/bpf.h
+(via the
+.B BPF_CALL
+opcode extension provided by eBPF)
+and access shared data structures such as eBPF maps.
+.\"
+.SS Extended BPF Design/Architecture
+eBPF maps are a generic data structure for storage of different data types.
+Data types are generally treated as binary blobs, so a user just specifies
+the size of the key and the size of the value at map-creation time.
+In other words, a key/value for a given map can have an arbitrary structure.
+.PP
+A user process can create multiple maps (with key/value-pairs being
+opaque bytes of data) and access them via file descriptors.
+Different eBPF programs can access the same maps in parallel.
+It's up to the user process and eBPF program to decide what they store
+inside maps.
+.PP
+There's one special map type, called a program array.
+This type of map stores file descriptors referring to other eBPF programs.
+When a lookup in the map is performed, the program flow is
+redirected in-place to the beginning of another eBPF program and does not
+return back to the calling program.
+The level of nesting has a fixed limit of 32,
+.\" Defined by the kernel constant MAX_TAIL_CALL_CNT in include/linux/bpf.h
+so that infinite loops cannot be crafted.
+At run time, the program file descriptors stored in the map can be modified,
+so program functionality can be altered based on specific requirements.
+All programs referred to in a program-array map must
+have been previously loaded into the kernel via
+.BR bpf ().
+If a map lookup fails, the current program continues its execution.
+See
+.B BPF_MAP_TYPE_PROG_ARRAY
+below for further details.
+.PP
+Generally, eBPF programs are loaded by the user process and automatically
+unloaded when the process exits.
+In some cases, for example,
+.BR tc\-bpf (8),
+the program will continue to stay alive inside the kernel even after the
+process that loaded the program exits.
+In that case,
+the tc subsystem holds a reference to the eBPF program after the
+file descriptor has been closed by the user-space program.
+Thus, whether a specific program continues to live inside the kernel
+depends on how it is further attached to a given kernel subsystem
+after it was loaded via
+.BR bpf ().
+.PP
+Each eBPF program is a set of instructions that is safe to run until
+its completion.
+An in-kernel verifier statically determines that the eBPF program
+terminates and is safe to execute.
+During verification, the kernel increments reference counts for each of
+the maps that the eBPF program uses,
+so that the attached maps can't be removed until the program is unloaded.
+.PP
+eBPF programs can be attached to different events.
+These events can be the arrival of network packets, tracing
+events, classification events by network queueing disciplines
+(for eBPF programs attached to a
+.BR tc (8)
+classifier), and other types that may be added in the future.
+A new event triggers execution of the eBPF program, which
+may store information about the event in eBPF maps.
+Beyond storing data, eBPF programs may call a fixed set of
+in-kernel helper functions.
+.PP
+The same eBPF program can be attached to multiple events and different
+eBPF programs can access the same map:
+.PP
+.in +4n
+.EX
+tracing tracing tracing packet packet packet
+event A event B event C on eth0 on eth1 on eth2
+ | | | | | \[ha]
+ | | | | v |
+ \-\-> tracing <\-\- tracing socket tc ingress tc egress
+ prog_1 prog_2 prog_3 classifier action
+ | | | | prog_4 prog_5
+ |\-\-\- \-\-\-\-\-| |\-\-\-\-\-\-| map_3 | |
+ map_1 map_2 \-\-| map_4 |\-\-
+.EE
+.in
+.\"
+.SS Arguments
+The operation to be performed by the
+.BR bpf ()
+system call is determined by the
+.I cmd
+argument.
+Each operation takes an accompanying argument,
+provided via
+.IR attr ,
+which is a pointer to a union of type
+.I bpf_attr
+(see below).
+The unused fields and padding must be zeroed out before the call.
+The
+.I size
+argument is the size of the union pointed to by
+.IR attr .
+.PP
+The value provided in
+.I cmd
+is one of the following:
+.TP
+.B BPF_MAP_CREATE
+Create a map and return a file descriptor that refers to the map.
+The close-on-exec file descriptor flag (see
+.BR fcntl (2))
+is automatically enabled for the new file descriptor.
+.TP
+.B BPF_MAP_LOOKUP_ELEM
+Look up an element by key in a specified map and return its value.
+.TP
+.B BPF_MAP_UPDATE_ELEM
+Create or update an element (key/value pair) in a specified map.
+.TP
+.B BPF_MAP_DELETE_ELEM
+Look up and delete an element by key in a specified map.
+.TP
+.B BPF_MAP_GET_NEXT_KEY
+Look up an element by key in a specified map and return the key
+of the next element.
+.TP
+.B BPF_PROG_LOAD
+Verify and load an eBPF program,
+returning a new file descriptor associated with the program.
+The close-on-exec file descriptor flag (see
+.BR fcntl (2))
+is automatically enabled for the new file descriptor.
+.IP
+The
+.I bpf_attr
+union consists of various anonymous structures that are used by different
+.BR bpf ()
+commands:
+.PP
+.in +4n
+.EX
+union bpf_attr {
+ struct { /* Used by BPF_MAP_CREATE */
+ __u32 map_type;
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* maximum number of entries
+ in a map */
+ };
+\&
+ struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY
+ commands */
+ __u32 map_fd;
+ __aligned_u64 key;
+ union {
+ __aligned_u64 value;
+ __aligned_u64 next_key;
+ };
+ __u64 flags;
+ };
+\&
+ struct { /* Used by BPF_PROG_LOAD */
+ __u32 prog_type;
+ __u32 insn_cnt;
+ __aligned_u64 insns; /* \[aq]const struct bpf_insn *\[aq] */
+ __aligned_u64 license; /* \[aq]const char *\[aq] */
+ __u32 log_level; /* verbosity level of verifier */
+ __u32 log_size; /* size of user buffer */
+ __aligned_u64 log_buf; /* user supplied \[aq]char *\[aq]
+ buffer */
+ __u32 kern_version;
+ /* checked when prog_type=kprobe
+ (since Linux 4.1) */
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+ };
+} __attribute__((aligned(8)));
+.EE
+.in
+.\"
+.SS eBPF maps
+Maps are a generic data structure for storage of different types of data.
+They allow sharing of data between eBPF kernel programs,
+and also between kernel and user-space applications.
+.PP
+Each map type has the following attributes:
+.IP \[bu] 3
+type
+.IP \[bu]
+maximum number of elements
+.IP \[bu]
+key size in bytes
+.IP \[bu]
+value size in bytes
+.PP
+The following wrapper functions demonstrate how various
+.BR bpf ()
+commands can be used to access the maps.
+The functions use the
+.I cmd
+argument to invoke different operations.
+.TP
+.B BPF_MAP_CREATE
+The
+.B BPF_MAP_CREATE
+command creates a new map,
+returning a new file descriptor that refers to the map.
+.IP
+.in +4n
+.EX
+int
+bpf_create_map(enum bpf_map_type map_type,
+ unsigned int key_size,
+ unsigned int value_size,
+ unsigned int max_entries)
+{
+ union bpf_attr attr = {
+ .map_type = map_type,
+ .key_size = key_size,
+ .value_size = value_size,
+ .max_entries = max_entries
+ };
+\&
+ return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+The new map has the type specified by
+.IR map_type ,
+and attributes as specified in
+.IR key_size ,
+.IR value_size ,
+and
+.IR max_entries .
+On success, this operation returns a file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to
+.BR EINVAL ,
+.BR EPERM ,
+or
+.BR ENOMEM .
+.IP
+The
+.I key_size
+and
+.I value_size
+attributes will be used by the verifier during program loading
+to check that the program is calling
+.BR bpf_map_*_elem ()
+helper functions with a correctly initialized
+.I key
+and to check that the program doesn't access the map element
+.I value
+beyond the specified
+.IR value_size .
+For example, when a map is created with a
+.I key_size
+of 8 and the eBPF program calls
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, fp \- 4)
+.EE
+.in
+.IP
+the program will be rejected,
+since the in-kernel helper function
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, void *key)
+.EE
+.in
+.IP
+expects to read 8 bytes from the location pointed to by
+.IR key ,
+but the
+.I fp\ \-\ 4
+(where
+.I fp
+is the top of the stack)
+starting address will cause out-of-bounds stack access.
+.IP
+Similarly, when a map is created with a
+.I value_size
+of 1 and the eBPF program contains
+.IP
+.in +4n
+.EX
+value = bpf_map_lookup_elem(...);
+*(u32 *) value = 1;
+.EE
+.in
+.IP
+the program will be rejected, since it accesses the
+.I value
+pointer beyond the specified 1 byte
+.I value_size
+limit.
+.IP
+Currently, the following values are supported for
+.IR map_type :
+.IP
+.in +4n
+.EX
+enum bpf_map_type {
+ BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */
+ BPF_MAP_TYPE_HASH,
+ BPF_MAP_TYPE_ARRAY,
+ BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ BPF_MAP_TYPE_PERCPU_HASH,
+ BPF_MAP_TYPE_PERCPU_ARRAY,
+ BPF_MAP_TYPE_STACK_TRACE,
+ BPF_MAP_TYPE_CGROUP_ARRAY,
+ BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ BPF_MAP_TYPE_LPM_TRIE,
+ BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ BPF_MAP_TYPE_HASH_OF_MAPS,
+ BPF_MAP_TYPE_DEVMAP,
+ BPF_MAP_TYPE_SOCKMAP,
+ BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_XSKMAP,
+ BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_CGROUP_STORAGE,
+ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+ BPF_MAP_TYPE_QUEUE,
+ BPF_MAP_TYPE_STACK,
+ /* See /usr/include/linux/bpf.h for the full list. */
+};
+.EE
+.in
+.IP
+.I map_type
+selects one of the available map implementations in the kernel.
+.\" FIXME We need an explanation of why one might choose each of
+.\" these map implementations
+For all map types,
+eBPF programs access maps with the same
+.BR bpf_map_lookup_elem ()
+and
+.BR bpf_map_update_elem ()
+helper functions.
+Further details of the various map types are given below.
+.TP
+.B BPF_MAP_LOOKUP_ELEM
+The
+.B BPF_MAP_LOOKUP_ELEM
+command looks up an element with a given
+.I key
+in the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_lookup_elem(int fd, const void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ };
+\&
+ return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+If an element is found,
+the operation returns zero and stores the element's value into
+.IR value ,
+which must point to a buffer of
+.I value_size
+bytes.
+.IP
+If no element is found, the operation returns \-1 and sets
+.I errno
+to
+.BR ENOENT .
+.TP
+.B BPF_MAP_UPDATE_ELEM
+The
+.B BPF_MAP_UPDATE_ELEM
+command
+creates or updates an element with a given
+.I key/value
+in the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_update_elem(int fd, const void *key, const void *value,
+ uint64_t flags)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ .flags = flags,
+ };
+\&
+ return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+The
+.I flags
+argument should be specified as one of the following:
+.RS
+.TP
+.B BPF_ANY
+Create a new element or update an existing element.
+.TP
+.B BPF_NOEXIST
+Create a new element only if it did not exist.
+.TP
+.B BPF_EXIST
+Update an existing element.
+.RE
+.IP
+On success, the operation returns zero.
+On error, \-1 is returned and
+.I errno
+is set to
+.BR EINVAL ,
+.BR EPERM ,
+.BR ENOMEM ,
+or
+.BR E2BIG .
+.B E2BIG
+indicates that the number of elements in the map reached the
+.I max_entries
+limit specified at map creation time.
+.B EEXIST
+will be returned if
+.I flags
+specifies
+.B BPF_NOEXIST
+and the element with
+.I key
+already exists in the map.
+.B ENOENT
+will be returned if
+.I flags
+specifies
+.B BPF_EXIST
+and the element with
+.I key
+doesn't exist in the map.
+.TP
+.B BPF_MAP_DELETE_ELEM
+The
+.B BPF_MAP_DELETE_ELEM
+command
+deletes the element whose key is
+.I key
+from the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_delete_elem(int fd, const void *key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ };
+\&
+ return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+On success, zero is returned.
+If the element is not found, \-1 is returned and
+.I errno
+is set to
+.BR ENOENT .
+.TP
+.B BPF_MAP_GET_NEXT_KEY
+The
+.B BPF_MAP_GET_NEXT_KEY
+command looks up an element by
+.I key
+in the map referred to by the file descriptor
+.I fd
+and sets the
+.I next_key
+pointer to the key of the next element.
+.IP
+.in +4n
+.EX
+int
+bpf_get_next_key(int fd, const void *key, void *next_key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .next_key = ptr_to_u64(next_key),
+ };
+\&
+ return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+If
+.I key
+is found, the operation returns zero and sets the
+.I next_key
+pointer to the key of the next element.
+If
+.I key
+is not found, the operation returns zero and sets the
+.I next_key
+pointer to the key of the first element.
+If
+.I key
+is the last element, \-1 is returned and
+.I errno
+is set to
+.BR ENOENT .
+Other possible
+.I errno
+values are
+.BR ENOMEM ,
+.BR EFAULT ,
+.BR EPERM ,
+and
+.BR EINVAL .
+This method can be used to iterate over all elements in the map.
+.TP
+.B close(map_fd)
+Delete the map referred to by the file descriptor
+.IR map_fd .
+When the user-space program that created a map exits, all maps will
+be deleted automatically (but see NOTES).
+.\"
+.SS eBPF map types
+The following map types are supported:
+.TP
+.B BPF_MAP_TYPE_HASH
+.\" commit 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475
+Hash-table maps have the following characteristics:
+.RS
+.IP \[bu] 3
+Maps are created and destroyed by user-space programs.
+Both user-space and eBPF programs
+can perform lookup, update, and delete operations.
+.IP \[bu]
+The kernel takes care of allocating and freeing key/value pairs.
+.IP \[bu]
+The
+.BR map_update_elem ()
+helper will fail to insert new element when the
+.I max_entries
+limit is reached.
+(This ensures that eBPF programs cannot exhaust memory.)
+.IP \[bu]
+.BR map_update_elem ()
+replaces existing elements atomically.
+.RE
+.IP
+Hash-table maps are
+optimized for speed of lookup.
+.TP
+.B BPF_MAP_TYPE_ARRAY
+.\" commit 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3
+Array maps have the following characteristics:
+.RS
+.IP \[bu] 3
+Optimized for fastest possible lookup.
+In the future the verifier/JIT compiler
+may recognize lookup() operations that employ a constant key
+and optimize it into constant pointer.
+It is possible to optimize a non-constant
+key into direct pointer arithmetic as well, since pointers and
+.I value_size
+are constant for the life of the eBPF program.
+In other words,
+.BR array_map_lookup_elem ()
+may be 'inlined' by the verifier/JIT compiler
+while preserving concurrent access to this map from user space.
+.IP \[bu]
+All array elements pre-allocated and zero initialized at init time
+.IP \[bu]
+The key is an array index, and must be exactly four bytes.
+.IP \[bu]
+.BR map_delete_elem ()
+fails with the error
+.BR EINVAL ,
+since elements cannot be deleted.
+.IP \[bu]
+.BR map_update_elem ()
+replaces elements in a
+.B nonatomic
+fashion;
+for atomic updates, a hash-table map should be used instead.
+There is however one special case that can also be used with arrays:
+the atomic built-in
+.B __sync_fetch_and_add()
+can be used on 32 and 64 bit atomic counters.
+For example, it can be
+applied on the whole value itself if it represents a single counter,
+or in case of a structure containing multiple counters, it could be
+used on individual counters.
+This is quite often useful for aggregation and accounting of events.
+.RE
+.IP
+Among the uses for array maps are the following:
+.RS
+.IP \[bu] 3
+As "global" eBPF variables: an array of 1 element whose key is (index) 0
+and where the value is a collection of 'global' variables which
+eBPF programs can use to keep state between events.
+.IP \[bu]
+Aggregation of tracing events into a fixed set of buckets.
+.IP \[bu]
+Accounting of networking events, for example, number of packets and packet
+sizes.
+.RE
+.TP
+.BR BPF_MAP_TYPE_PROG_ARRAY " (since Linux 4.2)"
+A program array map is a special kind of array map whose map values
+contain only file descriptors referring to other eBPF programs.
+Thus, both the
+.I key_size
+and
+.I value_size
+must be exactly four bytes.
+This map is used in conjunction with the
+.BR bpf_tail_call ()
+helper.
+.IP
+This means that an eBPF program with a program array map attached to it
+can call from kernel side into
+.IP
+.in +4n
+.EX
+void bpf_tail_call(void *context, void *prog_map,
+ unsigned int index);
+.EE
+.in
+.IP
+and therefore replace its own program flow with the one from the program
+at the given program array slot, if present.
+This can be regarded as kind of a jump table to a different eBPF program.
+The invoked program will then reuse the same stack.
+When a jump into the new program has been performed,
+it won't return to the old program anymore.
+.IP
+If no eBPF program is found at the given index of the program array
+(because the map slot doesn't contain a valid program file descriptor,
+the specified lookup index/key is out of bounds,
+or the limit of 32
+.\" MAX_TAIL_CALL_CNT
+nested calls has been exceed),
+execution continues with the current eBPF program.
+This can be used as a fall-through for default cases.
+.IP
+A program array map is useful, for example, in tracing or networking, to
+handle individual system calls or protocols in their own subprograms and
+use their identifiers as an individual map index.
+This approach may result in performance benefits,
+and also makes it possible to overcome the maximum
+instruction limit of a single eBPF program.
+In dynamic environments,
+a user-space daemon might atomically replace individual subprograms
+at run-time with newer versions to alter overall program behavior,
+for instance, if global policies change.
+.\"
+.SS eBPF programs
+The
+.B BPF_PROG_LOAD
+command is used to load an eBPF program into the kernel.
+The return value for this command is a new file descriptor associated
+with this eBPF program.
+.PP
+.in +4n
+.EX
+char bpf_log_buf[LOG_BUF_SIZE];
+\&
+int
+bpf_prog_load(enum bpf_prog_type type,
+ const struct bpf_insn *insns, int insn_cnt,
+ const char *license)
+{
+ union bpf_attr attr = {
+ .prog_type = type,
+ .insns = ptr_to_u64(insns),
+ .insn_cnt = insn_cnt,
+ .license = ptr_to_u64(license),
+ .log_buf = ptr_to_u64(bpf_log_buf),
+ .log_size = LOG_BUF_SIZE,
+ .log_level = 1,
+ };
+\&
+ return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+.EE
+.in
+.PP
+.I prog_type
+is one of the available program types:
+.IP
+.in +4n
+.EX
+enum bpf_prog_type {
+ BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid
+ program type */
+ BPF_PROG_TYPE_SOCKET_FILTER,
+ BPF_PROG_TYPE_KPROBE,
+ BPF_PROG_TYPE_SCHED_CLS,
+ BPF_PROG_TYPE_SCHED_ACT,
+ BPF_PROG_TYPE_TRACEPOINT,
+ BPF_PROG_TYPE_XDP,
+ BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
+ BPF_PROG_TYPE_CGROUP_SOCK,
+ BPF_PROG_TYPE_LWT_IN,
+ BPF_PROG_TYPE_LWT_OUT,
+ BPF_PROG_TYPE_LWT_XMIT,
+ BPF_PROG_TYPE_SOCK_OPS,
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_CGROUP_DEVICE,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_RAW_TRACEPOINT,
+ BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_PROG_TYPE_LWT_SEG6LOCAL,
+ BPF_PROG_TYPE_LIRC_MODE2,
+ BPF_PROG_TYPE_SK_REUSEPORT,
+ BPF_PROG_TYPE_FLOW_DISSECTOR,
+ /* See /usr/include/linux/bpf.h for the full list. */
+};
+.EE
+.in
+.PP
+For further details of eBPF program types, see below.
+.PP
+The remaining fields of
+.I bpf_attr
+are set as follows:
+.IP \[bu] 3
+.I insns
+is an array of
+.I "struct bpf_insn"
+instructions.
+.IP \[bu]
+.I insn_cnt
+is the number of instructions in the program referred to by
+.IR insns .
+.IP \[bu]
+.I license
+is a license string, which must be GPL compatible to call helper functions
+marked
+.IR gpl_only .
+(The licensing rules are the same as for kernel modules,
+so that also dual licenses, such as "Dual BSD/GPL", may be used.)
+.IP \[bu]
+.I log_buf
+is a pointer to a caller-allocated buffer in which the in-kernel
+verifier can store the verification log.
+This log is a multi-line string that can be checked by
+the program author in order to understand how the verifier came to
+the conclusion that the eBPF program is unsafe.
+The format of the output can change at any time as the verifier evolves.
+.IP \[bu]
+.I log_size
+size of the buffer pointed to by
+.IR log_buf .
+If the size of the buffer is not large enough to store all
+verifier messages, \-1 is returned and
+.I errno
+is set to
+.BR ENOSPC .
+.IP \[bu]
+.I log_level
+verbosity level of the verifier.
+A value of zero means that the verifier will not provide a log;
+in this case,
+.I log_buf
+must be a NULL pointer, and
+.I log_size
+must be zero.
+.PP
+Applying
+.BR close (2)
+to the file descriptor returned by
+.B BPF_PROG_LOAD
+will unload the eBPF program (but see NOTES).
+.PP
+Maps are accessible from eBPF programs and are used to exchange data between
+eBPF programs and between eBPF programs and user-space programs.
+For example,
+eBPF programs can process various events (like kprobe, packets) and
+store their data into a map,
+and user-space programs can then fetch data from the map.
+Conversely, user-space programs can use a map as a configuration mechanism,
+populating the map with values checked by the eBPF program,
+which then modifies its behavior on the fly according to those values.
+.\"
+.\"
+.SS eBPF program types
+The eBPF program type
+.RI ( prog_type )
+determines the subset of kernel helper functions that the program
+may call.
+The program type also determines the program input (context)\[em]the
+format of
+.I "struct bpf_context"
+(which is the data blob passed into the eBPF program as the first argument).
+.\"
+.\" FIXME
+.\" Somewhere in this page we need a general introduction to the
+.\" bpf_context. For example, how does a BPF program access the
+.\" context?
+.PP
+For example, a tracing program does not have the exact same
+subset of helper functions as a socket filter program
+(though they may have some helpers in common).
+Similarly,
+the input (context) for a tracing program is a set of register values,
+while for a socket filter it is a network packet.
+.PP
+The set of functions available to eBPF programs of a given type may increase
+in the future.
+.PP
+The following program types are supported:
+.TP
+.BR BPF_PROG_TYPE_SOCKET_FILTER " (since Linux 3.19)"
+Currently, the set of functions for
+.B BPF_PROG_TYPE_SOCKET_FILTER
+is:
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, void *key)
+ /* look up key in a map_fd */
+bpf_map_update_elem(map_fd, void *key, void *value)
+ /* update key/value */
+bpf_map_delete_elem(map_fd, void *key)
+ /* delete key in a map_fd */
+.EE
+.in
+.IP
+The
+.I bpf_context
+argument is a pointer to a
+.IR "struct __sk_buff" .
+.\" FIXME: We need some text here to explain how the program
+.\" accesses __sk_buff.
+.\" See 'struct __sk_buff' and commit 9bac3d6d548e5
+.\"
+.\" Alexei commented:
+.\" Actually now in case of SOCKET_FILTER, SCHED_CLS, SCHED_ACT
+.\" the program can now access skb fields.
+.\"
+.TP
+.BR BPF_PROG_TYPE_KPROBE " (since Linux 4.1)"
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.\"
+.\" FIXME We need text here to describe 'kern_version'
+.TP
+.BR BPF_PROG_TYPE_SCHED_CLS " (since Linux 4.1)"
+.\" commit 96be4325f443dbbfeb37d2a157675ac0736531a1
+.\" commit e2e9b6541dd4b31848079da80fe2253daaafb549
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.TP
+.BR BPF_PROG_TYPE_SCHED_ACT " (since Linux 4.1)"
+.\" commit 94caee8c312d96522bcdae88791aaa9ebcd5f22c
+.\" commit a8cb5f556b567974d75ea29c15181c445c541b1f
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.SS Events
+Once a program is loaded, it can be attached to an event.
+Various kernel subsystems have different ways to do so.
+.PP
+Since Linux 3.19,
+.\" commit 89aa075832b0da4402acebd698d0411dcc82d03e
+the following call will attach the program
+.I prog_fd
+to the socket
+.IR sockfd ,
+which was created by an earlier call to
+.BR socket (2):
+.PP
+.in +4n
+.EX
+setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF,
+ &prog_fd, sizeof(prog_fd));
+.EE
+.in
+.PP
+Since Linux 4.1,
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+the following call may be used to attach
+the eBPF program referred to by the file descriptor
+.I prog_fd
+to a perf event file descriptor,
+.IR event_fd ,
+that was created by a previous call to
+.BR perf_event_open (2):
+.PP
+.in +4n
+.EX
+ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+.EE
+.in
+.\"
+.\"
+.SH RETURN VALUE
+For a successful call, the return value depends on the operation:
+.TP
+.B BPF_MAP_CREATE
+The new file descriptor associated with the eBPF map.
+.TP
+.B BPF_PROG_LOAD
+The new file descriptor associated with the eBPF program.
+.TP
+All other commands
+Zero.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The eBPF program is too large or a map reached the
+.I max_entries
+limit (maximum number of elements).
+.TP
+.B EACCES
+For
+.BR BPF_PROG_LOAD ,
+even though all program instructions are valid, the program has been
+rejected because it was deemed unsafe.
+This may be because it may have
+accessed a disallowed memory region or an uninitialized stack/register or
+because the function constraints don't match the actual types or because
+there was a misaligned memory access.
+In this case, it is recommended to call
+.BR bpf ()
+again with
+.I log_level = 1
+and examine
+.I log_buf
+for the specific reason provided by the verifier.
+.TP
+.B EAGAIN
+For
+.BR BPF_PROG_LOAD ,
+indicates that needed resources are blocked.
+This happens when the verifier detects pending signals
+while it is checking the validity of the bpf program.
+In this case, just call
+.BR bpf ()
+again with the same parameters.
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EFAULT
+One of the pointers
+.RI ( key
+or
+.I value
+or
+.I log_buf
+or
+.IR insns )
+is outside the accessible address space.
+.TP
+.B EINVAL
+The value specified in
+.I cmd
+is not recognized by this kernel.
+.TP
+.B EINVAL
+For
+.BR BPF_MAP_CREATE ,
+either
+.I map_type
+or attributes are invalid.
+.TP
+.B EINVAL
+For
+.B BPF_MAP_*_ELEM
+commands,
+some of the fields of
+.I "union bpf_attr"
+that are not used by this command
+are not set to zero.
+.TP
+.B EINVAL
+For
+.BR BPF_PROG_LOAD ,
+indicates an attempt to load an invalid program.
+eBPF programs can be deemed
+invalid due to unrecognized instructions, the use of reserved fields, jumps
+out of range, infinite loops or calls of unknown functions.
+.TP
+.B ENOENT
+For
+.B BPF_MAP_LOOKUP_ELEM
+or
+.BR BPF_MAP_DELETE_ELEM ,
+indicates that the element with the given
+.I key
+was not found.
+.TP
+.B ENOMEM
+Cannot allocate sufficient memory.
+.TP
+.B EPERM
+The call was made without sufficient privilege
+(without the
+.B CAP_SYS_ADMIN
+capability).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.18.
+.SH NOTES
+Prior to Linux 4.4, all
+.BR bpf ()
+commands require the caller to have the
+.B CAP_SYS_ADMIN
+capability.
+From Linux 4.4 onwards,
+.\" commit 1be7f75d1668d6296b80bf35dcf6762393530afc
+an unprivileged user may create limited programs of type
+.B BPF_PROG_TYPE_SOCKET_FILTER
+and associated maps.
+However they may not store kernel pointers within
+the maps and are presently limited to the following helper functions:
+.\" [Linux 5.6] mtk: The list of available functions is, I think, governed
+.\" by the check in net/core/filter.c::bpf_base_func_proto().
+.IP \[bu] 3
+get_random
+.PD 0
+.IP \[bu]
+get_smp_processor_id
+.IP \[bu]
+tail_call
+.IP \[bu]
+ktime_get_ns
+.PD
+.PP
+Unprivileged access may be blocked by writing the value 1 to the file
+.IR /proc/sys/kernel/unprivileged_bpf_disabled .
+.PP
+eBPF objects (maps and programs) can be shared between processes.
+For example, after
+.BR fork (2),
+the child inherits file descriptors referring to the same eBPF objects.
+In addition, file descriptors referring to eBPF objects can be
+transferred over UNIX domain sockets.
+File descriptors referring to eBPF objects can be duplicated
+in the usual way, using
+.BR dup (2)
+and similar calls.
+An eBPF object is deallocated only after all file descriptors
+referring to the object have been closed.
+.PP
+eBPF programs can be written in a restricted C that is compiled (using the
+.B clang
+compiler) into eBPF bytecode.
+Various features are omitted from this restricted C, such as loops,
+global variables, variadic functions, floating-point numbers,
+and passing structures as function arguments.
+Some examples can be found in the
+.I samples/bpf/*_kern.c
+files in the kernel source tree.
+.\" There are also examples for the tc classifier, in the iproute2
+.\" project, in examples/bpf
+.PP
+The kernel contains a just-in-time (JIT) compiler that translates
+eBPF bytecode into native machine code for better performance.
+Before Linux 4.15,
+the JIT compiler is disabled by default,
+but its operation can be controlled by writing one of the
+following integer strings to the file
+.IR /proc/sys/net/core/bpf_jit_enable :
+.TP
+.B 0
+Disable JIT compilation (default).
+.TP
+.B 1
+Normal compilation.
+.TP
+.B 2
+Debugging mode.
+The generated opcodes are dumped in hexadecimal into the kernel log.
+These opcodes can then be disassembled using the program
+.I tools/net/bpf_jit_disasm.c
+provided in the kernel source tree.
+.PP
+Since Linux 4.15,
+.\" commit 290af86629b25ffd1ed6232c4e9107da031705cb
+the kernel may configured with the
+.B CONFIG_BPF_JIT_ALWAYS_ON
+option.
+In this case, the JIT compiler is always enabled, and the
+.I bpf_jit_enable
+is initialized to 1 and is immutable.
+(This kernel configuration option was provided as a mitigation for
+one of the Spectre attacks against the BPF interpreter.)
+.PP
+The JIT compiler for eBPF is currently
+.\" Last reviewed in Linux 4.18-rc by grepping for BPF_ALU64 in arch/
+.\" and by checking the documentation for bpf_jit_enable in
+.\" Documentation/sysctl/net.txt
+available for the following architectures:
+.IP \[bu] 3
+x86-64 (since Linux 3.18; cBPF since Linux 3.0);
+.\" commit 0a14842f5a3c0e88a1e59fac5c3025db39721f74
+.PD 0
+.IP \[bu]
+ARM32 (since Linux 3.18; cBPF since Linux 3.4);
+.\" commit ddecdfcea0ae891f782ae853771c867ab51024c2
+.IP \[bu]
+SPARC 32 (since Linux 3.18; cBPF since Linux 3.5);
+.\" commit 2809a2087cc44b55e4377d7b9be3f7f5d2569091
+.IP \[bu]
+ARM-64 (since Linux 3.18);
+.\" commit e54bcde3d69d40023ae77727213d14f920eb264a
+.IP \[bu]
+s390 (since Linux 4.1; cBPF since Linux 3.7);
+.\" commit c10302efe569bfd646b4c22df29577a4595b4580
+.IP \[bu]
+PowerPC 64 (since Linux 4.8; cBPF since Linux 3.1);
+.\" commit 0ca87f05ba8bdc6791c14878464efc901ad71e99
+.\" commit 156d0e290e969caba25f1851c52417c14d141b24
+.IP \[bu]
+SPARC 64 (since Linux 4.12);
+.\" commit 7a12b5031c6b947cc13918237ae652b536243b76
+.IP \[bu]
+x86-32 (since Linux 4.18);
+.\" commit 03f5781be2c7b7e728d724ac70ba10799cc710d7
+.IP \[bu]
+MIPS 64 (since Linux 4.18; cBPF since Linux 3.16);
+.\" commit c6610de353da5ca6eee5b8960e838a87a90ead0c
+.\" commit f381bf6d82f032b7410185b35d000ea370ac706b
+.IP \[bu]
+riscv (since Linux 5.1).
+.\" commit 2353ecc6f91fd15b893fa01bf85a1c7a823ee4f2
+.PD
+.SH EXAMPLES
+.\" [[FIXME]] SRC BEGIN (bpf.c)
+.EX
+/* bpf+sockets example:
+ * 1. create array map of 256 elements
+ * 2. load program that counts number of packets received
+ * r0 = skb\->data[ETH_HLEN + offsetof(struct iphdr, protocol)]
+ * map[r0]++
+ * 3. attach prog_fd to raw socket via setsockopt()
+ * 4. print number of received TCP/UDP packets every second
+ */
+int
+main(int argc, char *argv[])
+{
+ int sock, map_fd, prog_fd, key;
+ long long value = 0, tcp_cnt, udp_cnt;
+\&
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key),
+ sizeof(value), 256);
+ if (map_fd < 0) {
+ printf("failed to create map \[aq]%s\[aq]\en", strerror(errno));
+ /* likely not run as root */
+ return 1;
+ }
+\&
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)),
+ /* r0 = ip\->proto */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, \-4),
+ /* *(u32 *)(fp \- 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, \-4), /* r2 = r2 \- 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */
+ BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
+ /* r0 = map_lookup(r1, r2) */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ /* if (r0 == 0) goto pc+2 */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0),
+ /* lock *(u64 *) r0 += r1 */
+.\" == atomic64_add
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(), /* return r0 */
+ };
+\&
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+ sizeof(prog) / sizeof(prog[0]), "GPL");
+\&
+ sock = open_raw_sock("lo");
+\&
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+ sizeof(prog_fd)) == 0);
+\&
+ for (;;) {
+ key = IPPROTO_TCP;
+ assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+ key = IPPROTO_UDP;
+ assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+ printf("TCP %lld UDP %lld packets\en", tcp_cnt, udp_cnt);
+ sleep(1);
+ }
+\&
+ return 0;
+}
+.EE
+.\" SRC END
+.PP
+Some complete working code can be found in the
+.I samples/bpf
+directory in the kernel source tree.
+.SH SEE ALSO
+.BR seccomp (2),
+.BR bpf\-helpers (7),
+.BR socket (7),
+.BR tc (8),
+.BR tc\-bpf (8)
+.PP
+Both classic and extended BPF are explained in the kernel source file
+.IR Documentation/networking/filter.txt .