Diffstat (limited to 'headers/xdp')
-rw-r--r-- | headers/xdp/libxdp.h | 174
-rw-r--r-- | headers/xdp/parsing_helpers.h | 279
-rw-r--r-- | headers/xdp/prog_dispatcher.h | 34
-rw-r--r-- | headers/xdp/xdp_helpers.h | 12
-rw-r--r-- | headers/xdp/xdp_sample.bpf.h | 130
-rw-r--r-- | headers/xdp/xdp_sample_common.bpf.h | 297
-rw-r--r-- | headers/xdp/xdp_sample_shared.h | 19
-rw-r--r-- | headers/xdp/xdp_stats_kern.h | 50
-rw-r--r-- | headers/xdp/xdp_stats_kern_user.h | 27
-rw-r--r-- | headers/xdp/xsk.h | 271
10 files changed, 1293 insertions, 0 deletions
diff --git a/headers/xdp/libxdp.h b/headers/xdp/libxdp.h
new file mode 100644
index 0000000..c1a6066
--- /dev/null
+++ b/headers/xdp/libxdp.h
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * XDP management utility functions
+ *
+ * Copyright (C) 2020 Toke Høiland-Jørgensen <toke@redhat.com>
+ */
+
+#ifndef __LIBXDP_LIBXDP_H
+#define __LIBXDP_LIBXDP_H
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "xdp_helpers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define XDP_BPFFS_ENVVAR "LIBXDP_BPFFS"
+#define XDP_BPFFS_MOUNT_ENVVAR "LIBXDP_BPFFS_AUTOMOUNT"
+#define XDP_OBJECT_ENVVAR "LIBXDP_OBJECT_PATH"
+
+enum xdp_attach_mode {
+        XDP_MODE_UNSPEC = 0,
+        XDP_MODE_NATIVE,
+        XDP_MODE_SKB,
+        XDP_MODE_HW
+};
+
+/* This is compatible with libbpf logging levels */
+enum libxdp_print_level {
+        LIBXDP_WARN,
+        LIBXDP_INFO,
+        LIBXDP_DEBUG,
+};
+typedef int (*libxdp_print_fn_t)(enum libxdp_print_level level,
+                                 const char *, va_list ap);
+
+libxdp_print_fn_t libxdp_set_print(libxdp_print_fn_t fn);
+
+struct xdp_program;
+struct xdp_multiprog;
+
+long libxdp_get_error(const void *ptr);
+int libxdp_strerror(int err, char *buf, size_t size);
+int libxdp_clean_references(int ifindex);
+
+struct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj,
+                                              const char *section_name);
+struct xdp_program *xdp_program__find_file(const char *filename,
+                                           const char *section_name,
+                                           struct bpf_object_open_opts *opts);
+struct xdp_program *xdp_program__open_file(const char *filename,
+                                           const char *section_name,
+                                           struct bpf_object_open_opts *opts);
+struct xdp_program *xdp_program__from_fd(int fd);
+struct xdp_program *xdp_program__from_id(__u32 prog_id);
+struct xdp_program *xdp_program__from_pin(const char *pin_path);
+struct xdp_program *xdp_program__clone(struct xdp_program *xdp_prog,
+                                       unsigned int flags);
+
+void xdp_program__close(struct xdp_program *xdp_prog);
+int xdp_program__test_run(struct xdp_program *xdp_prog,
+                          struct bpf_test_run_opts *opts,
+                          unsigned int flags);
+
+enum xdp_attach_mode xdp_program__is_attached(const struct xdp_program *xdp_prog,
+                                              int ifindex);
+const char *xdp_program__name(const struct xdp_program *xdp_prog);
+const unsigned char *xdp_program__tag(const struct xdp_program *xdp_prog);
+struct bpf_object *xdp_program__bpf_obj(struct xdp_program *xdp_prog);
+const struct btf *xdp_program__btf(struct xdp_program *xdp_prog);
+uint32_t xdp_program__id(const struct xdp_program *xdp_prog);
+int xdp_program__fd(const struct xdp_program *xdp_prog);
+unsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog);
+int xdp_program__set_run_prio(struct xdp_program *xdp_prog,
+                              unsigned int run_prio);
+bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog,
+                                     enum xdp_action action);
+int xdp_program__set_chain_call_enabled(struct xdp_program *prog,
+                                        unsigned int action,
+                                        bool enabled);
+int xdp_program__print_chain_call_actions(const struct xdp_program *prog,
+                                          char *buf,
+                                          size_t buf_len);
+bool xdp_program__xdp_frags_support(const struct xdp_program *prog);
+int xdp_program__set_xdp_frags_support(struct xdp_program *prog, bool frags);
+
+int xdp_program__pin(struct xdp_program *xdp_prog, const char *pin_path);
+int xdp_program__attach(struct xdp_program *xdp_prog,
+                        int ifindex, enum xdp_attach_mode mode,
+                        unsigned int flags);
+int xdp_program__attach_multi(struct xdp_program **progs, size_t num_progs,
+                              int ifindex, enum xdp_attach_mode mode,
+                              unsigned int flags);
+int xdp_program__detach(struct xdp_program *xdp_prog,
+                        int ifindex, enum xdp_attach_mode mode,
+                        unsigned int flags);
+int xdp_program__detach_multi(struct xdp_program **progs, size_t num_progs,
+                              int ifindex, enum xdp_attach_mode mode,
+                              unsigned int flags);
+
+struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex);
+struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog,
+                                             const struct xdp_multiprog *mp);
+void xdp_multiprog__close(struct xdp_multiprog *mp);
+int xdp_multiprog__detach(struct xdp_multiprog *mp);
+enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp);
+struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp);
+struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp);
+bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp);
+int xdp_multiprog__program_count(const struct xdp_multiprog *mp);
+bool xdp_multiprog__xdp_frags_support(const struct xdp_multiprog *mp);
+
+/* Only the following combinations of members can be set at once:
+ *
+ * @obj, @prog_name
+ *      Create using the BPF program with name @prog_name in BPF object @obj.
+ *
+ *      @prog_name is optional. In the absence of @prog_name, the first
+ *      program of the BPF object is picked.
+ *
+ * @find_filename, @prog_name, @opts
+ *      Create using the BPF program with name @prog_name in the BPF object
+ *      located in LIBXDP_OBJECT_PATH with filename @find_filename, using
+ *      bpf_object_open_opts @opts.
+ *
+ *      @prog_name and @opts are optional. In the absence of @prog_name, the
+ *      first program of the BPF object is picked.
+ *
+ * @open_filename, @prog_name, @opts
+ *      Create using the BPF program with name @prog_name in the BPF object
+ *      located at path @open_filename, using bpf_object_open_opts @opts.
+ *
+ *      @prog_name and @opts are optional. In the absence of @prog_name, the
+ *      first program of the BPF object is picked.
+ *
+ * @id
+ *      Load from the BPF program with ID @id.
+ *
+ * @fd
+ *      Load from the BPF program with fd @fd.
+ *
+ * When one of these combinations is set, all other members of the opts struct
+ * must be zeroed out.
+ */
+struct xdp_program_opts {
+        size_t sz;
+        struct bpf_object *obj;
+        struct bpf_object_open_opts *opts;
+        const char *prog_name;
+        const char *find_filename;
+        const char *open_filename;
+        const char *pin_path;
+        __u32 id;
+        int fd;
+        size_t :0;
+};
+#define xdp_program_opts__last_field fd
+
+#define DECLARE_LIBXDP_OPTS DECLARE_LIBBPF_OPTS
+
+struct xdp_program *xdp_program__create(struct xdp_program_opts *opts);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
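[Editor's note: for orientation, a minimal userspace sketch (not part of the diff) of how the API above fits together: open a BPF object, attach it to an interface, report errors via libxdp_strerror(). The object filename my_prog.o and the wrapper function are hypothetical.]

/* Hypothetical usage sketch for libxdp.h */
#include <net/if.h>
#include <xdp/libxdp.h>

int attach_first_prog(const char *ifname)
{
        struct xdp_program *prog;
        char errmsg[1024];
        int err, ifindex;

        ifindex = if_nametoindex(ifname);
        if (!ifindex)
                return -1;

        /* NULL section name: pick the first program in the object */
        prog = xdp_program__open_file("my_prog.o", NULL, NULL);
        err = libxdp_get_error(prog);
        if (err) {
                libxdp_strerror(err, errmsg, sizeof(errmsg));
                fprintf(stderr, "Couldn't open my_prog.o: %s\n", errmsg);
                return err;
        }

        /* XDP_MODE_UNSPEC lets libxdp pick the best available attach mode */
        err = xdp_program__attach(prog, ifindex, XDP_MODE_UNSPEC, 0);
        if (err)
                xdp_program__close(prog);
        return err;
}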
diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h
new file mode 100644
index 0000000..2fc7b6a
--- /dev/null
+++ b/headers/xdp/parsing_helpers.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+/*
+ * This file contains parsing functions that can be used in XDP programs. The
+ * functions are marked as __always_inline, and fully defined in this header
+ * file to be included in the BPF program.
+ *
+ * Each helper parses a packet header, including doing bounds checking, and
+ * returns the type of its contents if successful, and -1 otherwise.
+ *
+ * For Ethernet and IP headers, the content type is the type of the payload
+ * (h_proto for Ethernet, nexthdr for IPv6); for ICMP it is the ICMP type
+ * field. All return values are in host byte order, except for parse_ethhdr(),
+ * which returns the EtherType in network byte order.
+ */
+
+#ifndef __PARSING_HELPERS_H
+#define __PARSING_HELPERS_H
+
+#include <stddef.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
+#include <bpf/bpf_endian.h>
+
+/* Header cursor to keep track of current parsing position */
+struct hdr_cursor {
+        void *pos;
+};
+
+/*
+ * struct vlan_hdr - vlan header
+ * @h_vlan_TCI: priority and VLAN ID
+ * @h_vlan_encapsulated_proto: packet type ID or len
+ */
+struct vlan_hdr {
+        __be16 h_vlan_TCI;
+        __be16 h_vlan_encapsulated_proto;
+};
+
+/*
+ * Struct icmphdr_common represents the common part of the icmphdr and icmp6hdr
+ * structures.
+ */
+struct icmphdr_common {
+        __u8 type;
+        __u8 code;
+        __sum16 cksum;
+};
+
+/* Allow users of this header file to redefine VLAN max depth */
+#ifndef VLAN_MAX_DEPTH
+#define VLAN_MAX_DEPTH 4
+#endif
+
+/* Longest chain of IPv6 extension headers to resolve */
+#ifndef IPV6_EXT_MAX_CHAIN
+#define IPV6_EXT_MAX_CHAIN 6
+#endif
+
+static __always_inline int proto_is_vlan(__u16 h_proto)
+{
+        return !!(h_proto == bpf_htons(ETH_P_8021Q) ||
+                  h_proto == bpf_htons(ETH_P_8021AD));
+}
+
+/* Note that parse_ethhdr() skips VLAN tags by advancing nh->pos and returns
+ * the EtherType of the next header, BUT the ethhdr pointer supplied still
+ * points to the Ethernet header. Thus, the caller can look at eth->h_proto to
+ * see if this was a VLAN tagged packet.
+ */
+static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
+                                        struct ethhdr **ethhdr)
+{
+        struct ethhdr *eth = nh->pos;
+        struct vlan_hdr *vlh;
+        __u16 h_proto;
+        int i;
+
+        if (eth + 1 > data_end)
+                return -1;
+
+        nh->pos = eth + 1;
+        *ethhdr = eth;
+        vlh = nh->pos;
+        h_proto = eth->h_proto;
+
+        /* Use loop unrolling to avoid the verifier restriction on loops;
+         * support up to VLAN_MAX_DEPTH layers of VLAN encapsulation.
+         */
+        #pragma unroll
+        for (i = 0; i < VLAN_MAX_DEPTH; i++) {
+                if (!proto_is_vlan(h_proto))
+                        break;
+
+                if (vlh + 1 > data_end)
+                        break;
+
+                h_proto = vlh->h_vlan_encapsulated_proto;
+                vlh++;
+        }
+
+        nh->pos = vlh;
+        return h_proto; /* network byte order */
+}
+
+static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh,
+                                          void *data_end,
+                                          __u8 next_hdr_type)
+{
+        for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) {
+                struct ipv6_opt_hdr *hdr = nh->pos;
+
+                if (hdr + 1 > data_end)
+                        return -1;
+
+                switch (next_hdr_type) {
+                case IPPROTO_HOPOPTS:
+                case IPPROTO_DSTOPTS:
+                case IPPROTO_ROUTING:
+                case IPPROTO_MH:
+                        nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8;
+                        next_hdr_type = hdr->nexthdr;
+                        break;
+                case IPPROTO_AH:
+                        nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4;
+                        next_hdr_type = hdr->nexthdr;
+                        break;
+                case IPPROTO_FRAGMENT:
+                        nh->pos = (char *)hdr + 8;
+                        next_hdr_type = hdr->nexthdr;
+                        break;
+                default:
+                        /* Found a header that is not an IPv6 extension header */
+                        return next_hdr_type;
+                }
+        }
+
+        return -1;
+}
+
+static __always_inline int parse_ip6hdr(struct hdr_cursor *nh,
+                                        void *data_end,
+                                        struct ipv6hdr **ip6hdr)
+{
+        struct ipv6hdr *ip6h = nh->pos;
+
+        /* Pointer-arithmetic bounds check: pointer +1 points to just after
+         * the end of the thing being pointed to. We will use this style in
+         * the remainder of the tutorial.
+         */
+        if (ip6h + 1 > data_end)
+                return -1;
+
+        nh->pos = ip6h + 1;
+        *ip6hdr = ip6h;
+
+        return skip_ip6hdrext(nh, data_end, ip6h->nexthdr);
+}
+
+static __always_inline int parse_iphdr(struct hdr_cursor *nh,
+                                       void *data_end,
+                                       struct iphdr **iphdr)
+{
+        struct iphdr *iph = nh->pos;
+        int hdrsize;
+
+        if (iph + 1 > data_end)
+                return -1;
+
+        hdrsize = iph->ihl * 4;
+
+        /* Variable-length IPv4 header, need to use byte-based arithmetic */
+        if (nh->pos + hdrsize > data_end)
+                return -1;
+
+        nh->pos += hdrsize;
+        *iphdr = iph;
+
+        return iph->protocol;
+}
+
+static __always_inline int parse_icmp6hdr(struct hdr_cursor *nh,
+                                          void *data_end,
+                                          struct icmp6hdr **icmp6hdr)
+{
+        struct icmp6hdr *icmp6h = nh->pos;
+
+        if (icmp6h + 1 > data_end)
+                return -1;
+
+        nh->pos = icmp6h + 1;
+        *icmp6hdr = icmp6h;
+
+        return icmp6h->icmp6_type;
+}
+
+static __always_inline int parse_icmphdr(struct hdr_cursor *nh,
+                                         void *data_end,
+                                         struct icmphdr **icmphdr)
+{
+        struct icmphdr *icmph = nh->pos;
+
+        if (icmph + 1 > data_end)
+                return -1;
+
+        nh->pos = icmph + 1;
+        *icmphdr = icmph;
+
+        return icmph->type;
+}
+
+static __always_inline int parse_icmphdr_common(struct hdr_cursor *nh,
+                                                void *data_end,
+                                                struct icmphdr_common **icmphdr)
+{
+        struct icmphdr_common *h = nh->pos;
+
+        if (h + 1 > data_end)
+                return -1;
+
+        nh->pos = h + 1;
+        *icmphdr = h;
+
+        return h->type;
+}
+
+/*
+ * parse_udphdr: parse the udp header and return the length of the udp payload
+ */
+static __always_inline int parse_udphdr(struct hdr_cursor *nh,
+                                        void *data_end,
+                                        struct udphdr **udphdr)
+{
+        int len;
+        struct udphdr *h = nh->pos;
+
+        if (h + 1 > data_end)
+                return -1;
+
+        nh->pos = h + 1;
+        *udphdr = h;
+
+        len = bpf_ntohs(h->len) - sizeof(struct udphdr);
+        if (len < 0)
+                return -1;
+
+        return len;
+}
+
+/*
+ * parse_tcphdr: parse and return the length of the tcp header
+ */
+static __always_inline int parse_tcphdr(struct hdr_cursor *nh,
+                                        void *data_end,
+                                        struct tcphdr **tcphdr)
+{
+        int len;
+        struct tcphdr *h = nh->pos;
+
+        if (h + 1 > data_end)
+                return -1;
+
+        len = h->doff * 4;
+        if ((void *) h + len > data_end)
+                return -1;
+
+        nh->pos = h + 1;
+        *tcphdr = h;
+
+        return len;
+}
+
+#endif /* __PARSING_HELPERS_H */
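[Editor's note: to show how these helpers compose, a minimal sketch (not part of the diff) of an XDP program that drops IPv4 TCP packets and passes everything else; the program name is hypothetical. Note that parse_ethhdr() returns the EtherType in network byte order, hence the bpf_htons() comparison.]

/* Hypothetical example program using parsing_helpers.h */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>

SEC("xdp")
int xdp_drop_tcp(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data = (void *)(long)ctx->data;
        struct hdr_cursor nh = { .pos = data };
        struct ethhdr *eth;
        struct iphdr *iph;
        int eth_type, ip_type;

        eth_type = parse_ethhdr(&nh, data_end, &eth);
        if (eth_type != bpf_htons(ETH_P_IP))
                return XDP_PASS;

        ip_type = parse_iphdr(&nh, data_end, &iph);
        if (ip_type == IPPROTO_TCP)
                return XDP_DROP;

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";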
diff --git a/headers/xdp/prog_dispatcher.h b/headers/xdp/prog_dispatcher.h
new file mode 100644
index 0000000..e7ead85
--- /dev/null
+++ b/headers/xdp/prog_dispatcher.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+
+#ifndef __PROG_DISPATCHER_H
+#define __PROG_DISPATCHER_H
+
+#include <linux/types.h>
+
+#define XDP_METADATA_SECTION "xdp_metadata"
+#define XDP_DISPATCHER_VERSION 2
+
+/* magic byte is 'X' + 'D' + 'P' (88+68+80=236) */
+#define XDP_DISPATCHER_MAGIC 236
+/* The default retval for the dispatcher corresponds to the highest bit in the
+ * chain_call_actions bitmap; we use this to make sure the dispatcher always
+ * continues the call chain if a function does not have an freplace program
+ * attached.
+ */
+#define XDP_DISPATCHER_RETVAL 31
+
+#ifndef MAX_DISPATCHER_ACTIONS
+#define MAX_DISPATCHER_ACTIONS 10
+#endif
+
+struct xdp_dispatcher_config {
+        __u8 magic;              /* Set to XDP_DISPATCHER_MAGIC */
+        __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */
+        __u8 num_progs_enabled;  /* Number of active program slots */
+        __u8 is_xdp_frags;       /* Whether this dispatcher is loaded with XDP frags support */
+        __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS];
+        __u32 run_prios[MAX_DISPATCHER_ACTIONS];
+        __u32 program_flags[MAX_DISPATCHER_ACTIONS];
+};
+
+#endif
diff --git a/headers/xdp/xdp_helpers.h b/headers/xdp/xdp_helpers.h
new file mode 100644
index 0000000..ec29536
--- /dev/null
+++ b/headers/xdp/xdp_helpers.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) */
+
+#ifndef __XDP_HELPERS_H
+#define __XDP_HELPERS_H
+
+#define _CONCAT(x,y) x ## y
+#define XDP_RUN_CONFIG(f) _CONCAT(_,f) SEC(".xdp_run_config")
+
+#define XDP_DEFAULT_RUN_PRIO 50
+#define XDP_DEFAULT_CHAIN_CALL_ACTIONS (1<<XDP_PASS)
+
+#endif
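[Editor's note: XDP_RUN_CONFIG() is how a BPF program embeds libxdp run-config metadata. A sketch (not part of the diff), following the convention documented for libxdp, declaring run priority 10 and chain calls on XDP_PASS and XDP_DROP; the function name is hypothetical.]

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/xdp_helpers.h>

/* Metadata consumed by libxdp when loading the program into a dispatcher */
struct {
        __uint(priority, 10);
        __uint(XDP_PASS, 1);
        __uint(XDP_DROP, 1);
} XDP_RUN_CONFIG(my_xdp_func);

SEC("xdp")
int my_xdp_func(struct xdp_md *ctx)
{
        return XDP_PASS;
}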
diff --git a/headers/xdp/xdp_sample.bpf.h b/headers/xdp/xdp_sample.bpf.h
new file mode 100644
index 0000000..6f31895
--- /dev/null
+++ b/headers/xdp/xdp_sample.bpf.h
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _XDP_SAMPLE_BPF_H
+#define _XDP_SAMPLE_BPF_H
+
+#include <linux/bpf.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "xdp_sample_shared.h"
+
+#define ETH_ALEN 6
+#define ETH_P_802_3_MIN 0x0600
+#define ETH_P_8021Q 0x8100
+#define ETH_P_8021AD 0x88A8
+#define ETH_P_IP 0x0800
+#define ETH_P_IPV6 0x86DD
+#define ETH_P_ARP 0x0806
+#define IPPROTO_ICMPV6 58
+
+#define EINVAL 22
+#define ENETDOWN 100
+#define EMSGSIZE 90
+#define EOPNOTSUPP 95
+#define ENOSPC 28
+
+typedef struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(map_flags, BPF_F_MMAPABLE);
+        __type(key, unsigned int);
+        __type(value, struct datarec);
+} array_map;
+
+extern array_map rx_cnt;
+extern const volatile int nr_cpus;
+
+enum {
+        XDP_REDIRECT_SUCCESS = 0,
+        XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline void swap_src_dst_mac(void *data)
+{
+        unsigned short *p = data;
+        unsigned short dst[3];
+
+        dst[0] = p[0];
+        dst[1] = p[1];
+        dst[2] = p[2];
+        p[0] = p[3];
+        p[1] = p[4];
+        p[2] = p[5];
+        p[3] = dst[0];
+        p[4] = dst[1];
+        p[5] = dst[2];
+}
+
+/*
+ * Note: including linux/compiler.h or linux/kernel.h for the macros below
+ * conflicts with the vmlinux.h include in BPF files, so we define them here.
+ *
+ * The following functions are taken from kernel sources and break aliasing
+ * rules in their original form.
+ *
+ * While the kernel is compiled with -fno-strict-aliasing, perf uses
+ * -Wstrict-aliasing=3, which makes the build fail under gcc 4.4.
+ *
+ * Using an extra __may_alias__ type allows aliasing in this case.
+ */
+typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+        switch (size) {
+        case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
+        case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+        case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+        case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+        default:
+                asm volatile ("" : : : "memory");
+                __builtin_memcpy((void *)res, (const void *)p, size);
+                asm volatile ("" : : : "memory");
+        }
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+        switch (size) {
+        case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
+        case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+        case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+        case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+        default:
+                asm volatile ("" : : : "memory");
+                __builtin_memcpy((void *)p, (const void *)res, size);
+                asm volatile ("" : : : "memory");
+        }
+}
+
+#define READ_ONCE(x) \
+({ \
+        union { typeof(x) __val; char __c[1]; } __u = \
+                { .__c = { 0 } }; \
+        __read_once_size(&(x), __u.__c, sizeof(x)); \
+        __u.__val; \
+})
+
+#define WRITE_ONCE(x, val) \
+({ \
+        union { typeof(x) __val; char __c[1]; } __u = \
+                { .__val = (val) }; \
+        __write_once_size(&(x), __u.__c, sizeof(x)); \
+        __u.__val; \
+})
+
+/* Add a value using relaxed read and relaxed write. Less expensive than
+ * fetch_add when there is no write concurrency.
+ */
+#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val))
+#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1)
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#endif
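[Editor's note: a sketch (not part of the diff) of how the sample code uses these macros, mirroring xdp_redirect_collect_stat() in the next file. The stats arrays hold one struct datarec per (key, CPU) pair, so each CPU writes only its own slot and the NO_TEAR_*() macros suffice without atomics. The helper name is hypothetical, in a BPF file that includes the sample headers.]

#include "xdp_sample_common.bpf.h"

static __always_inline void count_rx(__u32 key)
{
        __u32 cpu = bpf_get_smp_processor_id();
        __u32 idx = key * nr_cpus + cpu; /* one slot per (key, CPU) pair */
        struct datarec *rec;

        rec = bpf_map_lookup_elem(&rx_cnt, &idx);
        if (!rec)
                return;
        NO_TEAR_INC(rec->processed); /* relaxed read+write, no atomics */
}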
diff --git a/headers/xdp/xdp_sample_common.bpf.h b/headers/xdp/xdp_sample_common.bpf.h
new file mode 100644
index 0000000..3a7263f
--- /dev/null
+++ b/headers/xdp/xdp_sample_common.bpf.h
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */
+#ifndef _XDP_SAMPLE_COMMON_BPF_H
+#define _XDP_SAMPLE_COMMON_BPF_H
+
+#include "xdp_sample.bpf.h"
+
+#include <bpf/vmlinux.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+array_map rx_cnt SEC(".maps");
+array_map redir_err_cnt SEC(".maps");
+array_map cpumap_enqueue_cnt SEC(".maps");
+array_map cpumap_kthread_cnt SEC(".maps");
+array_map exception_cnt SEC(".maps");
+array_map devmap_xmit_cnt SEC(".maps");
+array_map rxq_cnt SEC(".maps");
+
+struct {
+        __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+        __uint(max_entries, 32 * 32);
+        __type(key, __u64);
+        __type(value, struct datarec);
+} devmap_xmit_cnt_multi SEC(".maps");
+
+const volatile int nr_cpus = 0;
+
+/* These can be set before loading so that redundant comparisons can be DCE'd
+ * by the verifier, and only actual matches are tried after the tp_btf program
+ * is loaded. This allows the sample to filter tracepoint stats based on
+ * net_device.
+ */
+const volatile int from_match[32] = {};
+const volatile int to_match[32] = {};
+
+int cpumap_map_id = 0;
+
+/* Find if b is part of set a, but if a is an empty set then evaluate to true */
+#define IN_SET(a, b) \
+        ({ \
+                bool __res = !(a)[0]; \
+                for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \
+                        __res = (a)[i] == (b); \
+                        if (__res) \
+                                break; \
+                } \
+                __res; \
+        })
+
+static __always_inline __u32 xdp_get_err_key(int err)
+{
+        switch (err) {
+        case 0:
+                return 0;
+        case -EINVAL:
+                return 2;
+        case -ENETDOWN:
+                return 3;
+        case -EMSGSIZE:
+                return 4;
+        case -EOPNOTSUPP:
+                return 5;
+        case -ENOSPC:
+                return 6;
+        default:
+                return 1;
+        }
+}
+
+static __always_inline int xdp_redirect_collect_stat(int from, int err)
+{
+        __u32 cpu = bpf_get_smp_processor_id();
+        __u32 key = XDP_REDIRECT_ERROR;
+        struct datarec *rec;
+        __u32 idx;
+
+        if (!IN_SET(from_match, from))
+                return 0;
+
+        key = xdp_get_err_key(err);
+
+        idx = key * nr_cpus + cpu;
+        rec = bpf_map_lookup_elem(&redir_err_cnt, &idx);
+        if (!rec)
+                return 0;
+        if (key)
+                NO_TEAR_INC(rec->dropped);
+        else
+                NO_TEAR_INC(rec->processed);
+        return 0; /* Indicate event was filtered (no further processing) */
+        /*
+         * Returning 1 here would allow e.g. a perf-record tracepoint
+         * to see and record these events, but it doesn't work well in
+         * practice, as stopping perf-record also unloads this
+         * bpf_prog. Plus, there is the additional overhead of doing so.
+         */
+}
+
+SEC("tp_btf/xdp_redirect_err")
+int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev,
+             const struct bpf_prog *xdp, const void *tgt, int err,
+             const struct bpf_map *map, __u32 index)
+{
+        return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect_map_err")
+int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev,
+             const struct bpf_prog *xdp, const void *tgt, int err,
+             const struct bpf_map *map, __u32 index)
+{
+        return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect")
+int BPF_PROG(tp_xdp_redirect, const struct net_device *dev,
+             const struct bpf_prog *xdp, const void *tgt, int err,
+             const struct bpf_map *map, __u32 index)
+{
+        return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_redirect_map")
+int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev,
+             const struct bpf_prog *xdp, const void *tgt, int err,
+             const struct bpf_map *map, __u32 index)
+{
+        return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+SEC("tp_btf/xdp_cpumap_enqueue")
+int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed,
+             unsigned int drops, int to_cpu)
+{
+        __u32 cpu = bpf_get_smp_processor_id();
+        struct datarec *rec;
+        __u32 idx;
+
+        if (cpumap_map_id && cpumap_map_id != map_id)
+                return 0;
+
+        idx = to_cpu * nr_cpus + cpu;
+        rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx);
+        if (!rec)
+                return 0;
+        NO_TEAR_ADD(rec->processed, processed);
+        NO_TEAR_ADD(rec->dropped, drops);
+        /* Record bulk events, then userspace can calc average bulk size */
+        if (processed > 0)
+                NO_TEAR_INC(rec->issue);
+        /* Inception: It's possible to detect overload situations via this
+         * tracepoint. This can be used for creating a feedback loop to XDP,
+         * which can take appropriate actions to mitigate this overload
+         * situation.
+         */
+        return 0;
+}
+
+SEC("tp_btf/xdp_cpumap_kthread")
+int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed,
+             unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats)
+{
+        struct datarec *rec;
+        __u32 cpu;
+
+        if (cpumap_map_id && cpumap_map_id != map_id)
+                return 0;
+
+        cpu = bpf_get_smp_processor_id();
+        rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu);
+        if (!rec)
+                return 0;
+        NO_TEAR_ADD(rec->processed, processed);
+        NO_TEAR_ADD(rec->dropped, drops);
+        NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass);
+        NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop);
+        NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect);
+        /* Count times kthread yielded CPU via schedule call */
+        if (sched)
+                NO_TEAR_INC(rec->issue);
+        return 0;
+}
+
+SEC("tp_btf/xdp_cpumap_kthread")
+int BPF_PROG(tp_xdp_cpumap_compat, int map_id, unsigned int processed,
+             unsigned int drops, int sched)
+{
+        struct datarec *rec;
+        __u32 cpu;
+
+        if (cpumap_map_id && cpumap_map_id != map_id)
+                return 0;
+
+        cpu = bpf_get_smp_processor_id();
+        rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu);
+        if (!rec)
+                return 0;
+        NO_TEAR_ADD(rec->processed, processed);
+        NO_TEAR_ADD(rec->dropped, drops);
+        /* Count times kthread yielded CPU via schedule call */
+        if (sched)
+                NO_TEAR_INC(rec->issue);
+        return 0;
+}
+
+SEC("tp_btf/xdp_exception")
+int BPF_PROG(tp_xdp_exception, const struct net_device *dev,
+             const struct bpf_prog *xdp, __u32 act)
+{
+        __u32 cpu = bpf_get_smp_processor_id();
+        struct datarec *rec;
+        __u32 key = act, idx;
+
+        if (!IN_SET(from_match, dev->ifindex))
+                return 0;
+        if (!IN_SET(to_match, dev->ifindex))
+                return 0;
+
+        if (key > XDP_REDIRECT)
+                key = XDP_REDIRECT + 1;
+
+        idx = key * nr_cpus + cpu;
+        rec = bpf_map_lookup_elem(&exception_cnt, &idx);
+        if (!rec)
+                return 0;
+        NO_TEAR_INC(rec->dropped);
+
+        return 0;
+}
+
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev,
+             const struct net_device *to_dev, int sent, int drops, int err)
+{
+        struct datarec *rec;
+        int idx_in, idx_out;
+        __u32 cpu;
+
+        idx_in = from_dev->ifindex;
+        idx_out = to_dev->ifindex;
+
+        if (!IN_SET(from_match, idx_in))
+                return 0;
+        if (!IN_SET(to_match, idx_out))
+                return 0;
+
+        cpu = bpf_get_smp_processor_id();
+        rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu);
+        if (!rec)
+                return 0;
+        NO_TEAR_ADD(rec->processed, sent);
+        NO_TEAR_ADD(rec->dropped, drops);
+        /* Record bulk events, then userspace can calc average bulk size */
+        NO_TEAR_INC(rec->info);
+        /* Record error cases, where no frames were sent */
+        /* Catch the API error case where the driver's ndo_xdp_xmit sent more than count */
+        if (err || drops < 0)
+                NO_TEAR_INC(rec->issue);
+        return 0;
+}
+
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev,
+             const struct net_device *to_dev, int sent, int drops, int err)
+{
+        struct datarec empty = {};
+        struct datarec *rec;
+        int idx_in, idx_out;
+        __u64 idx;
+
+        idx_in = from_dev->ifindex;
+        idx_out = to_dev->ifindex;
+        idx = idx_in;
+        idx = idx << 32 | idx_out;
+
+        if (!IN_SET(from_match, idx_in))
+                return 0;
+        if (!IN_SET(to_match, idx_out))
+                return 0;
+
+        bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST);
+        rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx);
+        if (!rec)
+                return 0;
+
+        NO_TEAR_ADD(rec->processed, sent);
+        NO_TEAR_ADD(rec->dropped, drops);
+        NO_TEAR_INC(rec->info);
+        if (err || drops < 0)
+                NO_TEAR_INC(rec->issue);
+        return 0;
+}
+
+#endif
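[Editor's note: the const volatile filter arrays above live in the .rodata section, so userspace must set them before the object is loaded. A hypothetical sketch assuming a libbpf skeleton generated from this file as xdp_sample.skel.h; the skeleton name and functions are assumptions, but libbpf_num_possible_cpus() is real libbpf API.]

#include <bpf/libbpf.h>
#include "xdp_sample.skel.h" /* hypothetical generated skeleton */

int setup_sample(int from_ifindex)
{
        struct xdp_sample *skel = xdp_sample__open();

        if (!skel)
                return -1;

        /* .rodata values are frozen at load time, set them now */
        skel->rodata->nr_cpus = libbpf_num_possible_cpus();
        skel->rodata->from_match[0] = from_ifindex;

        return xdp_sample__load(skel);
}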
diff --git a/headers/xdp/xdp_sample_shared.h b/headers/xdp/xdp_sample_shared.h
new file mode 100644
index 0000000..2a7b006
--- /dev/null
+++ b/headers/xdp/xdp_sample_shared.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _XDP_SAMPLE_SHARED_H
+#define _XDP_SAMPLE_SHARED_H
+
+#include <stddef.h>
+
+struct datarec {
+        size_t processed;
+        size_t dropped;
+        size_t issue;
+        union {
+                size_t xdp_pass;
+                size_t info;
+        };
+        size_t xdp_drop;
+        size_t xdp_redirect;
+} __attribute__((aligned(64)));
+
+#endif
diff --git a/headers/xdp/xdp_stats_kern.h b/headers/xdp/xdp_stats_kern.h
new file mode 100644
index 0000000..11fed4a
--- /dev/null
+++ b/headers/xdp/xdp_stats_kern.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Used *ONLY* by BPF programs running kernel side. */
+#ifndef __XDP_STATS_KERN_H
+#define __XDP_STATS_KERN_H
+
+/* The record type 'struct xdp_stats_record' is defined in
+ * common/xdp_stats_kern_user.h; programs using this header must include that
+ * file first.
+ */
+#ifndef __XDP_STATS_KERN_USER_H
+#warning "You forgot to #include <../common/xdp_stats_kern_user.h>"
+#include <../common/xdp_stats_kern_user.h>
+#endif
+
+#ifndef XDP_STATS_MAP_PINNING
+#define XDP_STATS_MAP_PINNING LIBBPF_PIN_BY_NAME
+#endif
+
+/* Keeps stats per (enum) xdp_action */
+struct {
+        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+        __uint(max_entries, XDP_ACTION_MAX);
+        __type(key, __u32);
+        __type(value, struct xdp_stats_record);
+        __uint(pinning, LIBBPF_PIN_BY_NAME);
+} XDP_STATS_MAP_NAME SEC(".maps");
+
+static __always_inline
+__u32 xdp_stats_record_action(struct xdp_md *ctx, __u32 action)
+{
+        if (action >= XDP_ACTION_MAX)
+                return XDP_ABORTED;
+
+        /* The kernel BPF-side lookup returns a pointer to the actual data record */
+        struct xdp_stats_record *rec = bpf_map_lookup_elem(&xdp_stats_map, &action);
+        if (!rec)
+                return XDP_ABORTED;
+
+        /* BPF_MAP_TYPE_PERCPU_ARRAY returns a data record specific to the
+         * current CPU, and the XDP hook runs under softirq, which makes it
+         * safe to update without atomic operations.
+         */
+        rec->rx_packets++;
+        rec->rx_bytes += (ctx->data_end - ctx->data);
+
+        return action;
+}
+
+#endif /* __XDP_STATS_KERN_H */
diff --git a/headers/xdp/xdp_stats_kern_user.h b/headers/xdp/xdp_stats_kern_user.h
new file mode 100644
index 0000000..25f3f9b
--- /dev/null
+++ b/headers/xdp/xdp_stats_kern_user.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Used by both kernel-side BPF programs and userspace programs,
+ * for sharing the common xdp_stats struct and defines.
+ */
+#ifndef __XDP_STATS_KERN_USER_H
+#define __XDP_STATS_KERN_USER_H
+
+/* This is the data record stored in the map */
+struct xdp_stats_record {
+        union {
+                __u64 packets;
+                __u64 rx_packets;
+        };
+        union {
+                __u64 bytes;
+                __u64 rx_bytes;
+        };
+};
+
+#ifndef XDP_ACTION_MAX
+#define XDP_ACTION_MAX (XDP_REDIRECT + 1)
+#endif
+
+#define XDP_STATS_MAP_NAME xdp_stats_map
+
+#endif /* __XDP_STATS_KERN_USER_H */
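[Editor's note: a sketch of the userspace side (not part of the diff). A BPF_MAP_TYPE_PERCPU_ARRAY lookup returns one record per possible CPU, which the reader must sum; the helper name below is hypothetical.]

#include <linux/types.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "xdp_stats_kern_user.h"

static int sum_stats(int map_fd, __u32 action, struct xdp_stats_record *sum)
{
        int nr_cpus = libbpf_num_possible_cpus();
        struct xdp_stats_record values[nr_cpus];
        int i;

        /* Per-CPU map: one value per possible CPU is written into values[] */
        if (bpf_map_lookup_elem(map_fd, &action, values))
                return -1;

        sum->rx_packets = 0;
        sum->rx_bytes = 0;
        for (i = 0; i < nr_cpus; i++) {
                sum->rx_packets += values[i].rx_packets;
                sum->rx_bytes += values[i].rx_bytes;
        }
        return 0;
}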
diff --git a/headers/xdp/xsk.h b/headers/xdp/xsk.h
new file mode 100644
index 0000000..92fb4ab
--- /dev/null
+++ b/headers/xdp/xsk.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2021 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+/* So as not to clash with these functions when they were part of libbpf */
+#ifndef __LIBBPF_XSK_H
+#define __LIBBPF_XSK_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <bpf/libbpf.h>
+#include <linux/if_xdp.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC_STDC_INLINE__
+#define XDP_ALWAYS_INLINE inline __attribute__((__always_inline__))
+#elif __GNUC_GNU_INLINE__
+#define XDP_ALWAYS_INLINE static inline __attribute__((__always_inline__))
+#else
+#define XDP_ALWAYS_INLINE static inline
+#endif
+
+/* Do not access these members directly. Use the functions below. */
+#define DEFINE_XSK_RING(name) \
+struct name { \
+        __u32 cached_prod; \
+        __u32 cached_cons; \
+        __u32 mask; \
+        __u32 size; \
+        __u32 *producer; \
+        __u32 *consumer; \
+        void *ring; \
+        __u32 *flags; \
+}
+
+DEFINE_XSK_RING(xsk_ring_prod);
+DEFINE_XSK_RING(xsk_ring_cons);
+
+/* For a detailed explanation of the memory barriers associated with the ring,
+ * please take a look at net/xdp/xsk_queue.h in the Linux kernel source tree.
+ */
+
+struct xsk_umem;
+struct xsk_socket;
+
+XDP_ALWAYS_INLINE __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
+                                                  __u32 idx)
+{
+        __u64 *addrs = (__u64 *)fill->ring;
+
+        return &addrs[idx & fill->mask];
+}
+
+XDP_ALWAYS_INLINE const __u64 *
+xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx)
+{
+        const __u64 *addrs = (const __u64 *)comp->ring;
+
+        return &addrs[idx & comp->mask];
+}
+
+XDP_ALWAYS_INLINE struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
+                                                          __u32 idx)
+{
+        struct xdp_desc *descs = (struct xdp_desc *)tx->ring;
+
+        return &descs[idx & tx->mask];
+}
+
+XDP_ALWAYS_INLINE const struct xdp_desc *
+xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
+{
+        const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring;
+
+        return &descs[idx & rx->mask];
+}
+
+XDP_ALWAYS_INLINE int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
+{
+        return *r->flags & XDP_RING_NEED_WAKEUP;
+}
+
+XDP_ALWAYS_INLINE __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
+{
+        __u32 free_entries = r->cached_cons - r->cached_prod;
+
+        if (free_entries >= nb)
+                return free_entries;
+
+        /* Refresh the local tail pointer.
+         * cached_cons is r->size bigger than the real consumer pointer so
+         * that this addition can be avoided in the more frequently
+         * executed code that computes free_entries at the beginning of
+         * this function. Without this optimization it would have been
+         * free_entries = r->cached_cons - r->cached_prod + r->size.
+         */
+        r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE);
+        r->cached_cons += r->size;
+
+        return r->cached_cons - r->cached_prod;
+}
+
+XDP_ALWAYS_INLINE __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
+{
+        __u32 entries = r->cached_prod - r->cached_cons;
+
+        if (entries == 0) {
+                r->cached_prod = __atomic_load_n(r->producer, __ATOMIC_ACQUIRE);
+                entries = r->cached_prod - r->cached_cons;
+        }
+
+        return (entries > nb) ? nb : entries;
+}
+
+XDP_ALWAYS_INLINE __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx)
+{
+        if (xsk_prod_nb_free(prod, nb) < nb)
+                return 0;
+
+        *idx = prod->cached_prod;
+        prod->cached_prod += nb;
+
+        return nb;
+}
+
+XDP_ALWAYS_INLINE void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb)
+{
+        /* Make sure everything has been written to the ring before indicating
+         * this to the kernel by writing the producer pointer.
+         */
+        __atomic_store_n(prod->producer, *prod->producer + nb, __ATOMIC_RELEASE);
+}
+
+XDP_ALWAYS_INLINE __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx)
+{
+        __u32 entries = xsk_cons_nb_avail(cons, nb);
+
+        if (entries > 0) {
+                *idx = cons->cached_cons;
+                cons->cached_cons += entries;
+        }
+
+        return entries;
+}
+
+XDP_ALWAYS_INLINE void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb)
+{
+        cons->cached_cons -= nb;
+}
+
+XDP_ALWAYS_INLINE void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb)
+{
+        /* Make sure data has been read before indicating we are done
+         * with the entries by updating the consumer pointer.
+         */
+        __atomic_store_n(cons->consumer, *cons->consumer + nb, __ATOMIC_RELEASE);
+}
+
+XDP_ALWAYS_INLINE void *xsk_umem__get_data(void *umem_area, __u64 addr)
+{
+        return &((char *)umem_area)[addr];
+}
+
+XDP_ALWAYS_INLINE __u64 xsk_umem__extract_addr(__u64 addr)
+{
+        return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+XDP_ALWAYS_INLINE __u64 xsk_umem__extract_offset(__u64 addr)
+{
+        return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+XDP_ALWAYS_INLINE __u64 xsk_umem__add_offset_to_addr(__u64 addr)
+{
+        return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr);
+}
+
+int xsk_umem__fd(const struct xsk_umem *umem);
+int xsk_socket__fd(const struct xsk_socket *xsk);
+
+#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048
+#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048
+#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */
+#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
+#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+#define XSK_UMEM__DEFAULT_FLAGS 0
+
+struct xsk_umem_config {
+        __u32 fill_size;
+        __u32 comp_size;
+        __u32 frame_size;
+        __u32 frame_headroom;
+        __u32 flags;
+};
+
+int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd);
+int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd);
+
+/* Flags for the libbpf_flags field.
+ * We still call this field libbpf_flags for compatibility reasons.
+ */
+#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
+#define XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
+
+struct xsk_socket_config {
+        __u32 rx_size;
+        __u32 tx_size;
+        union {
+                __u32 libbpf_flags;
+                __u32 libxdp_flags;
+        };
+        __u32 xdp_flags;
+        __u16 bind_flags;
+};
+
+/* Set config to NULL to get the default configuration. */
+int xsk_umem__create(struct xsk_umem **umem,
+                     void *umem_area, __u64 size,
+                     struct xsk_ring_prod *fill,
+                     struct xsk_ring_cons *comp,
+                     const struct xsk_umem_config *config);
+int xsk_socket__create(struct xsk_socket **xsk,
+                       const char *ifname, __u32 queue_id,
+                       struct xsk_umem *umem,
+                       struct xsk_ring_cons *rx,
+                       struct xsk_ring_prod *tx,
+                       const struct xsk_socket_config *config);
+int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
+                              const char *ifname,
+                              __u32 queue_id, struct xsk_umem *umem,
+                              struct xsk_ring_cons *rx,
+                              struct xsk_ring_prod *tx,
+                              struct xsk_ring_prod *fill,
+                              struct xsk_ring_cons *comp,
+                              const struct xsk_socket_config *config);
+
+/* Returns 0 for success and -EBUSY if the umem is still in use. */
+int xsk_umem__delete(struct xsk_umem *umem);
+void xsk_socket__delete(struct xsk_socket *xsk);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBBPF_XSK_H */
+
+/* For new functions post libbpf */
+#ifndef __LIBXDP_XSK_H
+#define __LIBXDP_XSK_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __LIBXDP_XSK_H */
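[Editor's note: finally, a hypothetical sketch of the receive path using the ring accessors above: peek a batch of descriptors, process each packet in the umem, then release the ring entries. The function name and batch size are assumptions.]

#include <xdp/xsk.h>

static void rx_batch(struct xsk_ring_cons *rx, void *umem_area)
{
        __u32 idx_rx = 0, i;
        __u32 rcvd = xsk_ring_cons__peek(rx, 64, &idx_rx); /* up to 64 descs */

        for (i = 0; i < rcvd; i++) {
                const struct xdp_desc *desc =
                        xsk_ring_cons__rx_desc(rx, idx_rx + i);
                void *pkt = xsk_umem__get_data(umem_area, desc->addr);

                /* ... process desc->len bytes of packet data at pkt ... */
                (void)pkt;
        }

        if (rcvd)
                xsk_ring_cons__release(rx, rcvd); /* hand entries back to the kernel */
}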