Diffstat (limited to 'lib/libxdp/xsk.c')
-rw-r--r-- | lib/libxdp/xsk.c | 1299
1 file changed, 1299 insertions, 0 deletions
diff --git a/lib/libxdp/xsk.c b/lib/libxdp/xsk.c
new file mode 100644
index 0000000..c6c201b
--- /dev/null
+++ b/lib/libxdp/xsk.c
@@ -0,0 +1,1299 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2021 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <dirent.h>
+#include <linux/err.h>
+#include <linux/ethtool.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/if_xdp.h>
+#include <linux/list.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <xdp/xsk.h>
+
+#include "libxdp_internal.h"
+#include "xsk_def_xdp_prog.h"
+#include "bpf_instr.h"
+
+#ifndef SOL_XDP
+ #define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+ #define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+ #define PF_XDP AF_XDP
+#endif
+
+#ifndef SO_NETNS_COOKIE
+ #define SO_NETNS_COOKIE 71
+#endif
+
+#define INIT_NS 1
+
+struct xsk_umem {
+	struct xsk_ring_prod *fill_save;
+	struct xsk_ring_cons *comp_save;
+	char *umem_area;
+	struct xsk_umem_config config;
+	int fd;
+	int refcount;
+	struct list_head ctx_list;
+	bool rx_ring_setup_done;
+	bool tx_ring_setup_done;
+};
+
+struct xsk_ctx {
+	struct xsk_ring_prod *fill;
+	struct xsk_ring_cons *comp;
+	struct xsk_umem *umem;
+	__u32 queue_id;
+	int refcount;
+	int ifindex;
+	__u64 netns_cookie;
+	int xsks_map_fd;
+	struct list_head list;
+	struct xdp_program *xdp_prog;
+	int refcnt_map_fd;
+	char ifname[IFNAMSIZ];
+};
+
+struct xsk_socket {
+	struct xsk_ring_cons *rx;
+	struct xsk_ring_prod *tx;
+	struct xsk_ctx *ctx;
+	struct xsk_socket_config config;
+	int fd;
+};
+
+struct xsk_nl_info {
+	int ifindex;
+	int fd;
+	bool xdp_prog_attached;
+};
+
+/* Up until and including Linux 5.3 */
+struct xdp_ring_offset_v1 {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+/* Up until and including Linux 5.3 */
+struct xdp_mmap_offsets_v1 {
+	struct xdp_ring_offset_v1 rx;
+	struct xdp_ring_offset_v1 tx;
+	struct xdp_ring_offset_v1 fr;
+	struct xdp_ring_offset_v1 cr;
+};
+
+/* Export all inline helpers as symbols for use by language bindings.
+ */
+extern inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
+					      __u32 idx);
+extern inline const __u64 *
+xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx);
+extern inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
+						      __u32 idx);
+extern inline const struct xdp_desc *
+xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx);
+extern inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r);
+extern inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb);
+extern inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb);
+extern inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb,
+					   __u32 *idx);
+extern inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb);
+extern inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb,
+					__u32 *idx);
+extern inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb);
+extern inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb);
+extern inline void *xsk_umem__get_data(void *umem_area, __u64 addr);
+extern inline __u64 xsk_umem__extract_addr(__u64 addr);
+extern inline __u64 xsk_umem__extract_offset(__u64 addr);
+extern inline __u64 xsk_umem__add_offset_to_addr(__u64 addr);
+
+int xsk_umem__fd(const struct xsk_umem *umem)
+{
+	return umem ? umem->fd : -EINVAL;
+}
+
+int xsk_socket__fd(const struct xsk_socket *xsk)
+{
+	return xsk ? xsk->fd : -EINVAL;
+}
+
+static bool xsk_page_aligned(void *buffer)
+{
+	unsigned long addr = (unsigned long)buffer;
+
+	return !(addr & (getpagesize() - 1));
+}
+
+static void xsk_set_umem_config(struct xsk_umem_config *cfg,
+				const struct xsk_umem_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
+		return;
+	}
+
+	cfg->fill_size = usr_cfg->fill_size;
+	cfg->comp_size = usr_cfg->comp_size;
+	cfg->frame_size = usr_cfg->frame_size;
+	cfg->frame_headroom = usr_cfg->frame_headroom;
+	cfg->flags = usr_cfg->flags;
+}
+
+static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
+				     const struct xsk_socket_config *usr_cfg)
+{
+	if (!usr_cfg) {
+		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+		cfg->libbpf_flags = 0;
+		cfg->xdp_flags = 0;
+		cfg->bind_flags = 0;
+		return 0;
+	}
+
+	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
+		return -EINVAL;
+
+	cfg->rx_size = usr_cfg->rx_size;
+	cfg->tx_size = usr_cfg->tx_size;
+	cfg->libbpf_flags = usr_cfg->libbpf_flags;
+	cfg->xdp_flags = usr_cfg->xdp_flags;
+	cfg->bind_flags = usr_cfg->bind_flags;
+
+	return 0;
+}
+
+static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
+{
+	struct xdp_mmap_offsets_v1 off_v1;
+
+	/* getsockopt on a kernel <= 5.3 has no flags fields.
+	 * Copy over the offsets to the correct places in the >= 5.4 format
+	 * and put the flags where they would have been on that kernel.
+	 */
+	memcpy(&off_v1, off, sizeof(off_v1));
+
+	off->rx.producer = off_v1.rx.producer;
+	off->rx.consumer = off_v1.rx.consumer;
+	off->rx.desc = off_v1.rx.desc;
+	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
+
+	off->tx.producer = off_v1.tx.producer;
+	off->tx.consumer = off_v1.tx.consumer;
+	off->tx.desc = off_v1.tx.desc;
+	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
+
+	off->fr.producer = off_v1.fr.producer;
+	off->fr.consumer = off_v1.fr.consumer;
+	off->fr.desc = off_v1.fr.desc;
+	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
+
+	off->cr.producer = off_v1.cr.producer;
+	off->cr.consumer = off_v1.cr.consumer;
+	off->cr.desc = off_v1.cr.desc;
+	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
+}
+
+static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
+{
+	socklen_t optlen;
+	int err;
+
+	optlen = sizeof(*off);
+	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
+	if (err)
+		return err;
+
+	if (optlen == sizeof(*off))
+		return 0;
+
+	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
+		xsk_mmap_offsets_v1(off);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
+				 struct xsk_ring_prod *fill,
+				 struct xsk_ring_cons *comp)
+{
+	struct xdp_mmap_offsets off;
+	void *map;
+	int err;
+
+	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
+			 &umem->config.fill_size,
+			 sizeof(umem->config.fill_size));
+	if (err)
+		return -errno;
+
+	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
+			 &umem->config.comp_size,
+			 sizeof(umem->config.comp_size));
+	if (err)
+		return -errno;
+
+	err = xsk_get_mmap_offsets(fd, &off);
+	if (err)
+		return -errno;
+
+	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
+		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+		   XDP_UMEM_PGOFF_FILL_RING);
+	if (map == MAP_FAILED)
+		return -errno;
+
+	fill->mask = umem->config.fill_size - 1;
+	fill->size = umem->config.fill_size;
+	fill->producer = map + off.fr.producer;
+	fill->consumer = map + off.fr.consumer;
+	fill->flags = map + off.fr.flags;
+	fill->ring = map + off.fr.desc;
+	fill->cached_cons = umem->config.fill_size;
+
+	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
+		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+		   XDP_UMEM_PGOFF_COMPLETION_RING);
+	if (map == MAP_FAILED) {
+		err = -errno;
+		goto out_mmap;
+	}
+
+	comp->mask = umem->config.comp_size - 1;
+	comp->size = umem->config.comp_size;
+	comp->producer = map + off.cr.producer;
+	comp->consumer = map + off.cr.consumer;
+	comp->flags = map + off.cr.flags;
+	comp->ring = map + off.cr.desc;
+
+	return 0;
+
+out_mmap:
+	/* Unmap the fill ring; "map" itself is MAP_FAILED at this point, so
+	 * recover the fill mapping from the ring pointer set above.
+	 */
+	munmap(fill->ring - off.fr.desc,
+	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+	return err;
+}
+
+int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
+		     __u64 size, struct xsk_ring_prod *fill,
+		     struct xsk_ring_cons *comp,
+		     const struct xsk_umem_config *usr_config)
+{
+	struct xdp_umem_reg mr;
+	struct xsk_umem *umem;
+	int err;
+
+	if (!umem_area || !umem_ptr || !fill || !comp)
+		return -EFAULT;
+	if (!size && !xsk_page_aligned(umem_area))
+		return -EINVAL;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem)
+		return -ENOMEM;
+
+	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
+	if (umem->fd < 0) {
+		err = -errno;
+		goto out_umem_alloc;
+	}
+
+	umem->umem_area = umem_area;
+	INIT_LIST_HEAD(&umem->ctx_list);
+	xsk_set_umem_config(&umem->config, usr_config);
+
+	memset(&mr, 0, sizeof(mr));
+	mr.addr = (uintptr_t)umem_area;
+	mr.len = size;
+	mr.chunk_size = umem->config.frame_size;
+	mr.headroom = umem->config.frame_headroom;
+	mr.flags = umem->config.flags;
+
+	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
+	if (err) {
+		err = -errno;
+		goto out_socket;
+	}
+
+	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
+	if (err)
+		goto out_socket;
+
+	umem->fill_save = fill;
+	umem->comp_save = comp;
+	*umem_ptr = umem;
+	return 0;
+
+out_socket:
+	close(umem->fd);
+out_umem_alloc:
+	free(umem);
+	return err;
+}
+
+static int xsk_init_xsk_struct(struct xsk_socket *xsk, int ifindex)
+{
+	char ifname[IFNAMSIZ];
+	struct xsk_ctx *ctx;
+	char *interface;
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (!ctx)
+		return -ENOMEM;
+
+	interface = if_indextoname(ifindex, &ifname[0]);
+	if (!interface) {
+		free(ctx);
+		return -errno;
+	}
+
+	ctx->ifindex = ifindex;
+	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
+	ctx->ifname[IFNAMSIZ - 1] = 0;
+
+	xsk->ctx = ctx;
+
+	return 0;
+}
+
+static enum xdp_attach_mode xsk_convert_xdp_flags(__u32 xdp_flags)
+{
+	if (xdp_flags & ~XDP_FLAGS_MASK)
+		pr_warn("XDP flag: 0x%x contains flags not supported by libxdp.\n", xdp_flags);
+
+	if (xdp_flags & XDP_FLAGS_SKB_MODE)
+		return XDP_MODE_SKB;
+	if (xdp_flags & XDP_FLAGS_DRV_MODE)
+		return XDP_MODE_NATIVE;
+	if (xdp_flags & XDP_FLAGS_HW_MODE)
+		return XDP_MODE_HW;
+
+	return XDP_MODE_NATIVE;
+}
+
+#define MAX_DEV_QUEUE_PATH_LEN 64
+
+static void xsk_get_queues_from_sysfs(const char *ifname, __u32 *rx, __u32 *tx)
+{
+	char buf[MAX_DEV_QUEUE_PATH_LEN];
+	struct dirent *entry;
+	DIR *dir;
+	int err;
+
+	*rx = *tx = 0;
+
+	err = try_snprintf(buf, MAX_DEV_QUEUE_PATH_LEN,
+			   "/sys/class/net/%s/queues/", ifname);
+	if (err)
+		return;
+
+	dir = opendir(buf);
+	if (dir == NULL)
+		return;
+
+	while ((entry = readdir(dir))) {
+		if (!strncmp(entry->d_name, "rx", 2))
+			++*rx;
+
+		if (!strncmp(entry->d_name, "tx", 2))
+			++*tx;
+	}
+
+	closedir(dir);
+}
+
+static int xsk_get_max_queues(char *ifname)
+{
+	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+	struct ifreq ifr = {};
+	int fd, err, ret;
+
+	fd = socket(AF_LOCAL, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return -errno;
+
+	ifr.ifr_data = (void *)&channels;
+	memcpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
+	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
+	err = ioctl(fd, SIOCETHTOOL, &ifr);
+	if (err && errno != EOPNOTSUPP) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (err) {
+		/* If the device says it has no channels, try to get the
+		 * rx/tx counts from sysfs; otherwise all traffic is sent
+		 * to a single stream, so max queues = 1.
+		 */
+		__u32 rx, tx;
+
+		xsk_get_queues_from_sysfs(ifr.ifr_name, &rx, &tx);
+		ret = max(max(rx, tx), 1);
+	} else {
+		/* Take the max of rx, tx and combined. Drivers return
+		 * the number of channels in different ways.
+		 */
+		ret = max(channels.max_rx, channels.max_tx);
+		ret = max(ret, (int)channels.max_combined);
+	}
+
+out:
+	close(fd);
+	return ret;
+}
+
+static int xsk_size_map(struct xdp_program *xdp_prog, char *ifname)
+{
+	struct bpf_object *bpf_obj = xdp_program__bpf_obj(xdp_prog);
+	struct bpf_map *map;
+	int max_queues;
+	int err;
+
+	max_queues = xsk_get_max_queues(ifname);
+	if (max_queues < 0)
+		return max_queues;
+
+	map = bpf_object__find_map_by_name(bpf_obj, "xsks_map");
+	if (!map)
+		return -ENOENT;
+
+	err = bpf_map__set_max_entries(map, max_queues);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void xsk_delete_map_entry(int xsks_map_fd, __u32 queue_id)
+{
+	bpf_map_delete_elem(xsks_map_fd, &queue_id);
+	close(xsks_map_fd);
+}
+
+static int xsk_lookup_map_by_filter(int prog_fd,
+				    bool (*map_info_filter)(struct bpf_map_info *map_info))
+{
+	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
+	__u32 map_len = sizeof(struct bpf_map_info);
+	struct bpf_prog_info prog_info = {};
+	int fd, err, xsks_map_fd = -ENOENT;
+	struct bpf_map_info map_info;
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len);
+	if (err)
+		return err;
+
+	num_maps = prog_info.nr_map_ids;
+
+	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
+	if (!map_ids)
+		return -ENOMEM;
+
+	memset(&prog_info, 0, prog_len);
+	prog_info.nr_map_ids = num_maps;
+	prog_info.map_ids = (__u64)(unsigned long)map_ids;
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len);
+	if (err) {
+		free(map_ids);
+		return err;
+	}
+
+	for (i = 0; i < prog_info.nr_map_ids; i++) {
+		fd = bpf_map_get_fd_by_id(map_ids[i]);
+		if (fd < 0)
+			continue;
+
+		memset(&map_info, 0, map_len);
+		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
+		if (err) {
+			close(fd);
+			continue;
+		}
+
+		if (map_info_filter(&map_info)) {
+			xsks_map_fd = fd;
+			break;
+		}
+
+		close(fd);
+	}
+
+	free(map_ids);
+	return xsks_map_fd;
+}
+
+static bool xsk_map_is_socket_map(struct bpf_map_info *map_info)
+{
+	return !strncmp(map_info->name, "xsks_map", sizeof(map_info->name)) &&
+		map_info->key_size == 4 && map_info->value_size == 4;
+}
+
+static bool xsk_map_is_refcnt_map(struct bpf_map_info *map_info)
+{
+	/* In order to avoid confusing users with multiple identically named
+	 * maps, libbpf names non-custom internal maps (.data, .bss, etc.)
+	 * in an unexpected way, namely the first 8 characters of a bpf object
+	 * name + a suffix signifying the internal map type,
+	 * ex. "xsk_def_" + ".data".
+	 */
+	return !strncmp(map_info->name, "xsk_def_.data",
+			sizeof(map_info->name)) &&
+		map_info->value_size >= sizeof(int);
+}
+
+static int xsk_lookup_bpf_map(int prog_fd)
+{
+	return xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_socket_map);
+}
+
+static int xsk_lookup_refcnt_map(int prog_fd, const char *xdp_filename)
+{
+	int map_fd = xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_refcnt_map);
+
+	if (map_fd >= 0)
+		goto out;
+
+	if (map_fd != -ENOENT) {
+		pr_debug("Error getting refcount map: %s\n", strerror(-map_fd));
+		goto out;
+	}
+
+	if (xdp_filename)
+		pr_warn("Refcount was not found in %s or kernel does not support required features, so automatic program removal on unload is disabled\n",
+			xdp_filename);
+	else
+		pr_warn("Another XSK socket was created by a version of libxdp that doesn't support program refcnt, so automatic program removal on unload is disabled.\n");
+out:
+	return map_fd;
+}
+
+#ifdef HAVE_LIBBPF_BPF_MAP_CREATE
+/* bpf_map_create() and the new bpf_prog_load() were added at the same time -
+ * however there's a naming conflict with another bpf_prog_load() function in
+ * older versions of libbpf; to avoid hitting that we create our own wrapper
+ * function for this one even with new libbpf versions.
+ */
+static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt)
+{
+	return bpf_prog_load(BPF_PROG_TYPE_XDP, "testprog",
+			     "GPL", insns, insns_cnt, NULL);
+}
+#else
+static int bpf_map_create(enum bpf_map_type map_type,
+			  __unused const char *map_name,
+			  __u32 key_size,
+			  __u32 value_size,
+			  __u32 max_entries,
+			  __unused void *opts)
+{
+	struct bpf_create_map_attr map_attr;
+
+	memset(&map_attr, 0, sizeof(map_attr));
+	map_attr.map_type = map_type;
+	map_attr.key_size = key_size;
+	map_attr.value_size = value_size;
+	map_attr.max_entries = max_entries;
+
+	return bpf_create_map_xattr(&map_attr);
+}
+
+static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt)
+{
+	struct bpf_load_program_attr prog_attr;
+
+	memset(&prog_attr, 0, sizeof(prog_attr));
+	prog_attr.prog_type = BPF_PROG_TYPE_XDP;
+	prog_attr.insns = insns;
+	prog_attr.insns_cnt = insns_cnt;
+	prog_attr.license = "GPL";
+
+	return bpf_load_program_xattr(&prog_attr, NULL, 0);
+}
+#endif
+
+static bool xsk_check_redirect_flags(void)
+{
+	char data_in = 0, data_out;
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+			    .data_in = &data_in,
+			    .data_out = &data_out,
+			    .data_size_in = 1);
+	struct bpf_insn insns[] = {
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_MOV64_IMM(BPF_REG_2, 0),
+		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
+		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, map_fd, ret;
+	bool detected = false;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xskmap",
+				sizeof(int), sizeof(int), 1, NULL);
+	if (map_fd < 0)
+		return detected;
+
+	insns[0].imm = map_fd;
+
+	prog_fd = xsk_check_create_prog(insns, ARRAY_SIZE(insns));
+	if (prog_fd < 0) {
+		close(map_fd);
+		return detected;
+	}
+
+	ret = bpf_prog_test_run_opts(prog_fd, &opts);
+	if (!ret && opts.retval == XDP_PASS)
+		detected = true;
+	close(prog_fd);
+	close(map_fd);
+	return detected;
+}
+
+static struct xdp_program *xsk_lookup_program(int ifindex)
+{
+	const char *version_name = "xsk_prog_version";
+	const char *prog_name = "xsk_def_prog";
+	struct xdp_multiprog *multi_prog;
+	struct xdp_program *prog = NULL;
+	__u32 version;
+	int err;
+
+	multi_prog = xdp_multiprog__get_from_ifindex(ifindex);
+	if (IS_ERR(multi_prog))
+		return NULL;
+
+	if (xdp_multiprog__is_legacy(multi_prog)) {
+		prog = xdp_multiprog__main_prog(multi_prog);
+		prog = strcmp(xdp_program__name(prog), prog_name) ? NULL : prog;
+		goto check;
+	}
+
+	while ((prog = xdp_multiprog__next_prog(prog, multi_prog)))
+		if (!strcmp(xdp_program__name(prog), prog_name))
+			break;
+
+check:
+	if (!prog)
+		goto out;
+
+	err = check_xdp_prog_version(xdp_program__btf(prog), version_name, &version);
+	if (err) {
+		prog = ERR_PTR(err);
+		goto out;
+	}
+	if (version > XSK_PROG_VERSION) {
+		pr_warn("XSK default program version %d higher than supported %d\n",
+			version, XSK_PROG_VERSION);
+		prog = ERR_PTR(-EOPNOTSUPP);
+	}
+
+out:
+	if (!IS_ERR_OR_NULL(prog))
+		prog = xdp_program__clone(prog, 0);
+
+	xdp_multiprog__close(multi_prog);
+	return prog;
+}
+
+static int xsk_update_prog_refcnt(int refcnt_map_fd, int delta)
+{
+	struct bpf_map_info map_info = {};
+	__u32 info_len = sizeof(map_info);
+	int *value_data = NULL;
+	int lock_fd, ret;
+	__u32 key = 0;
+
+	ret = bpf_obj_get_info_by_fd(refcnt_map_fd, &map_info, &info_len);
+	if (ret)
+		return ret;
+
+	value_data = calloc(1, map_info.value_size);
+	if (!value_data)
+		return -ENOMEM;
+
+	lock_fd = xdp_lock_acquire();
+	if (lock_fd < 0) {
+		ret = lock_fd;
+		goto out;
+	}
+
+	/* Note: if other global variables are added before the refcnt,
+	 * this changes the map's value type, not the number of elements,
+	 * so an additional offset must be applied to value_data when
+	 * reading the refcount; the map key always stays zero.
+	 */
+	ret = bpf_map_lookup_elem(refcnt_map_fd, &key, value_data);
+	if (ret)
+		goto unlock;
+
+	/* If refcount is 0, the program is awaiting detach and can't be used */
+	if (*value_data) {
+		*value_data += delta;
+		ret = bpf_map_update_elem(refcnt_map_fd, &key, value_data, 0);
+		if (ret)
+			goto unlock;
+	}
+
+	ret = *value_data;
+unlock:
+	xdp_lock_release(lock_fd);
+out:
+	free(value_data);
+	return ret;
+}
+
+static int xsk_incr_prog_refcnt(int refcnt_map_fd)
+{
+	return xsk_update_prog_refcnt(refcnt_map_fd, 1);
+}
+
+static int xsk_decr_prog_refcnt(int refcnt_map_fd)
+{
+	return xsk_update_prog_refcnt(refcnt_map_fd, -1);
+}
+
+static int __xsk_setup_xdp_prog(struct xsk_socket *xsk, int *xsks_map_fd)
+{
+	const char *fallback_prog = "xsk_def_xdp_prog_5.3.o";
+	const char *default_prog = "xsk_def_xdp_prog.o";
+	struct xsk_ctx *ctx = xsk->ctx;
+	const char *file_name = NULL;
+	bool attached = false;
+	int err;
+
+	ctx->xdp_prog = xsk_lookup_program(ctx->ifindex);
+	if (IS_ERR(ctx->xdp_prog))
+		return PTR_ERR(ctx->xdp_prog);
+
+	ctx->refcnt_map_fd = -ENOENT;
+
+	if (ctx->xdp_prog) {
+		int refcnt;
+
+		ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog),
+							   NULL);
+		if (ctx->refcnt_map_fd == -ENOENT)
+			goto map_lookup;
+
+		if (ctx->refcnt_map_fd < 0) {
+			err = ctx->refcnt_map_fd;
+			goto err_prog_load;
+		}
+
+		refcnt = xsk_incr_prog_refcnt(ctx->refcnt_map_fd);
+		if (refcnt < 0) {
+			err = refcnt;
+			pr_debug("Error occurred when incrementing xsk XDP prog refcount: %s\n",
+				 strerror(-err));
+			goto err_prog_load;
+		}
+
+		if (!refcnt) {
+			pr_warn("Current program is being detached, falling back on creating a new program\n");
+			close(ctx->refcnt_map_fd);
+			ctx->refcnt_map_fd = -ENOENT;
+			xdp_program__close(ctx->xdp_prog);
+			ctx->xdp_prog = NULL;
+		}
+	}
+
+	if (!ctx->xdp_prog) {
+		file_name = xsk_check_redirect_flags() ? default_prog : fallback_prog;
+		ctx->xdp_prog = xdp_program__find_file(file_name, NULL, NULL);
+		if (IS_ERR(ctx->xdp_prog))
+			return PTR_ERR(ctx->xdp_prog);
+
+		err = xsk_size_map(ctx->xdp_prog, ctx->ifname);
+		if (err)
+			goto err_prog_load;
+
+		err = xdp_program__attach(ctx->xdp_prog, ctx->ifindex,
+					  xsk_convert_xdp_flags(xsk->config.xdp_flags), 0);
+		if (err)
+			goto err_prog_load;
+
+		attached = true;
+	}
+
+	if (ctx->refcnt_map_fd < 0) {
+		ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog),
+							   file_name);
+		if (ctx->refcnt_map_fd < 0 && ctx->refcnt_map_fd != -ENOENT) {
+			err = ctx->refcnt_map_fd;
+			goto err_prog_load;
+		}
+	}
+map_lookup:
+	ctx->xsks_map_fd = xsk_lookup_bpf_map(xdp_program__fd(ctx->xdp_prog));
+	if (ctx->xsks_map_fd < 0) {
+		err = ctx->xsks_map_fd;
+		goto err_lookup;
+	}
+
+	if (xsk->rx) {
+		err = bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
+					  &xsk->fd, 0);
+		if (err)
+			goto err_lookup;
+	}
+	if (xsks_map_fd)
+		*xsks_map_fd = ctx->xsks_map_fd;
+
+	return 0;
+
+err_lookup:
+	if (attached)
+		xdp_program__detach(ctx->xdp_prog, ctx->ifindex,
+				    xsk_convert_xdp_flags(xsk->config.xdp_flags), 0);
+err_prog_load:
+	if (ctx->refcnt_map_fd >= 0)
+		close(ctx->refcnt_map_fd);
+	ctx->refcnt_map_fd = -ENOENT;
+	xdp_program__close(ctx->xdp_prog);
+	ctx->xdp_prog = NULL;
+	return err;
+}
+
+static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, __u64 netns_cookie,
+				   int ifindex, __u32 queue_id)
+{
+	struct xsk_ctx *ctx;
+
+	if (list_empty(&umem->ctx_list))
+		return NULL;
+
+	list_for_each_entry(ctx, &umem->ctx_list, list) {
+		if (ctx->netns_cookie == netns_cookie &&
+		    ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
+			ctx->refcount++;
+			return ctx;
+		}
+	}
+
+	return NULL;
+}
+
+static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
+{
+	struct xsk_umem *umem = ctx->umem;
+	struct xdp_mmap_offsets off;
+	int err;
+
+	if (--ctx->refcount)
+		return;
+
+	if (!unmap)
+		goto out_free;
+
+	err = xsk_get_mmap_offsets(umem->fd, &off);
+	if (err)
+		goto out_free;
+
+	munmap(ctx->fill->ring - off.fr.desc,
+	       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+	munmap(ctx->comp->ring - off.cr.desc,
+	       off.cr.desc + umem->config.comp_size * sizeof(__u64));
+
+out_free:
+	list_del(&ctx->list);
+	free(ctx);
+}
+
+static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
+				      struct xsk_umem *umem, __u64 netns_cookie,
+				      int ifindex, const char *ifname,
+				      __u32 queue_id,
+				      struct xsk_ring_prod *fill,
+				      struct xsk_ring_cons *comp)
+{
+	struct xsk_ctx *ctx;
+	int err;
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (!ctx)
+		return NULL;
+
+	if (!umem->fill_save) {
+		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
+		if (err) {
+			free(ctx);
+			return NULL;
+		}
+	} else if (umem->fill_save != fill || umem->comp_save != comp) {
+		/* Copy over rings to new structs. */
+		memcpy(fill, umem->fill_save, sizeof(*fill));
+		memcpy(comp, umem->comp_save, sizeof(*comp));
+	}
+
+	ctx->netns_cookie = netns_cookie;
+	ctx->ifindex = ifindex;
+	ctx->refcount = 1;
+	ctx->umem = umem;
+	ctx->queue_id = queue_id;
+	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
+	ctx->ifname[IFNAMSIZ - 1] = '\0';
+
+	ctx->fill = fill;
+	ctx->comp = comp;
+	list_add(&ctx->list, &umem->ctx_list);
+	return ctx;
+}
+
+static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
+{
+	free(xsk->ctx);
+	free(xsk);
+}
+
+int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
+{
+	struct xsk_ctx *ctx = xsk->ctx;
+
+	ctx->xsks_map_fd = fd;
+	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, &xsk->fd, 0);
+}
+
+int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
+{
+	struct xsk_socket *xsk;
+	int res;
+
+	xsk = calloc(1, sizeof(*xsk));
+	if (!xsk)
+		return -ENOMEM;
+
+	res = xsk_init_xsk_struct(xsk, ifindex);
+	if (res) {
+		free(xsk);
+		return -EINVAL;
+	}
+
+	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
+
+	xsk_destroy_xsk_struct(xsk);
+
+	return res;
+}
+
+int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
+			      const char *ifname,
+			      __u32 queue_id, struct xsk_umem *umem,
+			      struct xsk_ring_cons *rx,
+			      struct xsk_ring_prod *tx,
+			      struct xsk_ring_prod *fill,
+			      struct xsk_ring_cons *comp,
+			      const struct xsk_socket_config *usr_config)
+{
+	bool rx_setup_done = false, tx_setup_done = false;
+	void *rx_map = NULL, *tx_map = NULL;
+	struct sockaddr_xdp sxdp = {};
+	struct xdp_mmap_offsets off;
+	struct xsk_socket *xsk;
+	struct xsk_ctx *ctx;
+	int err, ifindex;
+	__u64 netns_cookie;
+	socklen_t optlen;
+	bool unmap;
+
+	if (!umem || !xsk_ptr || !(rx || tx))
+		return -EFAULT;
+
+	xsk = calloc(1, sizeof(*xsk));
+	if (!xsk)
+		return -ENOMEM;
+
+	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
+	if (err)
+		goto out_xsk_alloc;
+
+	ifindex = if_nametoindex(ifname);
+	if (!ifindex) {
+		err = -errno;
+		goto out_xsk_alloc;
+	}
+
+	if (umem->refcount++ > 0) {
+		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
+		if (xsk->fd < 0) {
+			err = -errno;
+			goto out_xsk_alloc;
+		}
+	} else {
+		xsk->fd = umem->fd;
+		rx_setup_done = umem->rx_ring_setup_done;
+		tx_setup_done = umem->tx_ring_setup_done;
+	}
+
+	optlen = sizeof(netns_cookie);
+	err = getsockopt(xsk->fd, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen);
+	if (err) {
+		if (errno != ENOPROTOOPT) {
+			err = -errno;
+			goto out_socket;
+		}
+		netns_cookie = INIT_NS;
+	}
+
+	ctx = xsk_get_ctx(umem, netns_cookie, ifindex, queue_id);
+	if (!ctx) {
+		if (!fill || !comp) {
+			err = -EFAULT;
+			goto out_socket;
+		}
+
+		ctx = xsk_create_ctx(xsk, umem, netns_cookie, ifindex, ifname,
+				     queue_id, fill, comp);
+		if (!ctx) {
+			err = -ENOMEM;
+			goto out_socket;
+		}
+	}
+	xsk->ctx = ctx;
+
+	if (rx && !rx_setup_done) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
+				 &xsk->config.rx_size,
+				 sizeof(xsk->config.rx_size));
+		if (err) {
+			err = -errno;
+			goto out_put_ctx;
+		}
+		if (xsk->fd == umem->fd)
+			umem->rx_ring_setup_done = true;
+	}
+	if (tx && !tx_setup_done) {
+		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
+				 &xsk->config.tx_size,
+				 sizeof(xsk->config.tx_size));
+		if (err) {
+			err = -errno;
+			goto out_put_ctx;
+		}
+		if (xsk->fd == umem->fd)
+			umem->tx_ring_setup_done = true;
+	}
+
+	err = xsk_get_mmap_offsets(xsk->fd, &off);
+	if (err) {
+		err = -errno;
+		goto out_put_ctx;
+	}
+
+	if (rx) {
+		rx_map = mmap(NULL, off.rx.desc +
+			      xsk->config.rx_size * sizeof(struct xdp_desc),
+			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+			      xsk->fd, XDP_PGOFF_RX_RING);
+		if (rx_map == MAP_FAILED) {
+			err = -errno;
+			goto out_put_ctx;
+		}
+
+		rx->mask = xsk->config.rx_size - 1;
+		rx->size = xsk->config.rx_size;
+		rx->producer = rx_map + off.rx.producer;
+		rx->consumer = rx_map + off.rx.consumer;
+		rx->flags = rx_map + off.rx.flags;
+		rx->ring = rx_map + off.rx.desc;
+		rx->cached_prod = *rx->producer;
+		rx->cached_cons = *rx->consumer;
+	}
+	xsk->rx = rx;
+
+	if (tx) {
+		tx_map = mmap(NULL, off.tx.desc +
+			      xsk->config.tx_size * sizeof(struct xdp_desc),
+			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+			      xsk->fd, XDP_PGOFF_TX_RING);
+		if (tx_map == MAP_FAILED) {
+			err = -errno;
+			goto out_mmap_rx;
+		}
+
+		tx->mask = xsk->config.tx_size - 1;
+		tx->size = xsk->config.tx_size;
+		tx->producer = tx_map + off.tx.producer;
+		tx->consumer = tx_map + off.tx.consumer;
+		tx->flags = tx_map + off.tx.flags;
+		tx->ring = tx_map + off.tx.desc;
+		tx->cached_prod = *tx->producer;
+		/* cached_cons is r->size bigger than the real consumer pointer.
+		 * See xsk_prod_nb_free().
+		 */
+		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
+	}
+	xsk->tx = tx;
+
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = ctx->ifindex;
+	sxdp.sxdp_queue_id = ctx->queue_id;
+	if (umem->refcount > 1) {
+		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
+		sxdp.sxdp_shared_umem_fd = umem->fd;
+	} else {
+		sxdp.sxdp_flags = xsk->config.bind_flags;
+	}
+
+	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
+	if (err) {
+		err = -errno;
+		goto out_mmap_tx;
+	}
+
+	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
+		err = __xsk_setup_xdp_prog(xsk, NULL);
+		if (err)
+			goto out_mmap_tx;
+	}
+
+	*xsk_ptr = xsk;
+	umem->fill_save = NULL;
+	umem->comp_save = NULL;
+	return 0;
+
+out_mmap_tx:
+	if (tx)
+		munmap(tx_map, off.tx.desc +
+		       xsk->config.tx_size * sizeof(struct xdp_desc));
+out_mmap_rx:
+	if (rx)
+		munmap(rx_map, off.rx.desc +
+		       xsk->config.rx_size * sizeof(struct xdp_desc));
+out_put_ctx:
+	unmap = umem->fill_save != fill;
+	xsk_put_ctx(ctx, unmap);
+out_socket:
+	if (--umem->refcount)
+		close(xsk->fd);
+out_xsk_alloc:
+	free(xsk);
+	return err;
+}
+
+int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+		       __u32 queue_id, struct xsk_umem *umem,
+		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
+		       const struct xsk_socket_config *usr_config)
+{
+	if (!umem)
+		return -EFAULT;
+
+	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
+					 rx, tx, umem->fill_save,
+					 umem->comp_save, usr_config);
+}
+
+int xsk_umem__delete(struct xsk_umem *umem)
+{
+	struct xdp_mmap_offsets off;
+	int err;
+
+	if (!umem)
+		return 0;
+
+	if (umem->refcount)
+		return -EBUSY;
+
+	err = xsk_get_mmap_offsets(umem->fd, &off);
+	if (!err && umem->fill_save && umem->comp_save) {
+		munmap(umem->fill_save->ring - off.fr.desc,
+		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
+		munmap(umem->comp_save->ring - off.cr.desc,
+		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
+	}
+
+	close(umem->fd);
+	free(umem);
+
+	return 0;
+}
+
+static void xsk_release_xdp_prog(struct xsk_socket *xsk)
+{
+	struct xsk_ctx *ctx = xsk->ctx;
+	int value;
+
+	if (xsk->ctx->refcnt_map_fd < 0)
+		goto out;
+
+	value = xsk_decr_prog_refcnt(ctx->refcnt_map_fd);
+	if (value < 0)
+		pr_warn("Error occurred when decrementing xsk XDP prog refcount: %s, please detach program yourself\n",
+			strerror(-value));
+	if (value)
+		goto out;
+
+	xdp_program__detach(ctx->xdp_prog, ctx->ifindex,
+			    xsk_convert_xdp_flags(xsk->config.xdp_flags), 0);
+out:
+	xdp_program__close(ctx->xdp_prog);
+}
+
+void xsk_socket__delete(struct xsk_socket *xsk)
+{
+	size_t desc_sz = sizeof(struct xdp_desc);
+	struct xdp_mmap_offsets off;
+	struct xsk_umem *umem;
+	struct xsk_ctx *ctx;
+	int err;
+
+	if (!xsk)
+		return;
+
+	ctx = xsk->ctx;
+	umem = ctx->umem;
+	if (ctx->xdp_prog) {
+		xsk_delete_map_entry(ctx->xsks_map_fd, ctx->queue_id);
+		xsk_release_xdp_prog(xsk);
+	}
+
+	err = xsk_get_mmap_offsets(xsk->fd, &off);
+	if (!err) {
+		if (xsk->rx) {
+			munmap(xsk->rx->ring - off.rx.desc,
+			       off.rx.desc + xsk->config.rx_size * desc_sz);
+		}
+		if (xsk->tx) {
+			munmap(xsk->tx->ring - off.tx.desc,
+			       off.tx.desc + xsk->config.tx_size * desc_sz);
+		}
+	}
+
+	xsk_put_ctx(ctx, true);
+
+	umem->refcount--;
+	/* Do not close an fd that also has an associated umem connected
+	 * to it.
+	 */
+	if (xsk->fd != umem->fd)
+		close(xsk->fd);
+	free(xsk);
+}
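
Editor's note: the intended call sequence of this API is easiest to see from the caller's side. The following is a minimal usage sketch, not part of the commit; create_umem() and NUM_FRAMES are illustrative names, and it assumes linking against libxdp. It allocates page-aligned memory (xsk_umem__create() rejects unaligned buffers, see xsk_page_aligned() above), creates a UMEM with the default configuration from xsk_set_umem_config(), and pre-populates the fill ring so the kernel has frames to receive into:

#include <stdlib.h>
#include <unistd.h>
#include <xdp/xsk.h>

#define NUM_FRAMES 4096 /* illustrative sizing, not mandated by the library */

static struct xsk_umem *create_umem(struct xsk_ring_prod *fill,
				    struct xsk_ring_cons *comp, void **area)
{
	size_t size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
	struct xsk_umem *umem = NULL;
	__u32 idx;
	int i;

	/* The UMEM area must be page aligned. */
	if (posix_memalign(area, getpagesize(), size))
		return NULL;

	/* A NULL config selects the defaults from xsk_set_umem_config(). */
	if (xsk_umem__create(&umem, *area, size, fill, comp, NULL)) {
		free(*area);
		return NULL;
	}

	/* Hand one frame address per fill-ring slot to the kernel. */
	if (xsk_ring_prod__reserve(fill, XSK_RING_PROD__DEFAULT_NUM_DESCS,
				   &idx) != XSK_RING_PROD__DEFAULT_NUM_DESCS) {
		xsk_umem__delete(umem);
		free(*area);
		return NULL;
	}
	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
		*xsk_ring_prod__fill_addr(fill, idx++) =
			(__u64)i * XSK_UMEM__DEFAULT_FRAME_SIZE;
	xsk_ring_prod__submit(fill, XSK_RING_PROD__DEFAULT_NUM_DESCS);

	return umem;
}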
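Creating the socket then binds one (interface, queue) pair to the UMEM and, unless XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD is set, runs the __xsk_setup_xdp_prog() path above to attach the default program and insert the socket into xsks_map. Another hedged sketch; "eth0", queue 0 and open_rx_socket() are placeholders:

#include <stdio.h>
#include <string.h>
#include <xdp/xsk.h>

static struct xsk_socket *open_rx_socket(struct xsk_umem *umem,
					 struct xsk_ring_cons *rx)
{
	struct xsk_socket *xsk = NULL;
	int err;

	/* The first socket on a UMEM reuses the fill/comp rings saved in
	 * umem->fill_save/comp_save; additional queues need
	 * xsk_socket__create_shared() with fresh fill and comp rings. */
	err = xsk_socket__create(&xsk, "eth0", 0, umem, rx,
				 NULL /* no TX ring */, NULL /* defaults */);
	if (err) {
		fprintf(stderr, "xsk_socket__create: %s\n", strerror(-err));
		return NULL;
	}
	return xsk;
}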
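Once bound, RX is a pure ring exercise over the inline helpers this file exports. A sketch under the same assumptions: handle_packet() is a stub standing in for application logic, and the extract-addr/add-offset helpers are only strictly needed in unaligned chunk mode but are harmless under the aligned default:

#include <poll.h>
#include <sys/socket.h>
#include <xdp/xsk.h>

static void handle_packet(void *pkt, __u32 len) /* placeholder */
{
	(void)pkt;
	(void)len;
}

static void rx_loop(struct xsk_socket *xsk, struct xsk_ring_cons *rx,
		    struct xsk_ring_prod *fill, void *umem_area)
{
	struct pollfd pfd = { .fd = xsk_socket__fd(xsk), .events = POLLIN };

	for (;;) {
		__u32 idx_rx = 0, idx_fill = 0, rcvd, i;

		if (poll(&pfd, 1, -1) <= 0)
			continue;

		rcvd = xsk_ring_cons__peek(rx, 64, &idx_rx);
		if (!rcvd) {
			/* With XDP_USE_NEED_WAKEUP in bind_flags, the kernel
			 * may need a kick before it refills the RX ring. */
			if (xsk_ring_prod__needs_wakeup(fill))
				recvfrom(xsk_socket__fd(xsk), NULL, 0,
					 MSG_DONTWAIT, NULL, NULL);
			continue;
		}

		/* Reserve fill slots so every consumed frame is recycled. */
		while (xsk_ring_prod__reserve(fill, rcvd, &idx_fill) != rcvd)
			; /* busy-wait; real code would back off */

		for (i = 0; i < rcvd; i++) {
			const struct xdp_desc *desc =
				xsk_ring_cons__rx_desc(rx, idx_rx + i);
			__u64 addr = xsk_umem__add_offset_to_addr(desc->addr);

			handle_packet(xsk_umem__get_data(umem_area, addr),
				      desc->len);
			*xsk_ring_prod__fill_addr(fill, idx_fill + i) =
				xsk_umem__extract_addr(desc->addr);
		}

		xsk_ring_prod__submit(fill, rcvd);
		xsk_ring_cons__release(rx, rcvd);
	}
}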
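Teardown order follows from the refcounting above: xsk_socket__delete() drops umem->refcount (and detaches the default program once the BPF-side refcnt map reaches zero), while xsk_umem__delete() refuses with -EBUSY as long as any socket still holds the UMEM. One last illustrative sketch; teardown() is a hypothetical helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xdp/xsk.h>

static void teardown(struct xsk_socket *xsk, struct xsk_umem *umem,
		     void *umem_area)
{
	int err;

	xsk_socket__delete(xsk);	/* must precede the UMEM delete */

	err = xsk_umem__delete(umem);	/* -EBUSY if sockets remain */
	if (err)
		fprintf(stderr, "xsk_umem__delete: %s\n", strerror(-err));

	free(umem_area);	/* the library never frees caller memory */
}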