diff options
Diffstat (limited to 'src/libknot/xdp/xdp.c')
-rw-r--r-- | src/libknot/xdp/xdp.c | 717 |
1 file changed, 717 insertions, 0 deletions
diff --git a/src/libknot/xdp/xdp.c b/src/libknot/xdp/xdp.c new file mode 100644 index 0000000..29dfe61 --- /dev/null +++ b/src/libknot/xdp/xdp.c @@ -0,0 +1,717 @@ +/* Copyright (C) 2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <errno.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "libknot/attribute.h" +#include "libknot/endian.h" +#include "libknot/errcode.h" +#include "libknot/xdp/bpf-user.h" +#include "libknot/xdp/xdp.h" +#include "contrib/macros.h" + +/* Don't fragment flag. */ +#define IP_DF 0x4000 + +#define FRAME_SIZE 2048 +#define UMEM_FRAME_COUNT_RX 4096 +#define UMEM_FRAME_COUNT_TX UMEM_FRAME_COUNT_RX // No reason to differ so far. +#define UMEM_RING_LEN_RX (UMEM_FRAME_COUNT_RX * 2) +#define UMEM_RING_LEN_TX (UMEM_FRAME_COUNT_TX * 2) +#define UMEM_FRAME_COUNT (UMEM_FRAME_COUNT_RX + UMEM_FRAME_COUNT_TX) + +/* With recent compilers we statically check #defines for settings that + * get refused by AF_XDP drivers (in current versions, at least). 
*/ +#if (__STDC_VERSION__ >= 201112L) +#define IS_POWER_OF_2(n) (((n) & (n - 1)) == 0) +_Static_assert((FRAME_SIZE == 4096 || FRAME_SIZE == 2048) + && IS_POWER_OF_2(UMEM_FRAME_COUNT) + /* The following two inequalities aren't required by drivers, but they allow + * our implementation assume that the rings can never get filled. */ + && IS_POWER_OF_2(UMEM_RING_LEN_RX) && UMEM_RING_LEN_RX > UMEM_FRAME_COUNT_RX + && IS_POWER_OF_2(UMEM_RING_LEN_TX) && UMEM_RING_LEN_TX > UMEM_FRAME_COUNT_TX + && UMEM_FRAME_COUNT_TX <= (1 << 16) /* see tx_free_indices */ + , "Incorrect #define combination for AF_XDP."); +#endif + +/*! \brief The memory layout of IPv4 umem frame. */ +struct udpv4 { + union { + uint8_t bytes[1]; + struct { + struct ethhdr eth; // No VLAN support; CRC at the "end" of .data! + struct iphdr ipv4; + struct udphdr udp; + uint8_t data[]; + } __attribute__((packed)); + }; +}; + +/*! \brief The memory layout of IPv6 umem frame. */ +struct udpv6 { + union { + uint8_t bytes[1]; + struct { + struct ethhdr eth; // No VLAN support; CRC at the "end" of .data! + struct ipv6hdr ipv6; + struct udphdr udp; + uint8_t data[]; + } __attribute__((packed)); + }; +}; + +/*! \brief The memory layout of each umem frame. */ +struct umem_frame { + union { + uint8_t bytes[FRAME_SIZE]; + union { + struct udpv4 udpv4; + struct udpv6 udpv6; + }; + }; +}; + +_public_ +const size_t KNOT_XDP_PAYLOAD_OFFSET4 = offsetof(struct udpv4, data) + offsetof(struct umem_frame, udpv4); +_public_ +const size_t KNOT_XDP_PAYLOAD_OFFSET6 = offsetof(struct udpv6, data) + offsetof(struct umem_frame, udpv6); + +static int configure_xsk_umem(struct kxsk_umem **out_umem) +{ + /* Allocate memory and call driver to create the UMEM. 
*/ + struct kxsk_umem *umem = calloc(1, + offsetof(struct kxsk_umem, tx_free_indices) + + sizeof(umem->tx_free_indices[0]) * UMEM_FRAME_COUNT_TX); + if (umem == NULL) { + return KNOT_ENOMEM; + } + + int ret = posix_memalign((void **)&umem->frames, getpagesize(), + FRAME_SIZE * UMEM_FRAME_COUNT); + if (ret != 0) { + free(umem); + return KNOT_ENOMEM; + } + + const struct xsk_umem_config config = { + .fill_size = UMEM_RING_LEN_RX, + .comp_size = UMEM_RING_LEN_TX, + .frame_size = FRAME_SIZE, + .frame_headroom = 0, + }; + + ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * UMEM_FRAME_COUNT, + &umem->fq, &umem->cq, &config); + if (ret != KNOT_EOK) { + free(umem->frames); + free(umem); + return ret; + } + *out_umem = umem; + + /* Designate the starting chunk of buffers for TX, and put them onto the stack. */ + umem->tx_free_count = UMEM_FRAME_COUNT_TX; + for (uint32_t i = 0; i < UMEM_FRAME_COUNT_TX; ++i) { + umem->tx_free_indices[i] = i; + } + + /* Designate the rest of buffers for RX, and pass them to the driver. 
*/ + uint32_t idx = 0; + ret = xsk_ring_prod__reserve(&umem->fq, UMEM_FRAME_COUNT_RX, &idx); + if (ret != UMEM_FRAME_COUNT - UMEM_FRAME_COUNT_TX) { + assert(0); + return KNOT_ERROR; + } + assert(idx == 0); + for (uint32_t i = UMEM_FRAME_COUNT_TX; i < UMEM_FRAME_COUNT; ++i) { + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * FRAME_SIZE; + } + xsk_ring_prod__submit(&umem->fq, UMEM_FRAME_COUNT_RX); + + return KNOT_EOK; +} + +static void deconfigure_xsk_umem(struct kxsk_umem *umem) +{ + (void)xsk_umem__delete(umem->umem); + free(umem->frames); + free(umem); +} + +static int configure_xsk_socket(struct kxsk_umem *umem, + const struct kxsk_iface *iface, + knot_xdp_socket_t **out_sock) +{ + knot_xdp_socket_t *xsk_info = calloc(1, sizeof(*xsk_info)); + if (xsk_info == NULL) { + return KNOT_ENOMEM; + } + xsk_info->iface = iface; + xsk_info->umem = umem; + + const struct xsk_socket_config sock_conf = { + .tx_size = UMEM_RING_LEN_TX, + .rx_size = UMEM_RING_LEN_RX, + .libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, + }; + + int ret = xsk_socket__create(&xsk_info->xsk, iface->if_name, + iface->if_queue, umem->umem, + &xsk_info->rx, &xsk_info->tx, &sock_conf); + if (ret != 0) { + free(xsk_info); + return ret; + } + + *out_sock = xsk_info; + return KNOT_EOK; +} + +_public_ +int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue, + uint32_t listen_port, knot_xdp_load_bpf_t load_bpf) +{ + if (socket == NULL || if_name == NULL) { + return KNOT_EINVAL; + } + + struct kxsk_iface *iface; + int ret = kxsk_iface_new(if_name, if_queue, load_bpf, &iface); + if (ret != KNOT_EOK) { + return ret; + } + + /* Initialize shared packet_buffer for umem usage. 
*/ + struct kxsk_umem *umem = NULL; + ret = configure_xsk_umem(&umem); + if (ret != KNOT_EOK) { + kxsk_iface_free(iface); + return ret; + } + + ret = configure_xsk_socket(umem, iface, socket); + if (ret != KNOT_EOK) { + deconfigure_xsk_umem(umem); + kxsk_iface_free(iface); + return ret; + } + + ret = kxsk_socket_start(iface, listen_port, (*socket)->xsk); + if (ret != KNOT_EOK) { + xsk_socket__delete((*socket)->xsk); + deconfigure_xsk_umem(umem); + kxsk_iface_free(iface); + free(*socket); + *socket = NULL; + return ret; + } + + return ret; +} + +_public_ +void knot_xdp_deinit(knot_xdp_socket_t *socket) +{ + if (socket == NULL) { + return; + } + + kxsk_socket_stop(socket->iface); + xsk_socket__delete(socket->xsk); + deconfigure_xsk_umem(socket->umem); + + kxsk_iface_free((struct kxsk_iface *)/*const-cast*/socket->iface); + free(socket); +} + +_public_ +int knot_xdp_socket_fd(knot_xdp_socket_t *socket) +{ + if (socket == NULL) { + return 0; + } + + return xsk_socket__fd(socket->xsk); +} + +static void tx_free_relative(struct kxsk_umem *umem, uint64_t addr_relative) +{ + /* The address may not point to *start* of buffer, but `/` solves that. 
*/ + uint64_t index = addr_relative / FRAME_SIZE; + assert(index < UMEM_FRAME_COUNT); + umem->tx_free_indices[umem->tx_free_count++] = index; +} + +_public_ +void knot_xdp_send_prepare(knot_xdp_socket_t *socket) +{ + if (socket == NULL) { + return; + } + + struct kxsk_umem *const umem = socket->umem; + struct xsk_ring_cons *const cq = &umem->cq; + + uint32_t idx = 0; + const uint32_t completed = xsk_ring_cons__peek(cq, UINT32_MAX, &idx); + if (completed == 0) { + return; + } + assert(umem->tx_free_count + completed <= UMEM_FRAME_COUNT_TX); + + for (uint32_t i = 0; i < completed; ++i) { + uint64_t addr_relative = *xsk_ring_cons__comp_addr(cq, idx++); + tx_free_relative(umem, addr_relative); + } + + xsk_ring_cons__release(cq, completed); +} + +static struct umem_frame *alloc_tx_frame(struct kxsk_umem *umem) +{ + if (unlikely(umem->tx_free_count == 0)) { + return NULL; + } + + uint32_t index = umem->tx_free_indices[--umem->tx_free_count]; + return umem->frames + index; +} + +_public_ +int knot_xdp_send_alloc(knot_xdp_socket_t *socket, bool ipv6, knot_xdp_msg_t *out, + const knot_xdp_msg_t *in_reply_to) +{ + if (socket == NULL || out == NULL) { + return KNOT_EINVAL; + } + + size_t ofs = ipv6 ? KNOT_XDP_PAYLOAD_OFFSET6 : KNOT_XDP_PAYLOAD_OFFSET4; + + struct umem_frame *uframe = alloc_tx_frame(socket->umem); + if (uframe == NULL) { + return KNOT_ENOMEM; + } + + memset(out, 0, sizeof(*out)); + + out->payload.iov_base = ipv6 ? 
uframe->udpv6.data : uframe->udpv4.data; + out->payload.iov_len = MIN(UINT16_MAX, FRAME_SIZE - ofs); + + const struct ethhdr *eth = (struct ethhdr *)uframe; + out->eth_from = (void *)ð->h_source; + out->eth_to = (void *)ð->h_dest; + + if (in_reply_to != NULL) { + memcpy(out->eth_from, in_reply_to->eth_to, ETH_ALEN); + memcpy(out->eth_to, in_reply_to->eth_from, ETH_ALEN); + + memcpy(&out->ip_from, &in_reply_to->ip_to, sizeof(out->ip_from)); + memcpy(&out->ip_to, &in_reply_to->ip_from, sizeof(out->ip_to)); + } + + return KNOT_EOK; +} + +static uint16_t from32to16(uint32_t sum) +{ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return sum; +} + +static uint16_t ipv4_checksum(const uint8_t *ipv4_hdr) +{ + const uint16_t *h = (const uint16_t *)ipv4_hdr; + uint32_t sum32 = 0; + for (int i = 0; i < 10; ++i) { + if (i != 5) { + sum32 += h[i]; + } + } + return ~from32to16(sum32); +} + +/* Checksum endianness implementation notes for ipv4_checksum() and udp_checksum_step(). + * + * The basis for checksum is addition on big-endian 16-bit words, with bit 16 carrying + * over to bit 0. That can be viewed as first byte carrying to the second and the + * second one carrying back to the first one, i.e. a symmetrical situation. + * Therefore the result is the same even when arithmetics is done on litte-endian (!) + */ + +static void udp_checksum_step(size_t *result, const void *_data, size_t _data_len) +{ + assert(!(_data_len & 1)); + const uint16_t *data = _data; + size_t len = _data_len / 2; + while (len-- > 0) { + *result += *data++; + } +} + +static void udp_checksum_finish(size_t *result) +{ + while (*result > 0xffff) { + *result = (*result & 0xffff) + (*result >> 16); + } + if (*result != 0xffff) { + *result = ~*result; + } +} + +static uint8_t *msg_uframe_ptr(knot_xdp_socket_t *socket, const knot_xdp_msg_t *msg, + /* Next parameters are just for debugging. 
*/ + bool ipv6) +{ + uint8_t *uNULL = NULL; + uint8_t *uframe_p = uNULL + ((msg->payload.iov_base - NULL) & ~(FRAME_SIZE - 1)); + +#ifndef NDEBUG + intptr_t pd = (uint8_t *)msg->payload.iov_base - uframe_p + - (ipv6 ? KNOT_XDP_PAYLOAD_OFFSET6 : KNOT_XDP_PAYLOAD_OFFSET4); + /* This assertion might fire in some OK cases. For example, the second branch + * had to be added for cases with "emulated" AF_XDP support. */ + assert(pd == XDP_PACKET_HEADROOM || pd == 0); + + const uint8_t *umem_mem_start = socket->umem->frames->bytes; + const uint8_t *umem_mem_end = umem_mem_start + FRAME_SIZE * UMEM_FRAME_COUNT; + assert(umem_mem_start <= uframe_p && uframe_p < umem_mem_end); +#endif + return uframe_p; +} + +static void xsk_sendmsg_ipv4(knot_xdp_socket_t *socket, const knot_xdp_msg_t *msg, + uint32_t index) +{ + uint8_t *uframe_p = msg_uframe_ptr(socket, msg, false); + struct umem_frame *uframe = (struct umem_frame *)uframe_p; + struct udpv4 *h = &uframe->udpv4; + + const struct sockaddr_in *src_v4 = (const struct sockaddr_in *)&msg->ip_from; + const struct sockaddr_in *dst_v4 = (const struct sockaddr_in *)&msg->ip_to; + const uint16_t udp_len = sizeof(h->udp) + msg->payload.iov_len; + + h->eth.h_proto = __constant_htons(ETH_P_IP); + + h->ipv4.version = IPVERSION; + h->ipv4.ihl = 5; + h->ipv4.tos = 0; + h->ipv4.tot_len = htobe16(5 * 4 + udp_len); + h->ipv4.id = 0; + h->ipv4.frag_off = 0; + h->ipv4.ttl = IPDEFTTL; + h->ipv4.protocol = IPPROTO_UDP; + memcpy(&h->ipv4.saddr, &src_v4->sin_addr, sizeof(src_v4->sin_addr)); + memcpy(&h->ipv4.daddr, &dst_v4->sin_addr, sizeof(dst_v4->sin_addr)); + h->ipv4.check = ipv4_checksum(h->bytes + sizeof(struct ethhdr)); + + h->udp.len = htobe16(udp_len); + h->udp.source = src_v4->sin_port; + h->udp.dest = dst_v4->sin_port; + h->udp.check = 0; // Optional for IPv4 - not computed. 
+ + *xsk_ring_prod__tx_desc(&socket->tx, index) = (struct xdp_desc){ + .addr = h->bytes - socket->umem->frames->bytes, + .len = KNOT_XDP_PAYLOAD_OFFSET4 + msg->payload.iov_len + }; +} + +static void xsk_sendmsg_ipv6(knot_xdp_socket_t *socket, const knot_xdp_msg_t *msg, + uint32_t index) +{ + uint8_t *uframe_p = msg_uframe_ptr(socket, msg, true); + struct umem_frame *uframe = (struct umem_frame *)uframe_p; + struct udpv6 *h = &uframe->udpv6; + + const struct sockaddr_in6 *src_v6 = (const struct sockaddr_in6 *)&msg->ip_from; + const struct sockaddr_in6 *dst_v6 = (const struct sockaddr_in6 *)&msg->ip_to; + const uint16_t udp_len = sizeof(h->udp) + msg->payload.iov_len; + + h->eth.h_proto = __constant_htons(ETH_P_IPV6); + + h->ipv6.version = 6; + h->ipv6.priority = 0; + memset(h->ipv6.flow_lbl, 0, sizeof(h->ipv6.flow_lbl)); + h->ipv6.payload_len = htobe16(udp_len); + h->ipv6.nexthdr = IPPROTO_UDP; + h->ipv6.hop_limit = IPDEFTTL; + memcpy(&h->ipv6.saddr, &src_v6->sin6_addr, sizeof(src_v6->sin6_addr)); + memcpy(&h->ipv6.daddr, &dst_v6->sin6_addr, sizeof(dst_v6->sin6_addr)); + + h->udp.len = htobe16(udp_len); + h->udp.source = src_v6->sin6_port; + h->udp.dest = dst_v6->sin6_port; + h->udp.check = 0; // Mandatory for IPv6 - computed afterwards. 
+ + size_t chk = 0; + udp_checksum_step(&chk, &h->ipv6.saddr, sizeof(h->ipv6.saddr)); + udp_checksum_step(&chk, &h->ipv6.daddr, sizeof(h->ipv6.daddr)); + udp_checksum_step(&chk, &h->udp.len, sizeof(h->udp.len)); + __be16 version = htobe16(h->ipv6.nexthdr); + udp_checksum_step(&chk, &version, sizeof(version)); + udp_checksum_step(&chk, &h->udp, sizeof(h->udp)); + size_t padded_len = msg->payload.iov_len; + if (padded_len & 1) { + ((uint8_t *)msg->payload.iov_base)[padded_len++] = 0; + } + udp_checksum_step(&chk, msg->payload.iov_base, padded_len); + udp_checksum_finish(&chk); + h->udp.check = chk; + + *xsk_ring_prod__tx_desc(&socket->tx, index) = (struct xdp_desc){ + .addr = h->bytes - socket->umem->frames->bytes, + .len = KNOT_XDP_PAYLOAD_OFFSET6 + msg->payload.iov_len + }; +} + +_public_ +int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[], + uint32_t count, uint32_t *sent) +{ + if (socket == NULL || msgs == NULL || sent == NULL) { + return KNOT_EINVAL; + } + + /* Now we want to do something close to + * xsk_ring_prod__reserve(&socket->tx, count, *idx) + * but we don't know in advance if we utilize *whole* `count`, + * and the API doesn't allow "cancelling reservations". + * Therefore we handle `socket->tx.cached_prod` by hand; + * that's simplified by the fact that there is always free space. + */ + assert(UMEM_RING_LEN_TX > UMEM_FRAME_COUNT_TX); + uint32_t idx = socket->tx.cached_prod; + + for (uint32_t i = 0; i < count; ++i) { + const knot_xdp_msg_t *msg = &msgs[i]; + + if (msg->payload.iov_len && msg->ip_from.sin6_family == AF_INET) { + xsk_sendmsg_ipv4(socket, msg, idx++); + } else if (msg->payload.iov_len && msg->ip_from.sin6_family == AF_INET6) { + xsk_sendmsg_ipv6(socket, msg, idx++); + } else { + /* Some problem; we just ignore this message. 
*/ + uint64_t addr_relative = (uint8_t *)msg->payload.iov_base + - socket->umem->frames->bytes; + tx_free_relative(socket->umem, addr_relative); + } + } + + *sent = idx - socket->tx.cached_prod; + assert(*sent <= count); + socket->tx.cached_prod = idx; + xsk_ring_prod__submit(&socket->tx, *sent); + socket->kernel_needs_wakeup = true; + + return KNOT_EOK; +} + +_public_ +int knot_xdp_send_finish(knot_xdp_socket_t *socket) +{ + if (socket == NULL) { + return KNOT_EINVAL; + } + + /* Trigger sending queued packets. */ + if (!socket->kernel_needs_wakeup) { + return KNOT_EOK; + } + + int ret = sendto(xsk_socket__fd(socket->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + const bool is_ok = (ret >= 0); + // List of "safe" errors taken from + // https://github.com/torvalds/linux/blame/master/samples/bpf/xdpsock_user.c + const bool is_again = !is_ok && (errno == ENOBUFS || errno == EAGAIN + || errno == EBUSY || errno == ENETDOWN); + // Some of the !is_ok cases are a little unclear - what to do about the syscall, + // including how caller of _sendmsg_finish() should react. + if (is_ok || !is_again) { + socket->kernel_needs_wakeup = false; + } + if (is_again) { + return KNOT_EAGAIN; + } else if (is_ok) { + return KNOT_EOK; + } else { + return -errno; + } + /* This syscall might be avoided with a newer kernel feature (>= 5.4): + https://www.kernel.org/doc/html/latest/networking/af_xdp.html#xdp-use-need-wakeup-bind-flag + Unfortunately it's not easy to continue supporting older kernels + when using this feature on newer ones. 
+ */ +} + +static void rx_desc(knot_xdp_socket_t *socket, const struct xdp_desc *desc, + knot_xdp_msg_t *msg) +{ + uint8_t *uframe_p = socket->umem->frames->bytes + desc->addr; + const struct ethhdr *eth = (struct ethhdr *)uframe_p; + const struct iphdr *ip4 = NULL; + const struct ipv6hdr *ip6 = NULL; + const struct udphdr *udp = NULL; + + switch (eth->h_proto) { + case __constant_htons(ETH_P_IP): + ip4 = (struct iphdr *)(uframe_p + sizeof(struct ethhdr)); + // Next conditions are ensured by the BPF filter. + assert(ip4->version == 4); + assert(ip4->frag_off == 0 || + ip4->frag_off == __constant_htons(IP_DF)); + assert(ip4->protocol == IPPROTO_UDP); + // IPv4 header checksum is not verified! + udp = (struct udphdr *)(uframe_p + sizeof(struct ethhdr) + + ip4->ihl * 4); + break; + case __constant_htons(ETH_P_IPV6): + ip6 = (struct ipv6hdr *)(uframe_p + sizeof(struct ethhdr)); + // Next conditions are ensured by the BPF filter. + assert(ip6->version == 6); + assert(ip6->nexthdr == IPPROTO_UDP); + udp = (struct udphdr *)(uframe_p + sizeof(struct ethhdr) + + sizeof(struct ipv6hdr)); + break; + default: + assert(0); + msg->payload.iov_len = 0; + return; + } + // UDP checksum is not verified! + + assert(eth && (!!ip4 != !!ip6) && udp); + + // Process the packet; ownership is passed on, beware of holding frames. 
+ + msg->payload.iov_base = (uint8_t *)udp + sizeof(struct udphdr); + msg->payload.iov_len = be16toh(udp->len) - sizeof(struct udphdr); + + msg->eth_from = (void *)ð->h_source; + msg->eth_to = (void *)ð->h_dest; + + if (ip4 != NULL) { + struct sockaddr_in *src_v4 = (struct sockaddr_in *)&msg->ip_from; + struct sockaddr_in *dst_v4 = (struct sockaddr_in *)&msg->ip_to; + memcpy(&src_v4->sin_addr, &ip4->saddr, sizeof(src_v4->sin_addr)); + memcpy(&dst_v4->sin_addr, &ip4->daddr, sizeof(dst_v4->sin_addr)); + src_v4->sin_port = udp->source; + dst_v4->sin_port = udp->dest; + src_v4->sin_family = AF_INET; + dst_v4->sin_family = AF_INET; + } else { + assert(ip6); + struct sockaddr_in6 *src_v6 = (struct sockaddr_in6 *)&msg->ip_from; + struct sockaddr_in6 *dst_v6 = (struct sockaddr_in6 *)&msg->ip_to; + memcpy(&src_v6->sin6_addr, &ip6->saddr, sizeof(src_v6->sin6_addr)); + memcpy(&dst_v6->sin6_addr, &ip6->daddr, sizeof(dst_v6->sin6_addr)); + src_v6->sin6_port = udp->source; + dst_v6->sin6_port = udp->dest; + src_v6->sin6_family = AF_INET6; + dst_v6->sin6_family = AF_INET6; + // Flow label is ignored. 
+ } +} + +_public_ +int knot_xdp_recv(knot_xdp_socket_t *socket, knot_xdp_msg_t msgs[], + uint32_t max_count, uint32_t *count) +{ + if (socket == NULL || msgs == NULL || count == NULL) { + return KNOT_EINVAL; + } + + uint32_t idx = 0; + const uint32_t available = xsk_ring_cons__peek(&socket->rx, max_count, &idx); + if (available == 0) { + *count = 0; + return KNOT_EOK; + } + assert(available <= max_count); + + for (uint32_t i = 0; i < available; ++i) { + rx_desc(socket, xsk_ring_cons__rx_desc(&socket->rx, idx++), &msgs[i]); + } + + xsk_ring_cons__release(&socket->rx, available); + *count = available; + + return KNOT_EOK; +} + +_public_ +void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[], + uint32_t count) +{ + if (socket == NULL || msgs == NULL) { + return; + } + + struct kxsk_umem *const umem = socket->umem; + struct xsk_ring_prod *const fq = &umem->fq; + + uint32_t idx = 0; + const uint32_t reserved = xsk_ring_prod__reserve(fq, count, &idx); + assert(reserved == count); + + for (uint32_t i = 0; i < reserved; ++i) { + uint8_t *uframe_p = msg_uframe_ptr(socket, &msgs[i], + msgs[i].ip_from.sin6_family == AF_INET6); + uint64_t offset = uframe_p - umem->frames->bytes; + *xsk_ring_prod__fill_addr(fq, idx++) = offset; + } + + xsk_ring_prod__submit(fq, reserved); +} + +_public_ +void knot_xdp_info(const knot_xdp_socket_t *socket, FILE *file) +{ + if (socket == NULL || file == NULL) { + return; + } + + // The number of busy frames + #define RING_BUSY(ring) \ + ((*(ring)->producer - *(ring)->consumer) & (ring)->mask) + + #define RING_PRINFO(name, ring) \ + fprintf(file, "Ring %s: size %4d, busy %4d (prod %4d, cons %4d)\n", \ + name, (unsigned)(ring)->size, \ + (unsigned)RING_BUSY((ring)), \ + (unsigned)*(ring)->producer, (unsigned)*(ring)->consumer) + + const int rx_busyf = RING_BUSY(&socket->umem->fq) + RING_BUSY(&socket->rx); + fprintf(file, "\nLOST RX frames: %4d", (int)(UMEM_FRAME_COUNT_RX - rx_busyf)); + + const int tx_busyf = 
RING_BUSY(&socket->umem->cq) + RING_BUSY(&socket->tx); + const int tx_freef = socket->umem->tx_free_count; + fprintf(file, "\nLOST TX frames: %4d\n", (int)(UMEM_FRAME_COUNT_TX - tx_busyf - tx_freef)); + + RING_PRINFO("FQ", &socket->umem->fq); + RING_PRINFO("RX", &socket->rx); + RING_PRINFO("TX", &socket->tx); + RING_PRINFO("CQ", &socket->umem->cq); + fprintf(file, "TX free frames: %4d\n", tx_freef); +} |