Diffstat (limited to 'src/libknot/xdp/xdp.c')
-rw-r--r--    src/libknot/xdp/xdp.c    253
1 file changed, 157 insertions(+), 96 deletions(-)
diff --git a/src/libknot/xdp/xdp.c b/src/libknot/xdp/xdp.c
index 8286884..132f5c4 100644
--- a/src/libknot/xdp/xdp.c
+++ b/src/libknot/xdp/xdp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -38,63 +38,68 @@
#include "contrib/net.h"
#define FRAME_SIZE 2048
-
-#define FRAME_COUNT_TX 2048
-#define FRAME_COUNT_RX 2048
-#define FRAME_COUNT (FRAME_COUNT_TX + FRAME_COUNT_RX)
-
-#define RING_LEN_TX FRAME_COUNT_TX
-#define RING_LEN_CQ FRAME_COUNT_TX
-#define RING_LEN_RX FRAME_COUNT_RX
-/* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size. */
-#define RING_LEN_FQ (2 * FRAME_COUNT_RX)
-
-#define ALLOC_RETRY_NUM 15
-#define ALLOC_RETRY_DELAY 20 // In nanoseconds.
-
-/* With recent compilers we statically check #defines for settings that
- * get refused by AF_XDP drivers (in current versions, at least). */
-#if (__STDC_VERSION__ >= 201112L)
-#define IS_POWER_OF_2(n) (((n) & (n - 1)) == 0)
-_Static_assert((FRAME_SIZE == 4096 || FRAME_SIZE == 2048)
- && IS_POWER_OF_2(RING_LEN_TX) && IS_POWER_OF_2(RING_LEN_RX)
- && IS_POWER_OF_2(RING_LEN_CQ) && IS_POWER_OF_2(RING_LEN_FQ)
- && FRAME_COUNT_TX <= (1 << 16) /* see tx_free_indices */
- , "Incorrect #define combination for AF_XDP.");
-#endif
+#define DEFAULT_RING_SIZE 2048
+#define RETRY_DELAY 20 // In nanoseconds.
struct umem_frame {
uint8_t bytes[FRAME_SIZE];
};
-static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames)
+static bool valid_config(const knot_xdp_config_t *config)
+{
+ if (FRAME_SIZE != 2048 && FRAME_SIZE != 4096) {
+ return false;
+ }
+
+ if (config == NULL) {
+ return true;
+ }
+
+ if ((config->ring_size & (config->ring_size - 1)) != 0) {
+ return false;
+ }
+
+ return true;
+}
+
+static uint32_t ring_size(const knot_xdp_config_t *config)
+{
+ return config != NULL ? config->ring_size : DEFAULT_RING_SIZE;
+}
+
+static int configure_xsk_umem(struct kxsk_umem **out_umem, uint32_t ring_size)
{
/* Allocate memory and call driver to create the UMEM. */
struct kxsk_umem *umem = calloc(1,
offsetof(struct kxsk_umem, tx_free_indices)
- + sizeof(umem->tx_free_indices[0]) * FRAME_COUNT_TX);
+ + sizeof(umem->tx_free_indices[0]) * ring_size);
if (umem == NULL) {
return KNOT_ENOMEM;
}
+ umem->ring_size = ring_size;
- size_t frame_count = FRAME_COUNT + (extra_frames ? FRAME_COUNT_RX : 0);
+ /* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size.
+ * However, the performance is better if FQ size == AF_XDP RX size. */
+ const uint32_t FQ_SIZE = umem->ring_size;
+ const uint32_t CQ_SIZE = umem->ring_size;
+ const uint32_t FRAMES = FQ_SIZE + CQ_SIZE;
int ret = posix_memalign((void **)&umem->frames, getpagesize(),
- FRAME_SIZE * frame_count);
+ FRAME_SIZE * FRAMES);
if (ret != 0) {
free(umem);
return KNOT_ENOMEM;
}
- const struct xsk_umem_config config = {
- .fill_size = RING_LEN_FQ,
- .comp_size = RING_LEN_CQ,
+ const struct xsk_umem_config umem_config = {
+ .fill_size = FQ_SIZE,
+ .comp_size = CQ_SIZE,
.frame_size = FRAME_SIZE,
.frame_headroom = KNOT_XDP_PKT_ALIGNMENT,
};
- ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * frame_count,
- &umem->fq, &umem->cq, &config);
+ ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * FRAMES,
+ &umem->fq, &umem->cq, &umem_config);
if (ret != KNOT_EOK) {
free(umem->frames);
free(umem);
@@ -103,23 +108,23 @@ static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames)
*out_umem = umem;
/* Designate the starting chunk of buffers for TX, and put them onto the stack. */
- umem->tx_free_count = FRAME_COUNT_TX;
- for (uint32_t i = 0; i < FRAME_COUNT_TX; ++i) {
+ umem->tx_free_count = CQ_SIZE;
+ for (uint32_t i = 0; i < CQ_SIZE; ++i) {
umem->tx_free_indices[i] = i;
}
/* Designate the rest of buffers for RX, and pass them to the driver. */
uint32_t idx = 0;
- ret = xsk_ring_prod__reserve(&umem->fq, frame_count - FRAME_COUNT_TX, &idx);
- if (ret != frame_count - FRAME_COUNT_TX) {
+ ret = xsk_ring_prod__reserve(&umem->fq, FQ_SIZE, &idx);
+ if (ret != FQ_SIZE) {
assert(0);
return KNOT_ERROR;
}
assert(idx == 0);
- for (uint32_t i = FRAME_COUNT_TX; i < frame_count; ++i) {
+ for (uint32_t i = CQ_SIZE; i < CQ_SIZE + FQ_SIZE; ++i) {
*xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * FRAME_SIZE;
}
- xsk_ring_prod__submit(&umem->fq, frame_count - FRAME_COUNT_TX);
+ xsk_ring_prod__submit(&umem->fq, FQ_SIZE);
return KNOT_EOK;
}
@@ -131,6 +136,33 @@ static void deconfigure_xsk_umem(struct kxsk_umem *umem)
free(umem);
}
+static int enable_busypoll(int socket, unsigned timeout_us, unsigned budget)
+{
+#if defined(SO_PREFER_BUSY_POLL) && defined(SO_BUSY_POLL_BUDGET)
+ int opt_val = 1;
+ if (setsockopt(socket, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ opt_val = timeout_us;
+ if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ opt_val = budget;
+ if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ return KNOT_EOK;
+#else
+ return KNOT_ENOTSUP;
+#endif
+}
+
static int configure_xsk_socket(struct kxsk_umem *umem,
const struct kxsk_iface *iface,
knot_xdp_socket_t **out_sock,
@@ -143,14 +175,14 @@ static int configure_xsk_socket(struct kxsk_umem *umem,
xsk_info->iface = iface;
xsk_info->umem = umem;
- uint16_t bind_flags = 0;
+ uint16_t bind_flags = XDP_USE_NEED_WAKEUP;
if (config != NULL && config->force_copy) {
bind_flags |= XDP_COPY;
}
const struct xsk_socket_config sock_conf = {
- .tx_size = RING_LEN_TX,
- .rx_size = RING_LEN_RX,
+ .tx_size = umem->ring_size,
+ .rx_size = umem->ring_size,
.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD,
.bind_flags = bind_flags,
};
@@ -163,6 +195,17 @@ static int configure_xsk_socket(struct kxsk_umem *umem,
return ret;
}
+ if (config != NULL && config->busy_poll_budget > 0) {
+ ret = enable_busypoll(xsk_socket__fd(xsk_info->xsk),
+ config->busy_poll_timeout, config->busy_poll_budget);
+ if (ret != KNOT_EOK) {
+ xsk_socket__delete(xsk_info->xsk);
+ free(xsk_info);
+ return ret;
+ }
+ xsk_info->busy_poll = true;
+ }
+
*out_sock = xsk_info;
return KNOT_EOK;
}
@@ -172,7 +215,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue,
knot_xdp_filter_flag_t flags, uint16_t udp_port, uint16_t quic_port,
knot_xdp_load_bpf_t load_bpf, const knot_xdp_config_t *xdp_config)
{
- if (socket == NULL || if_name == NULL ||
+ if (socket == NULL || if_name == NULL || !valid_config(xdp_config) ||
(udp_port == quic_port && (flags & KNOT_XDP_FILTER_UDP) && (flags & KNOT_XDP_FILTER_QUIC)) ||
(flags & (KNOT_XDP_FILTER_UDP | KNOT_XDP_FILTER_TCP | KNOT_XDP_FILTER_QUIC)) == 0) {
return KNOT_EINVAL;
@@ -187,7 +230,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue,
/* Initialize shared packet_buffer for umem usage. */
struct kxsk_umem *umem = NULL;
- ret = configure_xsk_umem(&umem, xdp_config->extra_frames);
+ ret = configure_xsk_umem(&umem, ring_size(xdp_config));
if (ret != KNOT_EOK) {
kxsk_iface_free(iface);
return ret;
@@ -266,7 +309,7 @@ static void tx_free_relative(struct kxsk_umem *umem, uint64_t addr_relative)
{
/* The address may not point to *start* of buffer, but `/` solves that. */
uint64_t index = addr_relative / FRAME_SIZE;
- assert(index < FRAME_COUNT);
+ assert(index < umem->ring_size);
umem->tx_free_indices[umem->tx_free_count++] = index;
}
@@ -285,7 +328,7 @@ void knot_xdp_send_prepare(knot_xdp_socket_t *socket)
if (completed == 0) {
return;
}
- assert(umem->tx_free_count + completed <= FRAME_COUNT_TX);
+ assert(umem->tx_free_count + completed <= umem->ring_size);
for (uint32_t i = 0; i < completed; ++i) {
uint64_t addr_relative = *xsk_ring_cons__comp_addr(cq, idx++);
@@ -301,12 +344,13 @@ static struct umem_frame *alloc_tx_frame(knot_xdp_socket_t *socket)
return malloc(sizeof(struct umem_frame));
}
- const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY };
struct kxsk_umem *umem = socket->umem;
- for (int i = 0; unlikely(umem->tx_free_count == 0); i++) {
- if (i == ALLOC_RETRY_NUM) {
- return NULL;
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(umem->tx_free_count == 0)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) {
+ (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, 0);
}
nanosleep(&delay, NULL);
knot_xdp_send_prepare(socket);
@@ -381,9 +425,7 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
}
if (unlikely(socket->send_mock != NULL)) {
int ret = socket->send_mock(socket, msgs, count, sent);
- for (uint32_t i = 0; i < count; ++i) {
- free_unsent(socket, &msgs[i]);
- }
+ knot_xdp_send_free(socket, msgs, count);
return ret;
}
@@ -393,12 +435,13 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
* and the API doesn't allow "cancelling reservations".
* Therefore we handle `socket->tx.cached_prod` by hand.
*/
- if (xsk_prod_nb_free(&socket->tx, count) < count) {
- /* This situation was sometimes observed in the emulated XDP mode. */
- for (uint32_t i = 0; i < count; ++i) {
- free_unsent(socket, &msgs[i]);
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(xsk_prod_nb_free(&socket->tx, count) < count)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) {
+ (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, 0);
}
- return KNOT_ENOBUFS;
+ nanosleep(&delay, NULL);
}
uint32_t idx = socket->tx.cached_prod;
@@ -425,7 +468,6 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
assert(*sent <= count);
socket->tx.cached_prod = idx;
xsk_ring_prod__submit(&socket->tx, *sent);
- socket->kernel_needs_wakeup = true;
return KNOT_EOK;
}
@@ -446,34 +488,19 @@ int knot_xdp_send_finish(knot_xdp_socket_t *socket)
return KNOT_EINVAL;
}
- /* Trigger sending queued packets. */
- if (!socket->kernel_needs_wakeup) {
+ if (!socket->busy_poll && !xsk_ring_prod__needs_wakeup(&socket->tx)) {
return KNOT_EOK;
}
int ret = sendto(xsk_socket__fd(socket->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
- const bool is_ok = (ret >= 0);
- // List of "safe" errors taken from
- // https://github.com/torvalds/linux/blame/master/samples/bpf/xdpsock_user.c
- const bool is_again = !is_ok && (errno == ENOBUFS || errno == EAGAIN
- || errno == EBUSY || errno == ENETDOWN);
- // Some of the !is_ok cases are a little unclear - what to do about the syscall,
- // including how caller of _sendmsg_finish() should react.
- if (is_ok || !is_again) {
- socket->kernel_needs_wakeup = false;
- }
- if (is_again) {
- return KNOT_EAGAIN;
- } else if (is_ok) {
+ if (ret >= 0) {
return KNOT_EOK;
+ } else if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY ||
+ errno == ENETDOWN) {
+ return KNOT_EAGAIN;
} else {
return -errno;
}
- /* This syscall might be avoided with a newer kernel feature (>= 5.4):
- https://www.kernel.org/doc/html/latest/networking/af_xdp.html#xdp-use-need-wakeup-bind-flag
- Unfortunately it's not easy to continue supporting older kernels
- when using this feature on newer ones.
- */
}
_public_
@@ -518,7 +545,7 @@ int knot_xdp_recv(knot_xdp_socket_t *socket, knot_xdp_msg_t msgs[],
static uint8_t *msg_uframe_ptr(const knot_xdp_msg_t *msg)
{
- return NULL + ((msg->payload.iov_base - NULL) & ~(FRAME_SIZE - 1));
+ return (uint8_t *)((uintptr_t)msg->payload.iov_base & ~(FRAME_SIZE - 1));
}
_public_
@@ -529,30 +556,32 @@ void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[]
return;
}
- const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY };
-
struct kxsk_umem *const umem = socket->umem;
struct xsk_ring_prod *const fq = &umem->fq;
uint32_t idx = 0;
- uint32_t reserved = xsk_ring_prod__reserve(fq, count, &idx);
- for (int i = 0; unlikely(reserved < count); i++) {
- if (i == ALLOC_RETRY_NUM) {
- return;
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(xsk_ring_prod__reserve(fq, count, &idx) != count)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(fq)) {
+ (void)recvfrom(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, NULL);
}
nanosleep(&delay, NULL);
- reserved = xsk_ring_prod__reserve(fq, count, &idx);
}
- for (uint32_t i = 0; i < reserved; ++i) {
+ for (uint32_t i = 0; i < count; ++i) {
uint8_t *uframe_p = msg_uframe_ptr(&msgs[i]);
uint64_t offset = uframe_p - umem->frames->bytes;
*xsk_ring_prod__fill_addr(fq, idx++) = offset;
}
- xsk_ring_prod__submit(fq, reserved);
+ xsk_ring_prod__submit(fq, count);
+ // recvfrom() here slightly worsens the performance, poll is called later anyway.
}
+// The number of busy frames
+#define RING_BUSY(ring) ((*(ring)->producer - *(ring)->consumer) & (ring)->mask)
+
_public_
void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
{
@@ -560,10 +589,6 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
return;
}
- // The number of busy frames
- #define RING_BUSY(ring) \
- ((*(ring)->producer - *(ring)->consumer) & (ring)->mask)
-
#define RING_PRINFO(name, ring) \
fprintf(file, "Ring %s: size %4d, busy %4d (prod %4d, cons %4d)\n", \
name, (unsigned)(ring)->size, \
@@ -571,11 +596,11 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
(unsigned)*(ring)->producer, (unsigned)*(ring)->consumer)
const int rx_busyf = RING_BUSY(&socket->umem->fq) + RING_BUSY(&socket->rx);
- fprintf(file, "\nLOST RX frames: %4d", (int)(FRAME_COUNT_RX - rx_busyf));
+ fprintf(file, "\nLOST RX frames: %4d", (int)(socket->umem->ring_size - rx_busyf));
const int tx_busyf = RING_BUSY(&socket->umem->cq) + RING_BUSY(&socket->tx);
const int tx_freef = socket->umem->tx_free_count;
- fprintf(file, "\nLOST TX frames: %4d\n", (int)(FRAME_COUNT_TX - tx_busyf - tx_freef));
+ fprintf(file, "\nLOST TX frames: %4d\n", (int)(socket->umem->ring_size - tx_busyf - tx_freef));
RING_PRINFO("FQ", &socket->umem->fq);
RING_PRINFO("RX", &socket->rx);
@@ -583,3 +608,39 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
RING_PRINFO("CQ", &socket->umem->cq);
fprintf(file, "TX free frames: %4d\n", tx_freef);
}
+
+_public_
+int knot_xdp_socket_stats(knot_xdp_socket_t *socket, knot_xdp_stats_t *stats)
+{
+ if (socket == NULL || stats == NULL) {
+ return KNOT_EINVAL;
+ }
+
+ memset(stats, 0, sizeof(*stats));
+
+ stats->if_name = socket->iface->if_name;
+ stats->if_index = socket->iface->if_index;
+ stats->if_queue = socket->iface->if_queue;
+
+ struct xdp_statistics xdp_stats;
+ socklen_t optlen = sizeof(xdp_stats);
+
+ int fd = knot_xdp_socket_fd(socket);
+ int ret = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &xdp_stats, &optlen);
+ if (ret != 0) {
+ return knot_map_errno();
+ } else if (optlen != sizeof(xdp_stats)) {
+ return KNOT_EINVAL;
+ }
+
+ size_t common_size = MIN(sizeof(xdp_stats), sizeof(stats->socket));
+ memcpy(&stats->socket, &xdp_stats, common_size);
+
+ stats->rings.tx_busy = socket->umem->ring_size - socket->umem->tx_free_count;
+ stats->rings.fq_fill = RING_BUSY(&socket->umem->fq);
+ stats->rings.rx_fill = RING_BUSY(&socket->rx);
+ stats->rings.tx_fill = RING_BUSY(&socket->tx);
+ stats->rings.cq_fill = RING_BUSY(&socket->umem->cq);
+
+ return KNOT_EOK;
+}
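
For readers following the API change, here is a minimal caller sketch (not part of this commit) showing how the new pieces fit together: the runtime `ring_size` and busy-poll fields of knot_xdp_config_t validated by valid_config(), and the new knot_xdp_socket_stats() call. The header path, the KNOT_XDP_LOAD_BPF_MAYBE enumerator, the knot_xdp_deinit() cleanup call, and the exact integer types inside knot_xdp_stats_t are not visible in this diff and are assumptions here.

/* Hypothetical usage sketch, not part of the patch above. Assumes the public
 * header is <libknot/xdp/xdp.h> and that KNOT_XDP_LOAD_BPF_MAYBE and
 * knot_xdp_deinit() are available as elsewhere in libknot. */
#include <stdio.h>
#include <libknot/xdp/xdp.h>

int xdp_demo(const char *if_name)
{
	/* ring_size must be a power of two (see valid_config() in the diff);
	 * a non-zero busy_poll_budget makes configure_xsk_socket() call
	 * enable_busypoll() on the new socket. */
	knot_xdp_config_t config = {
		.ring_size = 4096,
		.busy_poll_timeout = 20,   /* microseconds */
		.busy_poll_budget = 64,
	};

	knot_xdp_socket_t *sock = NULL;
	int ret = knot_xdp_init(&sock, if_name, 0, KNOT_XDP_FILTER_UDP,
	                        53, 0, KNOT_XDP_LOAD_BPF_MAYBE, &config);
	if (ret != KNOT_EOK) {
		return ret;
	}

	/* ... send and receive traffic here ... */

	/* Ring occupancy counters added by this patch; field types are
	 * assumed, so everything is cast to unsigned for printing. */
	knot_xdp_stats_t stats;
	ret = knot_xdp_socket_stats(sock, &stats);
	if (ret == KNOT_EOK) {
		printf("%s queue %u: tx_busy %u, fq %u, rx %u, tx %u, cq %u\n",
		       stats.if_name, (unsigned)stats.if_queue,
		       (unsigned)stats.rings.tx_busy,
		       (unsigned)stats.rings.fq_fill,
		       (unsigned)stats.rings.rx_fill,
		       (unsigned)stats.rings.tx_fill,
		       (unsigned)stats.rings.cq_fill);
	}

	knot_xdp_deinit(sock);
	return ret;
}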