summary refs log tree commit diff stats
path: root/src/libknot/xdp
diff options
context:
space:
mode:
Diffstat (limited to 'src/libknot/xdp')
-rw-r--r-- src/libknot/xdp/Makefile.in 5
-rw-r--r-- src/libknot/xdp/bpf-user.h 12
-rw-r--r-- src/libknot/xdp/tcp.c 266
-rw-r--r-- src/libknot/xdp/tcp.h 13
-rw-r--r-- src/libknot/xdp/xdp.c 253
-rw-r--r-- src/libknot/xdp/xdp.h 58
6 files changed, 355 insertions, 252 deletions
diff --git a/src/libknot/xdp/Makefile.in b/src/libknot/xdp/Makefile.in
index 8aa77ce..f065f3e 100644
--- a/src/libknot/xdp/Makefile.in
+++ b/src/libknot/xdp/Makefile.in
@@ -270,6 +270,8 @@ infodir = @infodir@
install_sh = @install_sh@
libbpf_CFLAGS = @libbpf_CFLAGS@
libbpf_LIBS = @libbpf_LIBS@
+libdbus_CFLAGS = @libdbus_CFLAGS@
+libdbus_LIBS = @libdbus_LIBS@
libdir = @libdir@
libdnssec_SONAME = @libdnssec_SONAME@
libdnssec_SOVERSION = @libdnssec_SOVERSION@
@@ -281,8 +283,6 @@ libfstrm_CFLAGS = @libfstrm_CFLAGS@
libfstrm_LIBS = @libfstrm_LIBS@
libidn2_CFLAGS = @libidn2_CFLAGS@
libidn2_LIBS = @libidn2_LIBS@
-libidn_CFLAGS = @libidn_CFLAGS@
-libidn_LIBS = @libidn_LIBS@
libknot_SONAME = @libknot_SONAME@
libknot_SOVERSION = @libknot_SOVERSION@
libknot_VERSION_INFO = @libknot_VERSION_INFO@
@@ -300,7 +300,6 @@ libprotobuf_c_CFLAGS = @libprotobuf_c_CFLAGS@
libprotobuf_c_LIBS = @libprotobuf_c_LIBS@
liburcu_CFLAGS = @liburcu_CFLAGS@
liburcu_LIBS = @liburcu_LIBS@
-liburcu_PKGCONFIG = @liburcu_PKGCONFIG@
libxdp_CFLAGS = @libxdp_CFLAGS@
libxdp_LIBS = @libxdp_LIBS@
libzscanner_SONAME = @libzscanner_SONAME@
diff --git a/src/libknot/xdp/bpf-user.h b/src/libknot/xdp/bpf-user.h
index 37aac61..b76c9d6 100644
--- a/src/libknot/xdp/bpf-user.h
+++ b/src/libknot/xdp/bpf-user.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2022 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -60,8 +60,10 @@ struct kxsk_umem {
/*! The memory frames. */
struct umem_frame *frames;
+ /*! Size of RX and TX rings. */
+ uint16_t ring_size;
/*! The number of free frames (for TX). */
- uint32_t tx_free_count;
+ uint16_t tx_free_count;
/*! Stack of indices of the free frames (for TX). */
uint16_t tx_free_indices[];
};
@@ -82,15 +84,15 @@ struct knot_xdp_socket {
/*! If non-NULL, it's a mocked socket with this send function. */
int (*send_mock)(struct knot_xdp_socket *, const knot_xdp_msg_t[], uint32_t, uint32_t *);
- /*! The kernel has to be woken up by a syscall indication. */
- bool kernel_needs_wakeup;
-
/*! The limit of frame size. */
unsigned frame_limit;
/*! Mapping of interface indices to VLAN tags. */
uint16_t *vlan_map;
uint16_t vlan_map_max;
+
+ /*! Enabled preferred busy polling. */
+ bool busy_poll;
};
/*!
diff --git a/src/libknot/xdp/tcp.c b/src/libknot/xdp/tcp.c
index eae73a9..d219db9 100644
--- a/src/libknot/xdp/tcp.c
+++ b/src/libknot/xdp/tcp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -287,172 +287,162 @@ static void conn_update(knot_tcp_conn_t *conn, const knot_xdp_msg_t *msg)
}
_public_
-int knot_tcp_recv(knot_tcp_relay_t *relays, knot_xdp_msg_t msgs[], uint32_t msg_count,
+int knot_tcp_recv(knot_tcp_relay_t *relay, knot_xdp_msg_t *msg,
knot_tcp_table_t *tcp_table, knot_tcp_table_t *syn_table,
knot_tcp_ignore_t ignore)
{
- if (msg_count == 0) {
- return KNOT_EOK;
- }
- if (relays == NULL || msgs == NULL || tcp_table == NULL) {
+ if (relay == NULL || msg == NULL || tcp_table == NULL) {
return KNOT_EINVAL;
}
- memset(relays, 0, msg_count * sizeof(*relays));
+ memset(relay, 0, sizeof(*relay));
- knot_tcp_relay_t *relay = relays;
int ret = KNOT_EOK;
- for (knot_xdp_msg_t *msg = msgs; msg != msgs + msg_count && ret == KNOT_EOK; msg++) {
- if (!(msg->flags & KNOT_XDP_MSG_TCP)) {
- continue;
- }
+ if (!(msg->flags & KNOT_XDP_MSG_TCP)) {
+ return KNOT_EOK;
+ }
- uint64_t conn_hash = 0;
- knot_tcp_conn_t **pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to,
- &conn_hash, tcp_table);
- knot_tcp_conn_t *conn = *pconn;
- bool seq_ack_match = check_seq_ack(msg, conn);
- if (seq_ack_match) {
- assert(conn->mss != 0);
- conn_update(conn, msg);
-
- rem_align_pointers(conn, tcp_table);
- rem_node(tcp_conn_node(conn));
- add_tail(tcp_table_timeout(tcp_table), tcp_conn_node(conn));
-
- if (msg->flags & KNOT_XDP_MSG_ACK) {
- conn->acked = msg->ackno;
- knot_tcp_outbufs_ack(&conn->outbufs, msg->ackno, &tcp_table->outbufs_total);
- }
+ uint64_t conn_hash = 0;
+ knot_tcp_conn_t **pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to,
+ &conn_hash, tcp_table);
+ knot_tcp_conn_t *conn = *pconn;
+ bool seq_ack_match = check_seq_ack(msg, conn);
+ if (seq_ack_match) {
+ assert(conn->mss != 0);
+ conn_update(conn, msg);
+
+ rem_align_pointers(conn, tcp_table);
+ rem_node(tcp_conn_node(conn));
+ add_tail(tcp_table_timeout(tcp_table), tcp_conn_node(conn));
+
+ if (msg->flags & KNOT_XDP_MSG_ACK) {
+ conn->acked = msg->ackno;
+ knot_tcp_outbufs_ack(&conn->outbufs, msg->ackno, &tcp_table->outbufs_total);
}
+ }
- relay->msg = msg;
- relay->conn = conn;
+ relay->msg = msg;
+ relay->conn = conn;
- // process incoming data
- if (seq_ack_match && (msg->flags & KNOT_XDP_MSG_ACK) && msg->payload.iov_len > 0) {
- if (!(ignore & XDP_TCP_IGNORE_DATA_ACK)) {
- relay->auto_answer = KNOT_XDP_MSG_ACK;
- }
- ret = knot_tcp_inbufs_upd(&conn->inbuf, msg->payload, false,
- &relay->inbf, &tcp_table->inbufs_total);
- if (ret != KNOT_EOK) {
- break;
- }
- if (conn->inbuf.iov_len > 0 && tcp_table->next_ibuf == NULL) {
- tcp_table->next_ibuf = conn;
- }
+ // process incoming data
+ if (seq_ack_match && (msg->flags & KNOT_XDP_MSG_ACK) && msg->payload.iov_len > 0) {
+ if (!(ignore & XDP_TCP_IGNORE_DATA_ACK)) {
+ relay->auto_answer = KNOT_XDP_MSG_ACK;
+ }
+ ret = knot_tcp_inbufs_upd(&conn->inbuf, msg->payload, false,
+ &relay->inbf, &tcp_table->inbufs_total);
+ if (ret != KNOT_EOK) {
+ return ret;
+ }
+ if (conn->inbuf.iov_len > 0 && tcp_table->next_ibuf == NULL) {
+ tcp_table->next_ibuf = conn;
}
+ }
- // process TCP connection state
- switch (msg->flags & (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK |
- KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_RST)) {
- case KNOT_XDP_MSG_SYN:
- case (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK):
- if (conn == NULL) {
- bool synack = (msg->flags & KNOT_XDP_MSG_ACK);
-
- knot_tcp_table_t *add_table = tcp_table;
- if (syn_table != NULL) {
- if (synack) {
- break; // creating conn based on SYN+ACK is only for kxdpgun, disallow in knotd
- }
- add_table = syn_table;
- if (*tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table) != NULL) {
- break;
- }
- }
+ // process TCP connection state
+ switch (msg->flags & (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK |
+ KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_RST)) {
+ case KNOT_XDP_MSG_SYN:
+ case (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK):
+ if (conn == NULL) {
+ bool synack = (msg->flags & KNOT_XDP_MSG_ACK);
- ret = tcp_table_add(msg, conn_hash, add_table, &relay->conn);
- if (ret == KNOT_EOK) {
- relay->action = synack ? XDP_TCP_ESTABLISH : XDP_TCP_SYN;
- if (!(ignore & XDP_TCP_IGNORE_ESTABLISH)) {
- relay->auto_answer = synack ? KNOT_XDP_MSG_ACK : (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK);
- }
-
- conn = relay->conn;
- conn->state = synack ? XDP_TCP_NORMAL: XDP_TCP_ESTABLISHING;
- conn->mss = MAX(msg->mss, 536); // minimal MSS, most importantly not zero!
- conn->window_scale = msg->win_scale;
- conn_update(conn, msg);
- if (!synack) {
- conn->acked = dnssec_random_uint32_t();
- conn->ackno = conn->acked;
- }
+ knot_tcp_table_t *add_table = tcp_table;
+ if (syn_table != NULL) {
+ if (synack) {
+ break; // creating conn based on SYN+ACK is only for kxdpgun, disallow in knotd
+ }
+ add_table = syn_table;
+ if (*tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table) != NULL) {
+ break;
}
- } else {
- relay->auto_answer = KNOT_XDP_MSG_ACK;
}
- break;
- case KNOT_XDP_MSG_ACK:
- if (!seq_ack_match) {
- if (syn_table != NULL && msg->payload.iov_len == 0 && conn == NULL &&
- (pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table)) != NULL &&
- (conn = *pconn) != NULL && check_seq_ack(msg, conn)) {
- // move conn from syn_table to tcp_table
- tcp_table_remove(pconn, syn_table);
- tcp_table_insert(conn, conn_hash, tcp_table);
- relay->conn = conn;
- relay->action = XDP_TCP_ESTABLISH;
- conn->state = XDP_TCP_NORMAL;
- conn_update(conn, msg);
+
+ ret = tcp_table_add(msg, conn_hash, add_table, &relay->conn);
+ if (ret == KNOT_EOK) {
+ relay->action = synack ? XDP_TCP_ESTABLISH : XDP_TCP_SYN;
+ if (!(ignore & XDP_TCP_IGNORE_ESTABLISH)) {
+ relay->auto_answer = synack ? KNOT_XDP_MSG_ACK : (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK);
}
- } else {
- switch (conn->state) {
- case XDP_TCP_NORMAL:
- case XDP_TCP_CLOSING1: // just a mess, ignore
- break;
- case XDP_TCP_ESTABLISHING:
- conn->state = XDP_TCP_NORMAL;
- relay->action = XDP_TCP_ESTABLISH;
- break;
- case XDP_TCP_CLOSING2:
- if (msg->payload.iov_len == 0) { // otherwise ignore close
- tcp_table_remove(pconn, tcp_table);
- relay->answer = XDP_TCP_FREE;
- }
- break;
+
+ conn = relay->conn;
+ conn->state = synack ? XDP_TCP_NORMAL: XDP_TCP_ESTABLISHING;
+ conn->mss = MAX(msg->mss, 536); // minimal MSS, most importantly not zero!
+ conn->window_scale = msg->win_scale;
+ conn_update(conn, msg);
+ if (!synack) {
+ conn->acked = dnssec_random_uint32_t();
+ conn->ackno = conn->acked;
}
}
- break;
- case (KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK):
- if (ignore & XDP_TCP_IGNORE_FIN) {
- break;
+ } else {
+ relay->auto_answer = KNOT_XDP_MSG_ACK;
+ }
+ break;
+ case KNOT_XDP_MSG_ACK:
+ if (!seq_ack_match) {
+ if (syn_table != NULL && msg->payload.iov_len == 0 && conn == NULL &&
+ (pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table)) != NULL &&
+ (conn = *pconn) != NULL && check_seq_ack(msg, conn)) {
+ // move conn from syn_table to tcp_table
+ tcp_table_remove(pconn, syn_table);
+ tcp_table_insert(conn, conn_hash, tcp_table);
+ relay->conn = conn;
+ relay->action = XDP_TCP_ESTABLISH;
+ conn->state = XDP_TCP_NORMAL;
+ conn_update(conn, msg);
}
- if (!seq_ack_match) {
- if (conn != NULL) {
- relay->auto_answer = KNOT_XDP_MSG_RST;
- relay->auto_seqno = msg->ackno;
- } // else ignore. It would be better and possible, but no big value for the price of CPU.
- } else {
- if (conn->state == XDP_TCP_CLOSING1) {
- relay->action = XDP_TCP_CLOSE;
- relay->auto_answer = KNOT_XDP_MSG_ACK;
- relay->answer = XDP_TCP_FREE;
+ } else {
+ switch (conn->state) {
+ case XDP_TCP_NORMAL:
+ case XDP_TCP_CLOSING1: // just a mess, ignore
+ break;
+ case XDP_TCP_ESTABLISHING:
+ conn->state = XDP_TCP_NORMAL;
+ relay->action = XDP_TCP_ESTABLISH;
+ break;
+ case XDP_TCP_CLOSING2:
+ if (msg->payload.iov_len == 0) { // otherwise ignore close
tcp_table_remove(pconn, tcp_table);
- } else if (msg->payload.iov_len == 0) { // otherwise ignore FIN
- relay->action = XDP_TCP_CLOSE;
- relay->auto_answer = KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK;
- conn->state = XDP_TCP_CLOSING2;
+ relay->answer = XDP_TCP_FREE;
}
+ break;
}
+ }
+ break;
+ case (KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK):
+ if (ignore & XDP_TCP_IGNORE_FIN) {
break;
- case KNOT_XDP_MSG_RST:
- if (conn != NULL && msg->seqno == conn->seqno) {
- relay->action = XDP_TCP_RESET;
- tcp_table_remove(pconn, tcp_table);
- relay->answer = XDP_TCP_FREE;
- } else if (conn != NULL) {
+ }
+ if (!seq_ack_match) {
+ if (conn != NULL) {
+ relay->auto_answer = KNOT_XDP_MSG_RST;
+ relay->auto_seqno = msg->ackno;
+ } // else ignore. It would be better and possible, but no big value for the price of CPU.
+ } else {
+ if (conn->state == XDP_TCP_CLOSING1) {
+ relay->action = XDP_TCP_CLOSE;
relay->auto_answer = KNOT_XDP_MSG_ACK;
+ relay->answer = XDP_TCP_FREE;
+ tcp_table_remove(pconn, tcp_table);
+ } else if (msg->payload.iov_len == 0) { // otherwise ignore FIN
+ relay->action = XDP_TCP_CLOSE;
+ relay->auto_answer = KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK;
+ conn->state = XDP_TCP_CLOSING2;
}
- break;
- default:
- break;
}
-
- if (!knot_tcp_relay_empty(relay)) {
- relay++;
+ break;
+ case KNOT_XDP_MSG_RST:
+ if (conn != NULL && msg->seqno == conn->seqno) {
+ relay->action = XDP_TCP_RESET;
+ tcp_table_remove(pconn, tcp_table);
+ relay->answer = XDP_TCP_FREE;
+ } else if (conn != NULL) {
+ relay->auto_answer = KNOT_XDP_MSG_ACK;
}
+ break;
+ default:
+ break;
}
return ret;
diff --git a/src/libknot/xdp/tcp.h b/src/libknot/xdp/tcp.h
index 09fe652..39a30fd 100644
--- a/src/libknot/xdp/tcp.h
+++ b/src/libknot/xdp/tcp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -148,18 +148,19 @@ knot_tcp_table_t *knot_tcp_table_new(size_t size, knot_tcp_table_t *secret_share
void knot_tcp_table_free(knot_tcp_table_t *table);
/*!
- * \brief Process received packets, prepare automatic responses (e.g. ACK), pick incoming data.
+ * \brief Process received packet, prepare automatic response (e.g. ACK), pick incoming data.
*
- * \param relays Out: relays to be filled with message/connection details.
- * \param msgs Packets received by knot_xdp_recv().
- * \param msg_count Number of received packets.
+ * \param relay Out: relay to be filled with message/connection details.
+ * \param msg Packet received by knot_xdp_recv().
* \param tcp_table Table of TCP connections.
* \param syn_table Optional: extra table for handling partially established connections.
* \param ignore Ignore specific TCP packets indication.
*
+ * \note resulting relay might be knot_tcp_relay_empty()
+ *
* \return KNOT_E*
*/
-int knot_tcp_recv(knot_tcp_relay_t *relays, knot_xdp_msg_t msgs[], uint32_t msg_count,
+int knot_tcp_recv(knot_tcp_relay_t *relay, knot_xdp_msg_t *msg,
knot_tcp_table_t *tcp_table, knot_tcp_table_t *syn_table,
knot_tcp_ignore_t ignore);
diff --git a/src/libknot/xdp/xdp.c b/src/libknot/xdp/xdp.c
index 8286884..132f5c4 100644
--- a/src/libknot/xdp/xdp.c
+++ b/src/libknot/xdp/xdp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -38,63 +38,68 @@
#include "contrib/net.h"
#define FRAME_SIZE 2048
-
-#define FRAME_COUNT_TX 2048
-#define FRAME_COUNT_RX 2048
-#define FRAME_COUNT (FRAME_COUNT_TX + FRAME_COUNT_RX)
-
-#define RING_LEN_TX FRAME_COUNT_TX
-#define RING_LEN_CQ FRAME_COUNT_TX
-#define RING_LEN_RX FRAME_COUNT_RX
-/* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size. */
-#define RING_LEN_FQ (2 * FRAME_COUNT_RX)
-
-#define ALLOC_RETRY_NUM 15
-#define ALLOC_RETRY_DELAY 20 // In nanoseconds.
-
-/* With recent compilers we statically check #defines for settings that
- * get refused by AF_XDP drivers (in current versions, at least). */
-#if (__STDC_VERSION__ >= 201112L)
-#define IS_POWER_OF_2(n) (((n) & (n - 1)) == 0)
-_Static_assert((FRAME_SIZE == 4096 || FRAME_SIZE == 2048)
- && IS_POWER_OF_2(RING_LEN_TX) && IS_POWER_OF_2(RING_LEN_RX)
- && IS_POWER_OF_2(RING_LEN_CQ) && IS_POWER_OF_2(RING_LEN_FQ)
- && FRAME_COUNT_TX <= (1 << 16) /* see tx_free_indices */
- , "Incorrect #define combination for AF_XDP.");
-#endif
+#define DEFAULT_RING_SIZE 2048
+#define RETRY_DELAY 20 // In nanoseconds.
struct umem_frame {
uint8_t bytes[FRAME_SIZE];
};
-static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames)
+static bool valid_config(const knot_xdp_config_t *config)
+{
+ if (FRAME_SIZE != 2048 && FRAME_SIZE != 4096) {
+ return false;
+ }
+
+ if (config == NULL) {
+ return true;
+ }
+
+ if ((config->ring_size & (config->ring_size - 1)) != 0) {
+ return false;
+ }
+
+ return true;
+}
+
+static uint32_t ring_size(const knot_xdp_config_t *config)
+{
+ return config != NULL ? config->ring_size : DEFAULT_RING_SIZE;
+}
+
+static int configure_xsk_umem(struct kxsk_umem **out_umem, uint32_t ring_size)
{
/* Allocate memory and call driver to create the UMEM. */
struct kxsk_umem *umem = calloc(1,
offsetof(struct kxsk_umem, tx_free_indices)
- + sizeof(umem->tx_free_indices[0]) * FRAME_COUNT_TX);
+ + sizeof(umem->tx_free_indices[0]) * ring_size);
if (umem == NULL) {
return KNOT_ENOMEM;
}
+ umem->ring_size = ring_size;
- size_t frame_count = FRAME_COUNT + (extra_frames ? FRAME_COUNT_RX : 0);
+ /* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size.
+ * However, the performance is better if FQ size == AF_XDP RX size. */
+ const uint32_t FQ_SIZE = umem->ring_size;
+ const uint32_t CQ_SIZE = umem->ring_size;
+ const uint32_t FRAMES = FQ_SIZE + CQ_SIZE;
int ret = posix_memalign((void **)&umem->frames, getpagesize(),
- FRAME_SIZE * frame_count);
+ FRAME_SIZE * FRAMES);
if (ret != 0) {
free(umem);
return KNOT_ENOMEM;
}
- const struct xsk_umem_config config = {
- .fill_size = RING_LEN_FQ,
- .comp_size = RING_LEN_CQ,
+ const struct xsk_umem_config umem_config = {
+ .fill_size = FQ_SIZE,
+ .comp_size = CQ_SIZE,
.frame_size = FRAME_SIZE,
.frame_headroom = KNOT_XDP_PKT_ALIGNMENT,
};
- ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * frame_count,
- &umem->fq, &umem->cq, &config);
+ ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * FRAMES,
+ &umem->fq, &umem->cq, &umem_config);
if (ret != KNOT_EOK) {
free(umem->frames);
free(umem);
@@ -103,23 +108,23 @@ static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames)
*out_umem = umem;
/* Designate the starting chunk of buffers for TX, and put them onto the stack. */
- umem->tx_free_count = FRAME_COUNT_TX;
- for (uint32_t i = 0; i < FRAME_COUNT_TX; ++i) {
+ umem->tx_free_count = CQ_SIZE;
+ for (uint32_t i = 0; i < CQ_SIZE; ++i) {
umem->tx_free_indices[i] = i;
}
/* Designate the rest of buffers for RX, and pass them to the driver. */
uint32_t idx = 0;
- ret = xsk_ring_prod__reserve(&umem->fq, frame_count - FRAME_COUNT_TX, &idx);
- if (ret != frame_count - FRAME_COUNT_TX) {
+ ret = xsk_ring_prod__reserve(&umem->fq, FQ_SIZE, &idx);
+ if (ret != FQ_SIZE) {
assert(0);
return KNOT_ERROR;
}
assert(idx == 0);
- for (uint32_t i = FRAME_COUNT_TX; i < frame_count; ++i) {
+ for (uint32_t i = CQ_SIZE; i < CQ_SIZE + FQ_SIZE; ++i) {
*xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * FRAME_SIZE;
}
- xsk_ring_prod__submit(&umem->fq, frame_count - FRAME_COUNT_TX);
+ xsk_ring_prod__submit(&umem->fq, FQ_SIZE);
return KNOT_EOK;
}
@@ -131,6 +136,33 @@ static void deconfigure_xsk_umem(struct kxsk_umem *umem)
free(umem);
}
+static int enable_busypoll(int socket, unsigned timeout_us, unsigned budget)
+{
+#if defined(SO_PREFER_BUSY_POLL) && defined(SO_BUSY_POLL_BUDGET)
+ int opt_val = 1;
+ if (setsockopt(socket, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ opt_val = timeout_us;
+ if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ opt_val = budget;
+ if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
+ &opt_val, sizeof(opt_val)) != 0) {
+ return knot_map_errno();
+ }
+
+ return KNOT_EOK;
+#else
+ return KNOT_ENOTSUP;
+#endif
+}
+
static int configure_xsk_socket(struct kxsk_umem *umem,
const struct kxsk_iface *iface,
knot_xdp_socket_t **out_sock,
@@ -143,14 +175,14 @@ static int configure_xsk_socket(struct kxsk_umem *umem,
xsk_info->iface = iface;
xsk_info->umem = umem;
- uint16_t bind_flags = 0;
+ uint16_t bind_flags = XDP_USE_NEED_WAKEUP;
if (config != NULL && config->force_copy) {
bind_flags |= XDP_COPY;
}
const struct xsk_socket_config sock_conf = {
- .tx_size = RING_LEN_TX,
- .rx_size = RING_LEN_RX,
+ .tx_size = umem->ring_size,
+ .rx_size = umem->ring_size,
.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD,
.bind_flags = bind_flags,
};
@@ -163,6 +195,17 @@ static int configure_xsk_socket(struct kxsk_umem *umem,
return ret;
}
+ if (config != NULL && config->busy_poll_budget > 0) {
+ ret = enable_busypoll(xsk_socket__fd(xsk_info->xsk),
+ config->busy_poll_timeout, config->busy_poll_budget);
+ if (ret != KNOT_EOK) {
+ xsk_socket__delete(xsk_info->xsk);
+ free(xsk_info);
+ return ret;
+ }
+ xsk_info->busy_poll = true;
+ }
+
*out_sock = xsk_info;
return KNOT_EOK;
}
@@ -172,7 +215,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue,
knot_xdp_filter_flag_t flags, uint16_t udp_port, uint16_t quic_port,
knot_xdp_load_bpf_t load_bpf, const knot_xdp_config_t *xdp_config)
{
- if (socket == NULL || if_name == NULL ||
+ if (socket == NULL || if_name == NULL || !valid_config(xdp_config) ||
(udp_port == quic_port && (flags & KNOT_XDP_FILTER_UDP) && (flags & KNOT_XDP_FILTER_QUIC)) ||
(flags & (KNOT_XDP_FILTER_UDP | KNOT_XDP_FILTER_TCP | KNOT_XDP_FILTER_QUIC)) == 0) {
return KNOT_EINVAL;
@@ -187,7 +230,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue,
/* Initialize shared packet_buffer for umem usage. */
struct kxsk_umem *umem = NULL;
- ret = configure_xsk_umem(&umem, xdp_config->extra_frames);
+ ret = configure_xsk_umem(&umem, ring_size(xdp_config));
if (ret != KNOT_EOK) {
kxsk_iface_free(iface);
return ret;
@@ -266,7 +309,7 @@ static void tx_free_relative(struct kxsk_umem *umem, uint64_t addr_relative)
{
/* The address may not point to *start* of buffer, but `/` solves that. */
uint64_t index = addr_relative / FRAME_SIZE;
- assert(index < FRAME_COUNT);
+ assert(index < umem->ring_size);
umem->tx_free_indices[umem->tx_free_count++] = index;
}
@@ -285,7 +328,7 @@ void knot_xdp_send_prepare(knot_xdp_socket_t *socket)
if (completed == 0) {
return;
}
- assert(umem->tx_free_count + completed <= FRAME_COUNT_TX);
+ assert(umem->tx_free_count + completed <= umem->ring_size);
for (uint32_t i = 0; i < completed; ++i) {
uint64_t addr_relative = *xsk_ring_cons__comp_addr(cq, idx++);
@@ -301,12 +344,13 @@ static struct umem_frame *alloc_tx_frame(knot_xdp_socket_t *socket)
return malloc(sizeof(struct umem_frame));
}
- const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY };
struct kxsk_umem *umem = socket->umem;
- for (int i = 0; unlikely(umem->tx_free_count == 0); i++) {
- if (i == ALLOC_RETRY_NUM) {
- return NULL;
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(umem->tx_free_count == 0)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) {
+ (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, 0);
}
nanosleep(&delay, NULL);
knot_xdp_send_prepare(socket);
@@ -381,9 +425,7 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
}
if (unlikely(socket->send_mock != NULL)) {
int ret = socket->send_mock(socket, msgs, count, sent);
- for (uint32_t i = 0; i < count; ++i) {
- free_unsent(socket, &msgs[i]);
- }
+ knot_xdp_send_free(socket, msgs, count);
return ret;
}
@@ -393,12 +435,13 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
* and the API doesn't allow "cancelling reservations".
* Therefore we handle `socket->tx.cached_prod` by hand.
*/
- if (xsk_prod_nb_free(&socket->tx, count) < count) {
- /* This situation was sometimes observed in the emulated XDP mode. */
- for (uint32_t i = 0; i < count; ++i) {
- free_unsent(socket, &msgs[i]);
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(xsk_prod_nb_free(&socket->tx, count) < count)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) {
+ (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, 0);
}
- return KNOT_ENOBUFS;
+ nanosleep(&delay, NULL);
}
uint32_t idx = socket->tx.cached_prod;
@@ -425,7 +468,6 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[],
assert(*sent <= count);
socket->tx.cached_prod = idx;
xsk_ring_prod__submit(&socket->tx, *sent);
- socket->kernel_needs_wakeup = true;
return KNOT_EOK;
}
@@ -446,34 +488,19 @@ int knot_xdp_send_finish(knot_xdp_socket_t *socket)
return KNOT_EINVAL;
}
- /* Trigger sending queued packets. */
- if (!socket->kernel_needs_wakeup) {
+ if (!socket->busy_poll && !xsk_ring_prod__needs_wakeup(&socket->tx)) {
return KNOT_EOK;
}
int ret = sendto(xsk_socket__fd(socket->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
- const bool is_ok = (ret >= 0);
- // List of "safe" errors taken from
- // https://github.com/torvalds/linux/blame/master/samples/bpf/xdpsock_user.c
- const bool is_again = !is_ok && (errno == ENOBUFS || errno == EAGAIN
- || errno == EBUSY || errno == ENETDOWN);
- // Some of the !is_ok cases are a little unclear - what to do about the syscall,
- // including how caller of _sendmsg_finish() should react.
- if (is_ok || !is_again) {
- socket->kernel_needs_wakeup = false;
- }
- if (is_again) {
- return KNOT_EAGAIN;
- } else if (is_ok) {
+ if (ret >= 0) {
return KNOT_EOK;
+ } else if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY ||
+ errno == ENETDOWN) {
+ return KNOT_EAGAIN;
} else {
return -errno;
}
- /* This syscall might be avoided with a newer kernel feature (>= 5.4):
- https://www.kernel.org/doc/html/latest/networking/af_xdp.html#xdp-use-need-wakeup-bind-flag
- Unfortunately it's not easy to continue supporting older kernels
- when using this feature on newer ones.
- */
}
_public_
@@ -518,7 +545,7 @@ int knot_xdp_recv(knot_xdp_socket_t *socket, knot_xdp_msg_t msgs[],
static uint8_t *msg_uframe_ptr(const knot_xdp_msg_t *msg)
{
- return NULL + ((msg->payload.iov_base - NULL) & ~(FRAME_SIZE - 1));
+ return (uint8_t *)((uintptr_t)msg->payload.iov_base & ~(FRAME_SIZE - 1));
}
_public_
@@ -529,30 +556,32 @@ void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[]
return;
}
- const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY };
-
struct kxsk_umem *const umem = socket->umem;
struct xsk_ring_prod *const fq = &umem->fq;
uint32_t idx = 0;
- uint32_t reserved = xsk_ring_prod__reserve(fq, count, &idx);
- for (int i = 0; unlikely(reserved < count); i++) {
- if (i == ALLOC_RETRY_NUM) {
- return;
+ const struct timespec delay = { .tv_nsec = RETRY_DELAY };
+ while (unlikely(xsk_ring_prod__reserve(fq, count, &idx) != count)) {
+ if (socket->busy_poll || xsk_ring_prod__needs_wakeup(fq)) {
+ (void)recvfrom(xsk_socket__fd(socket->xsk), NULL, 0,
+ MSG_DONTWAIT, NULL, NULL);
}
nanosleep(&delay, NULL);
- reserved = xsk_ring_prod__reserve(fq, count, &idx);
}
- for (uint32_t i = 0; i < reserved; ++i) {
+ for (uint32_t i = 0; i < count; ++i) {
uint8_t *uframe_p = msg_uframe_ptr(&msgs[i]);
uint64_t offset = uframe_p - umem->frames->bytes;
*xsk_ring_prod__fill_addr(fq, idx++) = offset;
}
- xsk_ring_prod__submit(fq, reserved);
+ xsk_ring_prod__submit(fq, count);
+ // recvfrom() here slightly worsens the performance, poll is called later anyway.
}
+// The number of busy frames
+#define RING_BUSY(ring) ((*(ring)->producer - *(ring)->consumer) & (ring)->mask)
+
_public_
void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
{
@@ -560,10 +589,6 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
return;
}
- // The number of busy frames
- #define RING_BUSY(ring) \
- ((*(ring)->producer - *(ring)->consumer) & (ring)->mask)
-
#define RING_PRINFO(name, ring) \
fprintf(file, "Ring %s: size %4d, busy %4d (prod %4d, cons %4d)\n", \
name, (unsigned)(ring)->size, \
@@ -571,11 +596,11 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
(unsigned)*(ring)->producer, (unsigned)*(ring)->consumer)
const int rx_busyf = RING_BUSY(&socket->umem->fq) + RING_BUSY(&socket->rx);
- fprintf(file, "\nLOST RX frames: %4d", (int)(FRAME_COUNT_RX - rx_busyf));
+ fprintf(file, "\nLOST RX frames: %4d", (int)(socket->umem->ring_size - rx_busyf));
const int tx_busyf = RING_BUSY(&socket->umem->cq) + RING_BUSY(&socket->tx);
const int tx_freef = socket->umem->tx_free_count;
- fprintf(file, "\nLOST TX frames: %4d\n", (int)(FRAME_COUNT_TX - tx_busyf - tx_freef));
+ fprintf(file, "\nLOST TX frames: %4d\n", (int)(socket->umem->ring_size - tx_busyf - tx_freef));
RING_PRINFO("FQ", &socket->umem->fq);
RING_PRINFO("RX", &socket->rx);
@@ -583,3 +608,39 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file)
RING_PRINFO("CQ", &socket->umem->cq);
fprintf(file, "TX free frames: %4d\n", tx_freef);
}
+
+_public_
+int knot_xdp_socket_stats(knot_xdp_socket_t *socket, knot_xdp_stats_t *stats)
+{
+ if (socket == NULL || stats == NULL) {
+ return KNOT_EINVAL;
+ }
+
+ memset(stats, 0, sizeof(*stats));
+
+ stats->if_name = socket->iface->if_name;
+ stats->if_index = socket->iface->if_index;
+ stats->if_queue = socket->iface->if_queue;
+
+ struct xdp_statistics xdp_stats;
+ socklen_t optlen = sizeof(xdp_stats);
+
+ int fd = knot_xdp_socket_fd(socket);
+ int ret = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &xdp_stats, &optlen);
+ if (ret != 0) {
+ return knot_map_errno();
+ } else if (optlen != sizeof(xdp_stats)) {
+ return KNOT_EINVAL;
+ }
+
+ size_t common_size = MIN(sizeof(xdp_stats), sizeof(stats->socket));
+ memcpy(&stats->socket, &xdp_stats, common_size);
+
+ stats->rings.tx_busy = socket->umem->ring_size - socket->umem->tx_free_count;
+ stats->rings.fq_fill = RING_BUSY(&socket->umem->fq);
+ stats->rings.rx_fill = RING_BUSY(&socket->rx);
+ stats->rings.tx_fill = RING_BUSY(&socket->tx);
+ stats->rings.cq_fill = RING_BUSY(&socket->umem->cq);
+
+ return KNOT_EOK;
+}
diff --git a/src/libknot/xdp/xdp.h b/src/libknot/xdp/xdp.h
index 6c8bb1e..5944d44 100644
--- a/src/libknot/xdp/xdp.h
+++ b/src/libknot/xdp/xdp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -53,14 +53,54 @@ typedef struct knot_xdp_socket knot_xdp_socket_t;
/*! \brief Configuration of XDP socket. */
struct knot_xdp_config {
- bool force_generic; /*!< Use generic XDP mode (avoid driver/hadrware implementation). */
- bool force_copy; /*!< Force copying packet data between kernel and user-space (avoid zero-copy). */
- bool extra_frames; /*!< Extra FQ frames. */
+ uint16_t ring_size; /*!< Size of RX and TX rings (must be power of 2). */
+ bool force_generic; /*!< Use generic XDP mode (avoid driver/hardware implementation). */
+ bool force_copy; /*!< Force copying packet data between kernel and user-space (avoid zero-copy). */
+ unsigned busy_poll_timeout; /*!< Preferred busy poll timeout (in microseconds). */
+ unsigned busy_poll_budget; /*!< Preferred busy poll budget (0 means disabled). */
};
/*! \brief Configuration of XDP socket. */
typedef struct knot_xdp_config knot_xdp_config_t;
+/*! \brief Various statistics of an XDP socket (optimally kernel >=5.9). */
+typedef struct {
+ /*! Interface name. */
+ const char *if_name;
+ /*! Interface name index (derived from ifname). */
+ int if_index;
+ /*! Network card queue id. */
+ unsigned if_queue;
+ /*! Counters (xdp_statistics) retrieved from the kernel via XDP_STATISTICS. */
+ struct {
+ /*! Dropped for other reasons. */
+ uint64_t rx_dropped;
+ /*! Dropped due to invalid descriptor. */
+ uint64_t rx_invalid;
+ /*! Dropped due to invalid descriptor. */
+ uint64_t tx_invalid;
+ /*! Dropped due to rx ring being full. */
+ uint64_t rx_full;
+ /*! Failed to retrieve item from fill ring. */
+ uint64_t fq_empty;
+ /*! Failed to retrieve item from tx ring. */
+ uint64_t tx_empty;
+ } socket;
+ /*! States of rings of the XDP socket. */
+ struct {
+ /*! Busy TX buffers. */
+ uint16_t tx_busy;
+ /*! Free buffers to consume from FQ ring. */
+ uint16_t fq_fill;
+ /*! Pending buffers in RX ring. */
+ uint16_t rx_fill;
+ /*! Pending buffers in TX ring. */
+ uint16_t tx_fill;
+ /*! Pending buffers in CQ ring. */
+ uint16_t cq_fill;
+ } rings;
+} knot_xdp_stats_t;
+
/*!
* \brief Initialize XDP socket.
*
@@ -196,4 +236,14 @@ void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[]
*/
void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file);
+/*!
+ * \brief Gets various statistics of the XDP socket.
+ *
+ * \param socket XDP socket.
+ * \param stats Output structure.
+ *
+ * \return KNOT_E*
+ */
+int knot_xdp_socket_stats(knot_xdp_socket_t *socket, knot_xdp_stats_t *stats);
+
/*! @} */