diff options
Diffstat (limited to 'src/libknot/xdp')
-rw-r--r-- | src/libknot/xdp/Makefile.in | 5 | ||||
-rw-r--r-- | src/libknot/xdp/bpf-user.h | 12 | ||||
-rw-r--r-- | src/libknot/xdp/tcp.c | 266 | ||||
-rw-r--r-- | src/libknot/xdp/tcp.h | 13 | ||||
-rw-r--r-- | src/libknot/xdp/xdp.c | 253 | ||||
-rw-r--r-- | src/libknot/xdp/xdp.h | 58 |
6 files changed, 355 insertions, 252 deletions
diff --git a/src/libknot/xdp/Makefile.in b/src/libknot/xdp/Makefile.in index 8aa77ce..f065f3e 100644 --- a/src/libknot/xdp/Makefile.in +++ b/src/libknot/xdp/Makefile.in @@ -270,6 +270,8 @@ infodir = @infodir@ install_sh = @install_sh@ libbpf_CFLAGS = @libbpf_CFLAGS@ libbpf_LIBS = @libbpf_LIBS@ +libdbus_CFLAGS = @libdbus_CFLAGS@ +libdbus_LIBS = @libdbus_LIBS@ libdir = @libdir@ libdnssec_SONAME = @libdnssec_SONAME@ libdnssec_SOVERSION = @libdnssec_SOVERSION@ @@ -281,8 +283,6 @@ libfstrm_CFLAGS = @libfstrm_CFLAGS@ libfstrm_LIBS = @libfstrm_LIBS@ libidn2_CFLAGS = @libidn2_CFLAGS@ libidn2_LIBS = @libidn2_LIBS@ -libidn_CFLAGS = @libidn_CFLAGS@ -libidn_LIBS = @libidn_LIBS@ libknot_SONAME = @libknot_SONAME@ libknot_SOVERSION = @libknot_SOVERSION@ libknot_VERSION_INFO = @libknot_VERSION_INFO@ @@ -300,7 +300,6 @@ libprotobuf_c_CFLAGS = @libprotobuf_c_CFLAGS@ libprotobuf_c_LIBS = @libprotobuf_c_LIBS@ liburcu_CFLAGS = @liburcu_CFLAGS@ liburcu_LIBS = @liburcu_LIBS@ -liburcu_PKGCONFIG = @liburcu_PKGCONFIG@ libxdp_CFLAGS = @libxdp_CFLAGS@ libxdp_LIBS = @libxdp_LIBS@ libzscanner_SONAME = @libzscanner_SONAME@ diff --git a/src/libknot/xdp/bpf-user.h b/src/libknot/xdp/bpf-user.h index 37aac61..b76c9d6 100644 --- a/src/libknot/xdp/bpf-user.h +++ b/src/libknot/xdp/bpf-user.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2022 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -60,8 +60,10 @@ struct kxsk_umem { /*! The memory frames. */ struct umem_frame *frames; + /*! Size of RX and TX rings. */ + uint16_t ring_size; /*! The number of free frames (for TX). */ - uint32_t tx_free_count; + uint16_t tx_free_count; /*! Stack of indices of the free frames (for TX). */ uint16_t tx_free_indices[]; }; @@ -82,15 +84,15 @@ struct knot_xdp_socket { /*! If non-NULL, it's a mocked socket with this send function. 
*/ int (*send_mock)(struct knot_xdp_socket *, const knot_xdp_msg_t[], uint32_t, uint32_t *); - /*! The kernel has to be woken up by a syscall indication. */ - bool kernel_needs_wakeup; - /*! The limit of frame size. */ unsigned frame_limit; /*! Mapping of interface indices to VLAN tags. */ uint16_t *vlan_map; uint16_t vlan_map_max; + + /*! Enabled preferred busy polling. */ + bool busy_poll; }; /*! diff --git a/src/libknot/xdp/tcp.c b/src/libknot/xdp/tcp.c index eae73a9..d219db9 100644 --- a/src/libknot/xdp/tcp.c +++ b/src/libknot/xdp/tcp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -287,172 +287,162 @@ static void conn_update(knot_tcp_conn_t *conn, const knot_xdp_msg_t *msg) } _public_ -int knot_tcp_recv(knot_tcp_relay_t *relays, knot_xdp_msg_t msgs[], uint32_t msg_count, +int knot_tcp_recv(knot_tcp_relay_t *relay, knot_xdp_msg_t *msg, knot_tcp_table_t *tcp_table, knot_tcp_table_t *syn_table, knot_tcp_ignore_t ignore) { - if (msg_count == 0) { - return KNOT_EOK; - } - if (relays == NULL || msgs == NULL || tcp_table == NULL) { + if (relay == NULL || msg == NULL || tcp_table == NULL) { return KNOT_EINVAL; } - memset(relays, 0, msg_count * sizeof(*relays)); + memset(relay, 0, sizeof(*relay)); - knot_tcp_relay_t *relay = relays; int ret = KNOT_EOK; - for (knot_xdp_msg_t *msg = msgs; msg != msgs + msg_count && ret == KNOT_EOK; msg++) { - if (!(msg->flags & KNOT_XDP_MSG_TCP)) { - continue; - } + if (!(msg->flags & KNOT_XDP_MSG_TCP)) { + return KNOT_EOK; + } - uint64_t conn_hash = 0; - knot_tcp_conn_t **pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, - &conn_hash, tcp_table); - knot_tcp_conn_t *conn = *pconn; - bool seq_ack_match = check_seq_ack(msg, conn); - if (seq_ack_match) { - assert(conn->mss != 0); - conn_update(conn, msg); 
- - rem_align_pointers(conn, tcp_table); - rem_node(tcp_conn_node(conn)); - add_tail(tcp_table_timeout(tcp_table), tcp_conn_node(conn)); - - if (msg->flags & KNOT_XDP_MSG_ACK) { - conn->acked = msg->ackno; - knot_tcp_outbufs_ack(&conn->outbufs, msg->ackno, &tcp_table->outbufs_total); - } + uint64_t conn_hash = 0; + knot_tcp_conn_t **pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, + &conn_hash, tcp_table); + knot_tcp_conn_t *conn = *pconn; + bool seq_ack_match = check_seq_ack(msg, conn); + if (seq_ack_match) { + assert(conn->mss != 0); + conn_update(conn, msg); + + rem_align_pointers(conn, tcp_table); + rem_node(tcp_conn_node(conn)); + add_tail(tcp_table_timeout(tcp_table), tcp_conn_node(conn)); + + if (msg->flags & KNOT_XDP_MSG_ACK) { + conn->acked = msg->ackno; + knot_tcp_outbufs_ack(&conn->outbufs, msg->ackno, &tcp_table->outbufs_total); } + } - relay->msg = msg; - relay->conn = conn; + relay->msg = msg; + relay->conn = conn; - // process incoming data - if (seq_ack_match && (msg->flags & KNOT_XDP_MSG_ACK) && msg->payload.iov_len > 0) { - if (!(ignore & XDP_TCP_IGNORE_DATA_ACK)) { - relay->auto_answer = KNOT_XDP_MSG_ACK; - } - ret = knot_tcp_inbufs_upd(&conn->inbuf, msg->payload, false, - &relay->inbf, &tcp_table->inbufs_total); - if (ret != KNOT_EOK) { - break; - } - if (conn->inbuf.iov_len > 0 && tcp_table->next_ibuf == NULL) { - tcp_table->next_ibuf = conn; - } + // process incoming data + if (seq_ack_match && (msg->flags & KNOT_XDP_MSG_ACK) && msg->payload.iov_len > 0) { + if (!(ignore & XDP_TCP_IGNORE_DATA_ACK)) { + relay->auto_answer = KNOT_XDP_MSG_ACK; + } + ret = knot_tcp_inbufs_upd(&conn->inbuf, msg->payload, false, + &relay->inbf, &tcp_table->inbufs_total); + if (ret != KNOT_EOK) { + return ret; + } + if (conn->inbuf.iov_len > 0 && tcp_table->next_ibuf == NULL) { + tcp_table->next_ibuf = conn; } + } - // process TCP connection state - switch (msg->flags & (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK | - KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_RST)) { - case 
KNOT_XDP_MSG_SYN: - case (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK): - if (conn == NULL) { - bool synack = (msg->flags & KNOT_XDP_MSG_ACK); - - knot_tcp_table_t *add_table = tcp_table; - if (syn_table != NULL) { - if (synack) { - break; // creating conn based on SYN+ACK is only for kxdpgun, disallow in knotd - } - add_table = syn_table; - if (*tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table) != NULL) { - break; - } - } + // process TCP connection state + switch (msg->flags & (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK | + KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_RST)) { + case KNOT_XDP_MSG_SYN: + case (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK): + if (conn == NULL) { + bool synack = (msg->flags & KNOT_XDP_MSG_ACK); - ret = tcp_table_add(msg, conn_hash, add_table, &relay->conn); - if (ret == KNOT_EOK) { - relay->action = synack ? XDP_TCP_ESTABLISH : XDP_TCP_SYN; - if (!(ignore & XDP_TCP_IGNORE_ESTABLISH)) { - relay->auto_answer = synack ? KNOT_XDP_MSG_ACK : (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK); - } - - conn = relay->conn; - conn->state = synack ? XDP_TCP_NORMAL: XDP_TCP_ESTABLISHING; - conn->mss = MAX(msg->mss, 536); // minimal MSS, most importantly not zero! 
- conn->window_scale = msg->win_scale; - conn_update(conn, msg); - if (!synack) { - conn->acked = dnssec_random_uint32_t(); - conn->ackno = conn->acked; - } + knot_tcp_table_t *add_table = tcp_table; + if (syn_table != NULL) { + if (synack) { + break; // creating conn based on SYN+ACK is only for kxdpgun, disallow in knotd + } + add_table = syn_table; + if (*tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table) != NULL) { + break; } - } else { - relay->auto_answer = KNOT_XDP_MSG_ACK; } - break; - case KNOT_XDP_MSG_ACK: - if (!seq_ack_match) { - if (syn_table != NULL && msg->payload.iov_len == 0 && conn == NULL && - (pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table)) != NULL && - (conn = *pconn) != NULL && check_seq_ack(msg, conn)) { - // move conn from syn_table to tcp_table - tcp_table_remove(pconn, syn_table); - tcp_table_insert(conn, conn_hash, tcp_table); - relay->conn = conn; - relay->action = XDP_TCP_ESTABLISH; - conn->state = XDP_TCP_NORMAL; - conn_update(conn, msg); + + ret = tcp_table_add(msg, conn_hash, add_table, &relay->conn); + if (ret == KNOT_EOK) { + relay->action = synack ? XDP_TCP_ESTABLISH : XDP_TCP_SYN; + if (!(ignore & XDP_TCP_IGNORE_ESTABLISH)) { + relay->auto_answer = synack ? KNOT_XDP_MSG_ACK : (KNOT_XDP_MSG_SYN | KNOT_XDP_MSG_ACK); } - } else { - switch (conn->state) { - case XDP_TCP_NORMAL: - case XDP_TCP_CLOSING1: // just a mess, ignore - break; - case XDP_TCP_ESTABLISHING: - conn->state = XDP_TCP_NORMAL; - relay->action = XDP_TCP_ESTABLISH; - break; - case XDP_TCP_CLOSING2: - if (msg->payload.iov_len == 0) { // otherwise ignore close - tcp_table_remove(pconn, tcp_table); - relay->answer = XDP_TCP_FREE; - } - break; + + conn = relay->conn; + conn->state = synack ? XDP_TCP_NORMAL: XDP_TCP_ESTABLISHING; + conn->mss = MAX(msg->mss, 536); // minimal MSS, most importantly not zero! 
+ conn->window_scale = msg->win_scale; + conn_update(conn, msg); + if (!synack) { + conn->acked = dnssec_random_uint32_t(); + conn->ackno = conn->acked; } } - break; - case (KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK): - if (ignore & XDP_TCP_IGNORE_FIN) { - break; + } else { + relay->auto_answer = KNOT_XDP_MSG_ACK; + } + break; + case KNOT_XDP_MSG_ACK: + if (!seq_ack_match) { + if (syn_table != NULL && msg->payload.iov_len == 0 && conn == NULL && + (pconn = tcp_table_lookup(&msg->ip_from, &msg->ip_to, &conn_hash, syn_table)) != NULL && + (conn = *pconn) != NULL && check_seq_ack(msg, conn)) { + // move conn from syn_table to tcp_table + tcp_table_remove(pconn, syn_table); + tcp_table_insert(conn, conn_hash, tcp_table); + relay->conn = conn; + relay->action = XDP_TCP_ESTABLISH; + conn->state = XDP_TCP_NORMAL; + conn_update(conn, msg); } - if (!seq_ack_match) { - if (conn != NULL) { - relay->auto_answer = KNOT_XDP_MSG_RST; - relay->auto_seqno = msg->ackno; - } // else ignore. It would be better and possible, but no big value for the price of CPU. 
- } else { - if (conn->state == XDP_TCP_CLOSING1) { - relay->action = XDP_TCP_CLOSE; - relay->auto_answer = KNOT_XDP_MSG_ACK; - relay->answer = XDP_TCP_FREE; + } else { + switch (conn->state) { + case XDP_TCP_NORMAL: + case XDP_TCP_CLOSING1: // just a mess, ignore + break; + case XDP_TCP_ESTABLISHING: + conn->state = XDP_TCP_NORMAL; + relay->action = XDP_TCP_ESTABLISH; + break; + case XDP_TCP_CLOSING2: + if (msg->payload.iov_len == 0) { // otherwise ignore close tcp_table_remove(pconn, tcp_table); - } else if (msg->payload.iov_len == 0) { // otherwise ignore FIN - relay->action = XDP_TCP_CLOSE; - relay->auto_answer = KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK; - conn->state = XDP_TCP_CLOSING2; + relay->answer = XDP_TCP_FREE; } + break; } + } + break; + case (KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK): + if (ignore & XDP_TCP_IGNORE_FIN) { break; - case KNOT_XDP_MSG_RST: - if (conn != NULL && msg->seqno == conn->seqno) { - relay->action = XDP_TCP_RESET; - tcp_table_remove(pconn, tcp_table); - relay->answer = XDP_TCP_FREE; - } else if (conn != NULL) { + } + if (!seq_ack_match) { + if (conn != NULL) { + relay->auto_answer = KNOT_XDP_MSG_RST; + relay->auto_seqno = msg->ackno; + } // else ignore. It would be better and possible, but no big value for the price of CPU. 
+ } else { + if (conn->state == XDP_TCP_CLOSING1) { + relay->action = XDP_TCP_CLOSE; relay->auto_answer = KNOT_XDP_MSG_ACK; + relay->answer = XDP_TCP_FREE; + tcp_table_remove(pconn, tcp_table); + } else if (msg->payload.iov_len == 0) { // otherwise ignore FIN + relay->action = XDP_TCP_CLOSE; + relay->auto_answer = KNOT_XDP_MSG_FIN | KNOT_XDP_MSG_ACK; + conn->state = XDP_TCP_CLOSING2; } - break; - default: - break; } - - if (!knot_tcp_relay_empty(relay)) { - relay++; + break; + case KNOT_XDP_MSG_RST: + if (conn != NULL && msg->seqno == conn->seqno) { + relay->action = XDP_TCP_RESET; + tcp_table_remove(pconn, tcp_table); + relay->answer = XDP_TCP_FREE; + } else if (conn != NULL) { + relay->auto_answer = KNOT_XDP_MSG_ACK; } + break; + default: + break; } return ret; diff --git a/src/libknot/xdp/tcp.h b/src/libknot/xdp/tcp.h index 09fe652..39a30fd 100644 --- a/src/libknot/xdp/tcp.h +++ b/src/libknot/xdp/tcp.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -148,18 +148,19 @@ knot_tcp_table_t *knot_tcp_table_new(size_t size, knot_tcp_table_t *secret_share void knot_tcp_table_free(knot_tcp_table_t *table); /*! - * \brief Process received packets, prepare automatic responses (e.g. ACK), pick incoming data. + * \brief Process received packet, prepare automatic response (e.g. ACK), pick incoming data. * - * \param relays Out: relays to be filled with message/connection details. - * \param msgs Packets received by knot_xdp_recv(). - * \param msg_count Number of received packets. + * \param relay Out: relay to be filled with message/connection details. + * \param msg Packet received by knot_xdp_recv(). * \param tcp_table Table of TCP connections. * \param syn_table Optional: extra table for handling partially established connections. 
* \param ignore Ignore specific TCP packets indication. * + * \note resulting relay might be knot_tcp_relay_empty() + * * \return KNOT_E* */ -int knot_tcp_recv(knot_tcp_relay_t *relays, knot_xdp_msg_t msgs[], uint32_t msg_count, +int knot_tcp_recv(knot_tcp_relay_t *relay, knot_xdp_msg_t *msg, knot_tcp_table_t *tcp_table, knot_tcp_table_t *syn_table, knot_tcp_ignore_t ignore); diff --git a/src/libknot/xdp/xdp.c b/src/libknot/xdp/xdp.c index 8286884..132f5c4 100644 --- a/src/libknot/xdp/xdp.c +++ b/src/libknot/xdp/xdp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,63 +38,68 @@ #include "contrib/net.h" #define FRAME_SIZE 2048 - -#define FRAME_COUNT_TX 2048 -#define FRAME_COUNT_RX 2048 -#define FRAME_COUNT (FRAME_COUNT_TX + FRAME_COUNT_RX) - -#define RING_LEN_TX FRAME_COUNT_TX -#define RING_LEN_CQ FRAME_COUNT_TX -#define RING_LEN_RX FRAME_COUNT_RX -/* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size. */ -#define RING_LEN_FQ (2 * FRAME_COUNT_RX) - -#define ALLOC_RETRY_NUM 15 -#define ALLOC_RETRY_DELAY 20 // In nanoseconds. - -/* With recent compilers we statically check #defines for settings that - * get refused by AF_XDP drivers (in current versions, at least). */ -#if (__STDC_VERSION__ >= 201112L) -#define IS_POWER_OF_2(n) (((n) & (n - 1)) == 0) -_Static_assert((FRAME_SIZE == 4096 || FRAME_SIZE == 2048) - && IS_POWER_OF_2(RING_LEN_TX) && IS_POWER_OF_2(RING_LEN_RX) - && IS_POWER_OF_2(RING_LEN_CQ) && IS_POWER_OF_2(RING_LEN_FQ) - && FRAME_COUNT_TX <= (1 << 16) /* see tx_free_indices */ - , "Incorrect #define combination for AF_XDP."); -#endif +#define DEFAULT_RING_SIZE 2048 +#define RETRY_DELAY 20 // In nanoseconds. 
struct umem_frame { uint8_t bytes[FRAME_SIZE]; }; -static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames) +static bool valid_config(const knot_xdp_config_t *config) +{ + if (FRAME_SIZE != 2048 && FRAME_SIZE != 4096) { + return false; + } + + if (config == NULL) { + return true; + } + + if ((config->ring_size & (config->ring_size - 1)) != 0) { + return false; + } + + return true; +} + +static uint32_t ring_size(const knot_xdp_config_t *config) +{ + return config != NULL ? config->ring_size : DEFAULT_RING_SIZE; +} + +static int configure_xsk_umem(struct kxsk_umem **out_umem, uint32_t ring_size) { /* Allocate memory and call driver to create the UMEM. */ struct kxsk_umem *umem = calloc(1, offsetof(struct kxsk_umem, tx_free_indices) - + sizeof(umem->tx_free_indices[0]) * FRAME_COUNT_TX); + + sizeof(umem->tx_free_indices[0]) * ring_size); if (umem == NULL) { return KNOT_ENOMEM; } + umem->ring_size = ring_size; - size_t frame_count = FRAME_COUNT + (extra_frames ? FRAME_COUNT_RX : 0); + /* It's recommended that the FQ ring size >= HW RX ring size + AF_XDP RX ring size. + * However, the performance is better if FQ size == AF_XDP RX size. 
*/ + const uint32_t FQ_SIZE = umem->ring_size; + const uint32_t CQ_SIZE = umem->ring_size; + const uint32_t FRAMES = FQ_SIZE + CQ_SIZE; int ret = posix_memalign((void **)&umem->frames, getpagesize(), - FRAME_SIZE * frame_count); + FRAME_SIZE * FRAMES); if (ret != 0) { free(umem); return KNOT_ENOMEM; } - const struct xsk_umem_config config = { - .fill_size = RING_LEN_FQ, - .comp_size = RING_LEN_CQ, + const struct xsk_umem_config umem_config = { + .fill_size = FQ_SIZE, + .comp_size = CQ_SIZE, .frame_size = FRAME_SIZE, .frame_headroom = KNOT_XDP_PKT_ALIGNMENT, }; - ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * frame_count, - &umem->fq, &umem->cq, &config); + ret = xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * FRAMES, + &umem->fq, &umem->cq, &umem_config); if (ret != KNOT_EOK) { free(umem->frames); free(umem); @@ -103,23 +108,23 @@ static int configure_xsk_umem(struct kxsk_umem **out_umem, bool extra_frames) *out_umem = umem; /* Designate the starting chunk of buffers for TX, and put them onto the stack. */ - umem->tx_free_count = FRAME_COUNT_TX; - for (uint32_t i = 0; i < FRAME_COUNT_TX; ++i) { + umem->tx_free_count = CQ_SIZE; + for (uint32_t i = 0; i < CQ_SIZE; ++i) { umem->tx_free_indices[i] = i; } /* Designate the rest of buffers for RX, and pass them to the driver. 
*/ uint32_t idx = 0; - ret = xsk_ring_prod__reserve(&umem->fq, frame_count - FRAME_COUNT_TX, &idx); - if (ret != frame_count - FRAME_COUNT_TX) { + ret = xsk_ring_prod__reserve(&umem->fq, FQ_SIZE, &idx); + if (ret != FQ_SIZE) { assert(0); return KNOT_ERROR; } assert(idx == 0); - for (uint32_t i = FRAME_COUNT_TX; i < frame_count; ++i) { + for (uint32_t i = CQ_SIZE; i < CQ_SIZE + FQ_SIZE; ++i) { *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * FRAME_SIZE; } - xsk_ring_prod__submit(&umem->fq, frame_count - FRAME_COUNT_TX); + xsk_ring_prod__submit(&umem->fq, FQ_SIZE); return KNOT_EOK; } @@ -131,6 +136,33 @@ static void deconfigure_xsk_umem(struct kxsk_umem *umem) free(umem); } +static int enable_busypoll(int socket, unsigned timeout_us, unsigned budget) +{ +#if defined(SO_PREFER_BUSY_POLL) && defined(SO_BUSY_POLL_BUDGET) + int opt_val = 1; + if (setsockopt(socket, SOL_SOCKET, SO_PREFER_BUSY_POLL, + &opt_val, sizeof(opt_val)) != 0) { + return knot_map_errno(); + } + + opt_val = timeout_us; + if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL, + &opt_val, sizeof(opt_val)) != 0) { + return knot_map_errno(); + } + + opt_val = budget; + if (setsockopt(socket, SOL_SOCKET, SO_BUSY_POLL_BUDGET, + &opt_val, sizeof(opt_val)) != 0) { + return knot_map_errno(); + } + + return KNOT_EOK; +#else + return KNOT_ENOTSUP; +#endif +} + static int configure_xsk_socket(struct kxsk_umem *umem, const struct kxsk_iface *iface, knot_xdp_socket_t **out_sock, @@ -143,14 +175,14 @@ static int configure_xsk_socket(struct kxsk_umem *umem, xsk_info->iface = iface; xsk_info->umem = umem; - uint16_t bind_flags = 0; + uint16_t bind_flags = XDP_USE_NEED_WAKEUP; if (config != NULL && config->force_copy) { bind_flags |= XDP_COPY; } const struct xsk_socket_config sock_conf = { - .tx_size = RING_LEN_TX, - .rx_size = RING_LEN_RX, + .tx_size = umem->ring_size, + .rx_size = umem->ring_size, .libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, .bind_flags = bind_flags, }; @@ -163,6 +195,17 @@ static int 
configure_xsk_socket(struct kxsk_umem *umem, return ret; } + if (config != NULL && config->busy_poll_budget > 0) { + ret = enable_busypoll(xsk_socket__fd(xsk_info->xsk), + config->busy_poll_timeout, config->busy_poll_budget); + if (ret != KNOT_EOK) { + xsk_socket__delete(xsk_info->xsk); + free(xsk_info); + return ret; + } + xsk_info->busy_poll = true; + } + *out_sock = xsk_info; return KNOT_EOK; } @@ -172,7 +215,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue, knot_xdp_filter_flag_t flags, uint16_t udp_port, uint16_t quic_port, knot_xdp_load_bpf_t load_bpf, const knot_xdp_config_t *xdp_config) { - if (socket == NULL || if_name == NULL || + if (socket == NULL || if_name == NULL || !valid_config(xdp_config) || (udp_port == quic_port && (flags & KNOT_XDP_FILTER_UDP) && (flags & KNOT_XDP_FILTER_QUIC)) || (flags & (KNOT_XDP_FILTER_UDP | KNOT_XDP_FILTER_TCP | KNOT_XDP_FILTER_QUIC)) == 0) { return KNOT_EINVAL; @@ -187,7 +230,7 @@ int knot_xdp_init(knot_xdp_socket_t **socket, const char *if_name, int if_queue, /* Initialize shared packet_buffer for umem usage. */ struct kxsk_umem *umem = NULL; - ret = configure_xsk_umem(&umem, xdp_config->extra_frames); + ret = configure_xsk_umem(&umem, ring_size(xdp_config)); if (ret != KNOT_EOK) { kxsk_iface_free(iface); return ret; @@ -266,7 +309,7 @@ static void tx_free_relative(struct kxsk_umem *umem, uint64_t addr_relative) { /* The address may not point to *start* of buffer, but `/` solves that. 
*/ uint64_t index = addr_relative / FRAME_SIZE; - assert(index < FRAME_COUNT); + assert(index < umem->ring_size); umem->tx_free_indices[umem->tx_free_count++] = index; } @@ -285,7 +328,7 @@ void knot_xdp_send_prepare(knot_xdp_socket_t *socket) if (completed == 0) { return; } - assert(umem->tx_free_count + completed <= FRAME_COUNT_TX); + assert(umem->tx_free_count + completed <= umem->ring_size); for (uint32_t i = 0; i < completed; ++i) { uint64_t addr_relative = *xsk_ring_cons__comp_addr(cq, idx++); @@ -301,12 +344,13 @@ static struct umem_frame *alloc_tx_frame(knot_xdp_socket_t *socket) return malloc(sizeof(struct umem_frame)); } - const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY }; struct kxsk_umem *umem = socket->umem; - for (int i = 0; unlikely(umem->tx_free_count == 0); i++) { - if (i == ALLOC_RETRY_NUM) { - return NULL; + const struct timespec delay = { .tv_nsec = RETRY_DELAY }; + while (unlikely(umem->tx_free_count == 0)) { + if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) { + (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0, + MSG_DONTWAIT, NULL, 0); } nanosleep(&delay, NULL); knot_xdp_send_prepare(socket); @@ -381,9 +425,7 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[], } if (unlikely(socket->send_mock != NULL)) { int ret = socket->send_mock(socket, msgs, count, sent); - for (uint32_t i = 0; i < count; ++i) { - free_unsent(socket, &msgs[i]); - } + knot_xdp_send_free(socket, msgs, count); return ret; } @@ -393,12 +435,13 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[], * and the API doesn't allow "cancelling reservations". * Therefore we handle `socket->tx.cached_prod` by hand. */ - if (xsk_prod_nb_free(&socket->tx, count) < count) { - /* This situation was sometimes observed in the emulated XDP mode. 
*/ - for (uint32_t i = 0; i < count; ++i) { - free_unsent(socket, &msgs[i]); + const struct timespec delay = { .tv_nsec = RETRY_DELAY }; + while (unlikely(xsk_prod_nb_free(&socket->tx, count) < count)) { + if (socket->busy_poll || xsk_ring_prod__needs_wakeup(&socket->tx)) { + (void)sendto(xsk_socket__fd(socket->xsk), NULL, 0, + MSG_DONTWAIT, NULL, 0); } - return KNOT_ENOBUFS; + nanosleep(&delay, NULL); } uint32_t idx = socket->tx.cached_prod; @@ -425,7 +468,6 @@ int knot_xdp_send(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[], assert(*sent <= count); socket->tx.cached_prod = idx; xsk_ring_prod__submit(&socket->tx, *sent); - socket->kernel_needs_wakeup = true; return KNOT_EOK; } @@ -446,34 +488,19 @@ int knot_xdp_send_finish(knot_xdp_socket_t *socket) return KNOT_EINVAL; } - /* Trigger sending queued packets. */ - if (!socket->kernel_needs_wakeup) { + if (!socket->busy_poll && !xsk_ring_prod__needs_wakeup(&socket->tx)) { return KNOT_EOK; } int ret = sendto(xsk_socket__fd(socket->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - const bool is_ok = (ret >= 0); - // List of "safe" errors taken from - // https://github.com/torvalds/linux/blame/master/samples/bpf/xdpsock_user.c - const bool is_again = !is_ok && (errno == ENOBUFS || errno == EAGAIN - || errno == EBUSY || errno == ENETDOWN); - // Some of the !is_ok cases are a little unclear - what to do about the syscall, - // including how caller of _sendmsg_finish() should react. 
- if (is_ok || !is_again) { - socket->kernel_needs_wakeup = false; - } - if (is_again) { - return KNOT_EAGAIN; - } else if (is_ok) { + if (ret >= 0) { return KNOT_EOK; + } else if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || + errno == ENETDOWN) { + return KNOT_EAGAIN; } else { return -errno; } - /* This syscall might be avoided with a newer kernel feature (>= 5.4): - https://www.kernel.org/doc/html/latest/networking/af_xdp.html#xdp-use-need-wakeup-bind-flag - Unfortunately it's not easy to continue supporting older kernels - when using this feature on newer ones. - */ } _public_ @@ -518,7 +545,7 @@ int knot_xdp_recv(knot_xdp_socket_t *socket, knot_xdp_msg_t msgs[], static uint8_t *msg_uframe_ptr(const knot_xdp_msg_t *msg) { - return NULL + ((msg->payload.iov_base - NULL) & ~(FRAME_SIZE - 1)); + return (uint8_t *)((uintptr_t)msg->payload.iov_base & ~(FRAME_SIZE - 1)); } _public_ @@ -529,30 +556,32 @@ void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[] return; } - const struct timespec delay = { .tv_nsec = ALLOC_RETRY_DELAY }; - struct kxsk_umem *const umem = socket->umem; struct xsk_ring_prod *const fq = &umem->fq; uint32_t idx = 0; - uint32_t reserved = xsk_ring_prod__reserve(fq, count, &idx); - for (int i = 0; unlikely(reserved < count); i++) { - if (i == ALLOC_RETRY_NUM) { - return; + const struct timespec delay = { .tv_nsec = RETRY_DELAY }; + while (unlikely(xsk_ring_prod__reserve(fq, count, &idx) != count)) { + if (socket->busy_poll || xsk_ring_prod__needs_wakeup(fq)) { + (void)recvfrom(xsk_socket__fd(socket->xsk), NULL, 0, + MSG_DONTWAIT, NULL, NULL); } nanosleep(&delay, NULL); - reserved = xsk_ring_prod__reserve(fq, count, &idx); } - for (uint32_t i = 0; i < reserved; ++i) { + for (uint32_t i = 0; i < count; ++i) { uint8_t *uframe_p = msg_uframe_ptr(&msgs[i]); uint64_t offset = uframe_p - umem->frames->bytes; *xsk_ring_prod__fill_addr(fq, idx++) = offset; } - xsk_ring_prod__submit(fq, reserved); + 
xsk_ring_prod__submit(fq, count); + // recvfrom() here slightly worsens the performance, poll is called later anyway. } +// The number of busy frames +#define RING_BUSY(ring) ((*(ring)->producer - *(ring)->consumer) & (ring)->mask) + _public_ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file) { @@ -560,10 +589,6 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file) return; } - // The number of busy frames - #define RING_BUSY(ring) \ - ((*(ring)->producer - *(ring)->consumer) & (ring)->mask) - #define RING_PRINFO(name, ring) \ fprintf(file, "Ring %s: size %4d, busy %4d (prod %4d, cons %4d)\n", \ name, (unsigned)(ring)->size, \ @@ -571,11 +596,11 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file) (unsigned)*(ring)->producer, (unsigned)*(ring)->consumer) const int rx_busyf = RING_BUSY(&socket->umem->fq) + RING_BUSY(&socket->rx); - fprintf(file, "\nLOST RX frames: %4d", (int)(FRAME_COUNT_RX - rx_busyf)); + fprintf(file, "\nLOST RX frames: %4d", (int)(socket->umem->ring_size - rx_busyf)); const int tx_busyf = RING_BUSY(&socket->umem->cq) + RING_BUSY(&socket->tx); const int tx_freef = socket->umem->tx_free_count; - fprintf(file, "\nLOST TX frames: %4d\n", (int)(FRAME_COUNT_TX - tx_busyf - tx_freef)); + fprintf(file, "\nLOST TX frames: %4d\n", (int)(socket->umem->ring_size - tx_busyf - tx_freef)); RING_PRINFO("FQ", &socket->umem->fq); RING_PRINFO("RX", &socket->rx); @@ -583,3 +608,39 @@ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file) RING_PRINFO("CQ", &socket->umem->cq); fprintf(file, "TX free frames: %4d\n", tx_freef); } + +_public_ +int knot_xdp_socket_stats(knot_xdp_socket_t *socket, knot_xdp_stats_t *stats) +{ + if (socket == NULL || stats == NULL) { + return KNOT_EINVAL; + } + + memset(stats, 0, sizeof(*stats)); + + stats->if_name = socket->iface->if_name; + stats->if_index = socket->iface->if_index; + stats->if_queue = socket->iface->if_queue; + + struct xdp_statistics xdp_stats; + 
socklen_t optlen = sizeof(xdp_stats); + + int fd = knot_xdp_socket_fd(socket); + int ret = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &xdp_stats, &optlen); + if (ret != 0) { + return knot_map_errno(); + } else if (optlen != sizeof(xdp_stats)) { + return KNOT_EINVAL; + } + + size_t common_size = MIN(sizeof(xdp_stats), sizeof(stats->socket)); + memcpy(&stats->socket, &xdp_stats, common_size); + + stats->rings.tx_busy = socket->umem->ring_size - socket->umem->tx_free_count; + stats->rings.fq_fill = RING_BUSY(&socket->umem->fq); + stats->rings.rx_fill = RING_BUSY(&socket->rx); + stats->rings.tx_fill = RING_BUSY(&socket->tx); + stats->rings.cq_fill = RING_BUSY(&socket->umem->cq); + + return KNOT_EOK; +} diff --git a/src/libknot/xdp/xdp.h b/src/libknot/xdp/xdp.h index 6c8bb1e..5944d44 100644 --- a/src/libknot/xdp/xdp.h +++ b/src/libknot/xdp/xdp.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -53,14 +53,54 @@ typedef struct knot_xdp_socket knot_xdp_socket_t; /*! \brief Configuration of XDP socket. */ struct knot_xdp_config { - bool force_generic; /*!< Use generic XDP mode (avoid driver/hadrware implementation). */ - bool force_copy; /*!< Force copying packet data between kernel and user-space (avoid zero-copy). */ - bool extra_frames; /*!< Extra FQ frames. */ + uint16_t ring_size; /*!< Size of RX and TX rings (must be power of 2). */ + bool force_generic; /*!< Use generic XDP mode (avoid driver/hardware implementation). */ + bool force_copy; /*!< Force copying packet data between kernel and user-space (avoid zero-copy). */ + unsigned busy_poll_timeout; /*!< Preferred busy poll timeout (in microseconds). */ + unsigned busy_poll_budget; /*!< Preferred busy poll budget (0 means disabled). */ }; /*! \brief Configuration of XDP socket.
*/ typedef struct knot_xdp_config knot_xdp_config_t; +/*! \brief Various statistics of an XDP socket (optimally kernel >=5.9). */ +typedef struct { + /*! Interface name. */ + const char *if_name; + /*! Interface name index (derived from ifname). */ + int if_index; + /*! Network card queue id. */ + unsigned if_queue; + /*! Counters (xdp_statistics) retrieved from the kernel via XDP_STATISTICS. */ + struct { + /*! Dropped for other reasons. */ + uint64_t rx_dropped; + /*! Dropped due to invalid descriptor. */ + uint64_t rx_invalid; + /*! Dropped due to invalid descriptor. */ + uint64_t tx_invalid; + /*! Dropped due to rx ring being full. */ + uint64_t rx_full; + /*! Failed to retrieve item from fill ring. */ + uint64_t fq_empty; + /*! Failed to retrieve item from tx ring. */ + uint64_t tx_empty; + } socket; + /*! States of rings of the XDP socket. */ + struct { + /*! Busy TX buffers. */ + uint16_t tx_busy; + /*! Free buffers to consume from FQ ring. */ + uint16_t fq_fill; + /*! Pending buffers in RX ring. */ + uint16_t rx_fill; + /*! Pending buffers in TX ring. */ + uint16_t tx_fill; + /*! Pending buffers in CQ ring. */ + uint16_t cq_fill; + } rings; +} knot_xdp_stats_t; + /*! * \brief Initialize XDP socket. * @@ -196,4 +236,14 @@ void knot_xdp_recv_finish(knot_xdp_socket_t *socket, const knot_xdp_msg_t msgs[] */ void knot_xdp_socket_info(const knot_xdp_socket_t *socket, FILE *file); +/*! + * \brief Gets various statistics of the XDP socket. + * + * \param socket XDP socket. + * \param stats Output structure. + * + * \return KNOT_E* + */ +int knot_xdp_socket_stats(knot_xdp_socket_t *socket, knot_xdp_stats_t *stats); + /*! @} */ |