summaryrefslogtreecommitdiffstats
path: root/net/xdp
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /net/xdp
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/xdp')
-rw-r--r--net/xdp/Kconfig16
-rw-r--r--net/xdp/Makefile4
-rw-r--r--net/xdp/xdp_umem.c259
-rw-r--r--net/xdp/xdp_umem.h15
-rw-r--r--net/xdp/xsk.c1722
-rw-r--r--net/xdp/xsk.h48
-rw-r--r--net/xdp/xsk_buff_pool.c696
-rw-r--r--net/xdp/xsk_diag.c214
-rw-r--r--net/xdp/xsk_queue.c66
-rw-r--r--net/xdp/xsk_queue.h462
-rw-r--r--net/xdp/xskmap.c281
11 files changed, 3783 insertions, 0 deletions
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 0000000000..71af2febe7
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XDP_SOCKETS
+ bool "XDP sockets"
+ depends on BPF_SYSCALL
+ default n
+ help
+ XDP sockets allows a channel between XDP programs and
+ userspace applications.
+
+config XDP_SOCKETS_DIAG
+ tristate "XDP sockets: monitoring interface"
+ depends on XDP_SOCKETS
+ default n
+ help
+ Support for PF_XDP sockets monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 0000000000..30cdc4315f
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
+obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 0000000000..06cead2b8e
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h>
+
+#include "xdp_umem.h"
+#include "xsk_queue.h"
+
+static DEFINE_IDA(umem_ida);
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+ unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
+
+ kvfree(umem->pgs);
+ umem->pgs = NULL;
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+ if (umem->user) {
+ atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+ free_uid(umem->user);
+ }
+}
+
+static void xdp_umem_addr_unmap(struct xdp_umem *umem)
+{
+ vunmap(umem->addrs);
+ umem->addrs = NULL;
+}
+
+static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
+ u32 nr_pages)
+{
+ umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!umem->addrs)
+ return -ENOMEM;
+ return 0;
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+ umem->zc = false;
+ ida_free(&umem_ida, umem->id);
+
+ xdp_umem_addr_unmap(umem);
+ xdp_umem_unpin_pages(umem);
+
+ xdp_umem_unaccount_pages(umem);
+ kfree(umem);
+}
+
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+ struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
+
+ xdp_umem_release(umem);
+}
+
+void xdp_get_umem(struct xdp_umem *umem)
+{
+ refcount_inc(&umem->users);
+}
+
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
+{
+ if (!umem)
+ return;
+
+ if (refcount_dec_and_test(&umem->users)) {
+ if (defer_cleanup) {
+ INIT_WORK(&umem->work, xdp_umem_release_deferred);
+ schedule_work(&umem->work);
+ } else {
+ xdp_umem_release(umem);
+ }
+ }
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
+{
+ unsigned int gup_flags = FOLL_WRITE;
+ long npgs;
+ int err;
+
+ umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN);
+ if (!umem->pgs)
+ return -ENOMEM;
+
+ mmap_read_lock(current->mm);
+ npgs = pin_user_pages(address, umem->npgs,
+ gup_flags | FOLL_LONGTERM, &umem->pgs[0]);
+ mmap_read_unlock(current->mm);
+
+ if (npgs != umem->npgs) {
+ if (npgs >= 0) {
+ umem->npgs = npgs;
+ err = -ENOMEM;
+ goto out_pin;
+ }
+ err = npgs;
+ goto out_pgs;
+ }
+ return 0;
+
+out_pin:
+ xdp_umem_unpin_pages(umem);
+out_pgs:
+ kvfree(umem->pgs);
+ umem->pgs = NULL;
+ return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+ unsigned long lock_limit, new_npgs, old_npgs;
+
+ if (capable(CAP_IPC_LOCK))
+ return 0;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ umem->user = get_uid(current_user());
+
+ do {
+ old_npgs = atomic_long_read(&umem->user->locked_vm);
+ new_npgs = old_npgs + umem->npgs;
+ if (new_npgs > lock_limit) {
+ free_uid(umem->user);
+ umem->user = NULL;
+ return -ENOBUFS;
+ }
+ } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+ new_npgs) != old_npgs);
+ return 0;
+}
+
+static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+ bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+ u64 addr = mr->addr, size = mr->len;
+ u32 chunks_rem, npgs_rem;
+ u64 chunks, npgs;
+ int err;
+
+ if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
+ /* Strictly speaking we could support this, if:
+ * - huge pages, or*
+ * - using an IOMMU, or
+ * - making sure the memory area is consecutive
+ * but for now, we simply say "computer says no".
+ */
+ return -EINVAL;
+ }
+
+ if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+ return -EINVAL;
+
+ if (!unaligned_chunks && !is_power_of_2(chunk_size))
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(addr)) {
+ /* Memory area has to be page size aligned. For
+ * simplicity, this might change.
+ */
+ return -EINVAL;
+ }
+
+ if ((addr + size) < addr)
+ return -EINVAL;
+
+ npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem);
+ if (npgs_rem)
+ npgs++;
+ if (npgs > U32_MAX)
+ return -EINVAL;
+
+ chunks = div_u64_rem(size, chunk_size, &chunks_rem);
+ if (!chunks || chunks > U32_MAX)
+ return -EINVAL;
+
+ if (!unaligned_chunks && chunks_rem)
+ return -EINVAL;
+
+ if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
+ return -EINVAL;
+
+ umem->size = size;
+ umem->headroom = headroom;
+ umem->chunk_size = chunk_size;
+ umem->chunks = chunks;
+ umem->npgs = npgs;
+ umem->pgs = NULL;
+ umem->user = NULL;
+ umem->flags = mr->flags;
+
+ INIT_LIST_HEAD(&umem->xsk_dma_list);
+ refcount_set(&umem->users, 1);
+
+ err = xdp_umem_account_pages(umem);
+ if (err)
+ return err;
+
+ err = xdp_umem_pin_pages(umem, (unsigned long)addr);
+ if (err)
+ goto out_account;
+
+ err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs);
+ if (err)
+ goto out_unpin;
+
+ return 0;
+
+out_unpin:
+ xdp_umem_unpin_pages(umem);
+out_account:
+ xdp_umem_unaccount_pages(umem);
+ return err;
+}
+
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
+{
+ struct xdp_umem *umem;
+ int err;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ err = ida_alloc(&umem_ida, GFP_KERNEL);
+ if (err < 0) {
+ kfree(umem);
+ return ERR_PTR(err);
+ }
+ umem->id = err;
+
+ err = xdp_umem_reg(umem, mr);
+ if (err) {
+ ida_free(&umem_ida, umem->id);
+ kfree(umem);
+ return ERR_PTR(err);
+ }
+
+ return umem;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 0000000000..aa9fe27804
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <net/xdp_sock_drv.h>
+
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup);
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 0000000000..d849dc04a3
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,1722 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ * Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <net/xdp_sock_drv.h>
+#include <net/busy_poll.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
+
+#define TX_BATCH_SIZE 32
+
+static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
+
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
+{
+ if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
+ return;
+
+ pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
+{
+ struct xdp_sock *xs;
+
+ if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ pool->cached_need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
+{
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
+ return;
+
+ pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
+{
+ struct xdp_sock *xs;
+
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
+{
+ return pool->uses_need_wakeup;
+}
+EXPORT_SYMBOL(xsk_uses_need_wakeup);
+
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ return dev->_rx[queue_id].pool;
+ if (queue_id < dev->real_num_tx_queues)
+ return dev->_tx[queue_id].pool;
+
+ return NULL;
+}
+EXPORT_SYMBOL(xsk_get_pool_from_qid);
+
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+{
+ if (queue_id < dev->num_rx_queues)
+ dev->_rx[queue_id].pool = NULL;
+ if (queue_id < dev->num_tx_queues)
+ dev->_tx[queue_id].pool = NULL;
+}
+
+/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id)
+{
+ if (queue_id >= max_t(unsigned int,
+ dev->real_num_rx_queues,
+ dev->real_num_tx_queues))
+ return -EINVAL;
+
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].pool = pool;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].pool = pool;
+
+ return 0;
+}
+
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
+ u32 flags)
+{
+ u64 addr;
+ int err;
+
+ addr = xp_get_handle(xskb);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
+ if (err) {
+ xs->rx_queue_full++;
+ return err;
+ }
+
+ xp_release(xskb);
+ return 0;
+}
+
+static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ u32 frags = xdp_buff_has_frags(xdp);
+ struct xdp_buff_xsk *pos, *tmp;
+ struct list_head *xskb_list;
+ u32 contd = 0;
+ int err;
+
+ if (frags)
+ contd = XDP_PKT_CONTD;
+
+ err = __xsk_rcv_zc(xs, xskb, len, contd);
+ if (err)
+ goto err;
+ if (likely(!frags))
+ return 0;
+
+ xskb_list = &xskb->pool->xskb_list;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+ if (list_is_singular(xskb_list))
+ contd = 0;
+ len = pos->xdp.data_end - pos->xdp.data;
+ err = __xsk_rcv_zc(xs, pos, len, contd);
+ if (err)
+ goto err;
+ list_del(&pos->xskb_list_node);
+ }
+
+ return 0;
+err:
+ xsk_buff_free(xdp);
+ return err;
+}
+
+static void *xsk_copy_xdp_start(struct xdp_buff *from)
+{
+ if (unlikely(xdp_data_meta_unsupported(from)))
+ return from->data;
+ else
+ return from->data_meta;
+}
+
+static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
+ u32 *from_len, skb_frag_t **frag, u32 rem)
+{
+ u32 copied = 0;
+
+ while (1) {
+ u32 copy_len = min_t(u32, *from_len, to_len);
+
+ memcpy(to, *from, copy_len);
+ copied += copy_len;
+ if (rem == copied)
+ return copied;
+
+ if (*from_len == copy_len) {
+ *from = skb_frag_address(*frag);
+ *from_len = skb_frag_size((*frag)++);
+ } else {
+ *from += copy_len;
+ *from_len -= copy_len;
+ }
+ if (to_len == copy_len)
+ return copied;
+
+ to_len -= copy_len;
+ to += copy_len;
+ }
+}
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
+ void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
+ u32 from_len, meta_len, rem, num_desc;
+ struct xdp_buff_xsk *xskb;
+ struct xdp_buff *xsk_xdp;
+ skb_frag_t *frag;
+
+ from_len = xdp->data_end - copy_from;
+ meta_len = xdp->data - copy_from;
+ rem = len + meta_len;
+
+ if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
+ int err;
+
+ xsk_xdp = xsk_buff_alloc(xs->pool);
+ if (!xsk_xdp) {
+ xs->rx_dropped++;
+ return -ENOMEM;
+ }
+ memcpy(xsk_xdp->data - meta_len, copy_from, rem);
+ xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+ err = __xsk_rcv_zc(xs, xskb, len, 0);
+ if (err) {
+ xsk_buff_free(xsk_xdp);
+ return err;
+ }
+
+ return 0;
+ }
+
+ num_desc = (len - 1) / frame_size + 1;
+
+ if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
+ xs->rx_dropped++;
+ return -ENOMEM;
+ }
+ if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
+ xs->rx_queue_full++;
+ return -ENOBUFS;
+ }
+
+ if (xdp_buff_has_frags(xdp)) {
+ struct skb_shared_info *sinfo;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ frag = &sinfo->frags[0];
+ }
+
+ do {
+ u32 to_len = frame_size + meta_len;
+ u32 copied;
+
+ xsk_xdp = xsk_buff_alloc(xs->pool);
+ copy_to = xsk_xdp->data - meta_len;
+
+ copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
+ rem -= copied;
+
+ xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+ __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
+ meta_len = 0;
+ } while (rem);
+
+ return 0;
+}
+
+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+ if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+ return false;
+
+ return true;
+}
+
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+ if (READ_ONCE(xs->state) == XSK_BOUND) {
+ /* Matches smp_wmb() in bind(). */
+ smp_rmb();
+ return true;
+ }
+ return false;
+}
+
+static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ if (!xsk_is_bound(xs))
+ return -ENXIO;
+
+ if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+ return -EINVAL;
+
+ if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ sk_mark_napi_id_once_xdp(&xs->sk, xdp);
+ return 0;
+}
+
+static void xsk_flush(struct xdp_sock *xs)
+{
+ xskq_prod_submit(xs->rx);
+ __xskq_cons_release(xs->pool->fq);
+ sock_def_readable(&xs->sk);
+}
+
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ u32 len = xdp_get_buff_len(xdp);
+ int err;
+
+ spin_lock_bh(&xs->rx_lock);
+ err = xsk_rcv_check(xs, xdp, len);
+ if (!err) {
+ err = __xsk_rcv(xs, xdp, len);
+ xsk_flush(xs);
+ }
+ spin_unlock_bh(&xs->rx_lock);
+ return err;
+}
+
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ u32 len = xdp_get_buff_len(xdp);
+ int err;
+
+ err = xsk_rcv_check(xs, xdp, len);
+ if (err)
+ return err;
+
+ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
+ len = xdp->data_end - xdp->data;
+ return xsk_rcv_zc(xs, xdp, len);
+ }
+
+ err = __xsk_rcv(xs, xdp, len);
+ if (!err)
+ xdp_return_buff(xdp);
+ return err;
+}
+
+int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ int err;
+
+ err = xsk_rcv(xs, xdp);
+ if (err)
+ return err;
+
+ if (!xs->flush_node.prev)
+ list_add(&xs->flush_node, flush_list);
+
+ return 0;
+}
+
+void __xsk_map_flush(void)
+{
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ struct xdp_sock *xs, *tmp;
+
+ list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+ xsk_flush(xs);
+ __list_del_clearprev(&xs->flush_node);
+ }
+}
+
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
+{
+ xskq_prod_submit_n(pool->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_tx_completed);
+
+void xsk_tx_release(struct xsk_buff_pool *pool)
+{
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ __xskq_cons_release(xs->tx);
+ if (xsk_tx_writeable(xs))
+ xs->sk.sk_write_space(&xs->sk);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_tx_release);
+
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
+{
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
+ if (xskq_has_descs(xs->tx))
+ xskq_cons_release(xs->tx);
+ continue;
+ }
+
+ /* This is the backpressure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (xskq_prod_reserve_addr(pool->cq, desc->addr))
+ goto out;
+
+ xskq_cons_release(xs->tx);
+ rcu_read_unlock();
+ return true;
+ }
+
+out:
+ rcu_read_unlock();
+ return false;
+}
+EXPORT_SYMBOL(xsk_tx_peek_desc);
+
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
+{
+ struct xdp_desc *descs = pool->tx_descs;
+ u32 nb_pkts = 0;
+
+ while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
+ nb_pkts++;
+
+ xsk_tx_release(pool);
+ return nb_pkts;
+}
+
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
+{
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ if (!list_is_singular(&pool->xsk_tx_list)) {
+ /* Fallback to the non-batched version */
+ rcu_read_unlock();
+ return xsk_tx_peek_release_fallback(pool, nb_pkts);
+ }
+
+ xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
+ if (!xs) {
+ nb_pkts = 0;
+ goto out;
+ }
+
+ nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
+
+ /* This is the backpressure mechanism for the Tx path. Try to
+ * reserve space in the completion queue for all packets, but
+ * if there are fewer slots available, just process that many
+ * packets. This avoids having to implement any buffering in
+ * the Tx path.
+ */
+ nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
+ if (!nb_pkts)
+ goto out;
+
+ nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
+ if (!nb_pkts) {
+ xs->tx->queue_empty_descs++;
+ goto out;
+ }
+
+ __xskq_cons_release(xs->tx);
+ xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+ xs->sk.sk_write_space(&xs->sk);
+
+out:
+ rcu_read_unlock();
+ return nb_pkts;
+}
+EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
+
+static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
+{
+ struct net_device *dev = xs->dev;
+
+ return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
+}
+
+static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+
+ return ret;
+}
+
+static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_submit_n(xs->pool->cq, n);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_cancel_n(xs->pool->cq, n);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+ return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+}
+
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+ xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
+ sock_wfree(skb);
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb)
+{
+ long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+
+ skb_shinfo(skb)->destructor_arg = (void *)num;
+}
+
+static void xsk_consume_skb(struct sk_buff *skb)
+{
+ struct xdp_sock *xs = xdp_sk(skb->sk);
+
+ skb->destructor = sock_wfree;
+ xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
+ /* Free skb without triggering the perf drop trace */
+ consume_skb(skb);
+ xs->skb = NULL;
+}
+
+static void xsk_drop_skb(struct sk_buff *skb)
+{
+ xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
+ xsk_consume_skb(skb);
+}
+
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct xsk_buff_pool *pool = xs->pool;
+ u32 hr, len, ts, offset, copy, copied;
+ struct sk_buff *skb = xs->skb;
+ struct page *page;
+ void *buffer;
+ int err, i;
+ u64 addr;
+
+ if (!skb) {
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+ skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
+
+ skb_reserve(skb, hr);
+ }
+
+ addr = desc->addr;
+ len = desc->len;
+ ts = pool->unaligned ? len : pool->chunk_size;
+
+ buffer = xsk_buff_raw_get_data(pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - pool->addrs;
+
+ for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
+ if (unlikely(i >= MAX_SKB_FRAGS))
+ return ERR_PTR(-EOVERFLOW);
+
+ page = pool->umem->pgs[addr >> PAGE_SHIFT];
+ get_page(page);
+
+ copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += ts;
+
+ refcount_add(ts, &xs->sk.sk_wmem_alloc);
+
+ return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct net_device *dev = xs->dev;
+ struct sk_buff *skb = xs->skb;
+ int err;
+
+ if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+ skb = xsk_build_skb_zerocopy(xs, desc);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto free_err;
+ }
+ } else {
+ u32 hr, tr, len;
+ void *buffer;
+
+ buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+ len = desc->len;
+
+ if (!skb) {
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ tr = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+ if (unlikely(!skb))
+ goto free_err;
+
+ skb_reserve(skb, hr);
+ skb_put(skb, len);
+
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ goto free_err;
+ }
+ } else {
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page;
+ u8 *vaddr;
+
+ if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
+ err = -EOVERFLOW;
+ goto free_err;
+ }
+
+ page = alloc_page(xs->sk.sk_allocation);
+ if (unlikely(!page)) {
+ err = -EAGAIN;
+ goto free_err;
+ }
+
+ vaddr = kmap_local_page(page);
+ memcpy(vaddr, buffer, len);
+ kunmap_local(vaddr);
+
+ skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
+ }
+ }
+
+ skb->dev = dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = READ_ONCE(xs->sk.sk_mark);
+ skb->destructor = xsk_destruct_skb;
+ xsk_set_destructor_arg(skb);
+
+ return skb;
+
+free_err:
+ if (err == -EOVERFLOW) {
+ /* Drop the packet */
+ xsk_set_destructor_arg(xs->skb);
+ xsk_drop_skb(xs->skb);
+ xskq_cons_release(xs->tx);
+ } else {
+ /* Let application retry */
+ xsk_cq_cancel_locked(xs, 1);
+ }
+
+ return ERR_PTR(err);
+}
+
+static int __xsk_generic_xmit(struct sock *sk)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+ u32 max_batch = TX_BATCH_SIZE;
+ bool sent_frame = false;
+ struct xdp_desc desc;
+ struct sk_buff *skb;
+ int err = 0;
+
+ mutex_lock(&xs->mutex);
+
+ /* Since we dropped the RCU read lock, the socket state might have changed. */
+ if (unlikely(!xsk_is_bound(xs))) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ if (xs->queue_id >= xs->dev->real_num_tx_queues)
+ goto out;
+
+ while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
+ if (max_batch-- == 0) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /* This is the backpressure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (xsk_cq_reserve_addr_locked(xs, desc.addr))
+ goto out;
+
+ skb = xsk_build_skb(xs, &desc);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ if (err != -EOVERFLOW)
+ goto out;
+ err = 0;
+ continue;
+ }
+
+ xskq_cons_release(xs->tx);
+
+ if (xp_mb_desc(&desc)) {
+ xs->skb = skb;
+ continue;
+ }
+
+ err = __dev_direct_xmit(skb, xs->queue_id);
+ if (err == NETDEV_TX_BUSY) {
+ /* Tell user-space to retry the send */
+ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
+ xsk_consume_skb(skb);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /* Ignore NET_XMIT_CN as packet might have been sent */
+ if (err == NET_XMIT_DROP) {
+ /* SKB completed but not sent */
+ err = -EBUSY;
+ xs->skb = NULL;
+ goto out;
+ }
+
+ sent_frame = true;
+ xs->skb = NULL;
+ }
+
+ if (xskq_has_descs(xs->tx)) {
+ if (xs->skb)
+ xsk_drop_skb(xs->skb);
+ xskq_cons_release(xs->tx);
+ }
+
+out:
+ if (sent_frame)
+ if (xsk_tx_writeable(xs))
+ sk->sk_write_space(sk);
+
+ mutex_unlock(&xs->mutex);
+ return err;
+}
+
+static int xsk_generic_xmit(struct sock *sk)
+{
+ int ret;
+
+ /* Drop the RCU lock since the SKB path might sleep. */
+ rcu_read_unlock();
+ ret = __xsk_generic_xmit(sk);
+ /* Reaquire RCU lock before going into common code. */
+ rcu_read_lock();
+
+ return ret;
+}
+
+static bool xsk_no_wakeup(struct sock *sk)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* Prefer busy-polling, skip the wakeup. */
+ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
+ READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
+#else
+ return false;
+#endif
+}
+
+static int xsk_check_common(struct xdp_sock *xs)
+{
+ if (unlikely(!xsk_is_bound(xs)))
+ return -ENXIO;
+ if (unlikely(!(xs->dev->flags & IFF_UP)))
+ return -ENETDOWN;
+
+ return 0;
+}
+
+static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+ bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct xsk_buff_pool *pool;
+ int err;
+
+ err = xsk_check_common(xs);
+ if (err)
+ return err;
+ if (unlikely(need_wait))
+ return -EOPNOTSUPP;
+ if (unlikely(!xs->tx))
+ return -ENOBUFS;
+
+ if (sk_can_busy_loop(sk)) {
+ if (xs->zc)
+ __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
+ sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+ }
+
+ if (xs->zc && xsk_no_wakeup(sk))
+ return 0;
+
+ pool = xs->pool;
+ if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
+ if (xs->zc)
+ return xsk_wakeup(xs, XDP_WAKEUP_TX);
+ return xsk_generic_xmit(sk);
+ }
+ return 0;
+}
+
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __xsk_sendmsg(sock, m, total_len);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+ bool need_wait = !(flags & MSG_DONTWAIT);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ int err;
+
+ err = xsk_check_common(xs);
+ if (err)
+ return err;
+ if (unlikely(!xs->rx))
+ return -ENOBUFS;
+ if (unlikely(need_wait))
+ return -EOPNOTSUPP;
+
+ if (sk_can_busy_loop(sk))
+ sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+ if (xsk_no_wakeup(sk))
+ return 0;
+
+ if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
+ return xsk_wakeup(xs, XDP_WAKEUP_RX);
+ return 0;
+}
+
+static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __xsk_recvmsg(sock, m, len, flags);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static __poll_t xsk_poll(struct file *file, struct socket *sock,
+ struct poll_table_struct *wait)
+{
+ __poll_t mask = 0;
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct xsk_buff_pool *pool;
+
+ sock_poll_wait(file, sock, wait);
+
+ rcu_read_lock();
+ if (xsk_check_common(xs))
+ goto out;
+
+ pool = xs->pool;
+
+ if (pool->cached_need_wakeup) {
+ if (xs->zc)
+ xsk_wakeup(xs, pool->cached_need_wakeup);
+ else if (xs->tx)
+ /* Poll needs to drive Tx also in copy mode */
+ xsk_generic_xmit(sk);
+ }
+
+ if (xs->rx && !xskq_prod_is_empty(xs->rx))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (xs->tx && xsk_tx_writeable(xs))
+ mask |= EPOLLOUT | EPOLLWRNORM;
+out:
+ rcu_read_unlock();
+ return mask;
+}
+
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+ bool umem_queue)
+{
+ struct xsk_queue *q;
+
+ if (entries == 0 || *queue || !is_power_of_2(entries))
+ return -EINVAL;
+
+ q = xskq_create(entries, umem_queue);
+ if (!q)
+ return -ENOMEM;
+
+ /* Make sure queue is ready before it can be seen by others */
+ smp_wmb();
+ WRITE_ONCE(*queue, q);
+ return 0;
+}
+
+static void xsk_unbind_dev(struct xdp_sock *xs)
+{
+ struct net_device *dev = xs->dev;
+
+ if (xs->state != XSK_BOUND)
+ return;
+ WRITE_ONCE(xs->state, XSK_UNBOUND);
+
+ /* Wait for driver to stop using the xdp socket. */
+ xp_del_xsk(xs->pool, xs);
+ synchronize_net();
+ dev_put(dev);
+}
+
+static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
+ struct xdp_sock __rcu ***map_entry)
+{
+ struct xsk_map *map = NULL;
+ struct xsk_map_node *node;
+
+ *map_entry = NULL;
+
+ spin_lock_bh(&xs->map_list_lock);
+ node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
+ node);
+ if (node) {
+ bpf_map_inc(&node->map->map);
+ map = node->map;
+ *map_entry = node->map_entry;
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+ return map;
+}
+
+static void xsk_delete_from_maps(struct xdp_sock *xs)
+{
+ /* This function removes the current XDP socket from all the
+ * maps it resides in. We need to take extra care here, due to
+ * the two locks involved. Each map has a lock synchronizing
+ * updates to the entries, and each socket has a lock that
+ * synchronizes access to the list of maps (map_list). For
+ * deadlock avoidance the locks need to be taken in the order
+ * "map lock"->"socket map list lock". We start off by
+ * accessing the socket map list, and take a reference to the
+ * map to guarantee existence between the
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
+ * calls. Then we ask the map to remove the socket, which
+ * tries to remove the socket from the map. Note that there
+ * might be updates to the map between
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
+ */
+ struct xdp_sock __rcu **map_entry = NULL;
+ struct xsk_map *map;
+
+ while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
+ xsk_map_try_sock_delete(map, xs, map_entry);
+ bpf_map_put(&map->map);
+ }
+}
+
+static int xsk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+
+ if (xs->skb)
+ xsk_drop_skb(xs->skb);
+
+ mutex_lock(&net->xdp.lock);
+ sk_del_node_init_rcu(sk);
+ mutex_unlock(&net->xdp.lock);
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+
+ xsk_delete_from_maps(xs);
+ mutex_lock(&xs->mutex);
+ xsk_unbind_dev(xs);
+ mutex_unlock(&xs->mutex);
+
+ xskq_destroy(xs->rx);
+ xskq_destroy(xs->tx);
+ xskq_destroy(xs->fq_tmp);
+ xskq_destroy(xs->cq_tmp);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ sock_put(sk);
+
+ return 0;
+}
+
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+ struct socket *sock;
+ int err;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return ERR_PTR(-ENOTSOCK);
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return ERR_PTR(-ENOPROTOOPT);
+ }
+
+ return sock;
+}
+
+static bool xsk_validate_queues(struct xdp_sock *xs)
+{
+ return xs->fq_tmp && xs->cq_tmp;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+ struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net_device *dev;
+ int bound_dev_if;
+ u32 flags, qid;
+ int err = 0;
+
+ if (addr_len < sizeof(struct sockaddr_xdp))
+ return -EINVAL;
+ if (sxdp->sxdp_family != AF_XDP)
+ return -EINVAL;
+
+ flags = sxdp->sxdp_flags;
+ if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+ XDP_USE_NEED_WAKEUP | XDP_USE_SG))
+ return -EINVAL;
+
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
+ return -EINVAL;
+
+ rtnl_lock();
+ mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ err = -EBUSY;
+ goto out_release;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_release;
+ }
+
+ if (!xs->rx && !xs->tx) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ qid = sxdp->sxdp_queue_id;
+
+ if (flags & XDP_SHARED_UMEM) {
+ struct xdp_sock *umem_xs;
+ struct socket *sock;
+
+ if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+ (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
+ /* Cannot specify flags for shared sockets. */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (xs->umem) {
+ /* We have already our own. */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+ if (IS_ERR(sock)) {
+ err = PTR_ERR(sock);
+ goto out_unlock;
+ }
+
+ umem_xs = xdp_sk(sock->sk);
+ if (!xsk_is_bound(umem_xs)) {
+ err = -EBADF;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+ /* Share the umem with another socket on another qid
+ * and/or device.
+ */
+ xs->pool = xp_create_and_assign_umem(xs,
+ umem_xs->umem);
+ if (!xs->pool) {
+ err = -ENOMEM;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
+ qid);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+ } else {
+ /* Share the buffer pool with the other socket. */
+ if (xs->fq_tmp || xs->cq_tmp) {
+ /* Do not allow setting your own fq or cq. */
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ xp_get_pool(umem_xs->pool);
+ xs->pool = umem_xs->pool;
+
+ /* If underlying shared umem was created without Tx
+ * ring, allocate Tx descs array that Tx batching API
+ * utilizes
+ */
+ if (xs->tx && !xs->pool->tx_descs) {
+ err = xp_alloc_tx_descs(xs->pool, xs);
+ if (err) {
+ xp_put_pool(xs->pool);
+ xs->pool = NULL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+ }
+ }
+
+ xdp_get_umem(umem_xs->umem);
+ WRITE_ONCE(xs->umem, umem_xs->umem);
+ sockfd_put(sock);
+ } else if (!xs->umem || !xsk_validate_queues(xs)) {
+ err = -EINVAL;
+ goto out_unlock;
+ } else {
+ /* This xsk has its own umem. */
+ xs->pool = xp_create_and_assign_umem(xs, xs->umem);
+ if (!xs->pool) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev(xs->pool, dev, qid, flags);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
+ goto out_unlock;
+ }
+ }
+
+ /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
+ xs->fq_tmp = NULL;
+ xs->cq_tmp = NULL;
+
+ xs->dev = dev;
+ xs->zc = xs->umem->zc;
+ xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
+ xs->queue_id = qid;
+ xp_add_xsk(xs->pool, xs);
+
+out_unlock:
+ if (err) {
+ dev_put(dev);
+ } else {
+ /* Matches smp_rmb() in bind() for shared umem
+ * sockets, and xsk_is_bound().
+ */
+ smp_wmb();
+ WRITE_ONCE(xs->state, XSK_BOUND);
+ }
+out_release:
+ mutex_unlock(&xs->mutex);
+ rtnl_unlock();
+ return err;
+}
+
+struct xdp_umem_reg_v1 {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 chunk_size;
+ __u32 headroom;
+};
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ int err;
+
+ if (level != SOL_XDP)
+ return -ENOPROTOOPT;
+
+ switch (optname) {
+ case XDP_RX_RING:
+ case XDP_TX_RING:
+ {
+ struct xsk_queue **q;
+ int entries;
+
+ if (optlen < sizeof(entries))
+ return -EINVAL;
+ if (copy_from_sockptr(&entries, optval, sizeof(entries)))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
+ q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
+ err = xsk_init_queue(entries, q, false);
+ if (!err && optname == XDP_TX_RING)
+ /* Tx needs to be explicitly woken up the first time */
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+ mutex_unlock(&xs->mutex);
+ return err;
+ }
+ case XDP_UMEM_REG:
+ {
+ size_t mr_size = sizeof(struct xdp_umem_reg);
+ struct xdp_umem_reg mr = {};
+ struct xdp_umem *umem;
+
+ if (optlen < sizeof(struct xdp_umem_reg_v1))
+ return -EINVAL;
+ else if (optlen < sizeof(mr))
+ mr_size = sizeof(struct xdp_umem_reg_v1);
+
+ if (copy_from_sockptr(&mr, optval, mr_size))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY || xs->umem) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
+
+ umem = xdp_umem_create(&mr);
+ if (IS_ERR(umem)) {
+ mutex_unlock(&xs->mutex);
+ return PTR_ERR(umem);
+ }
+
+ /* Make sure umem is ready before it can be seen by others */
+ smp_wmb();
+ WRITE_ONCE(xs->umem, umem);
+ mutex_unlock(&xs->mutex);
+ return 0;
+ }
+ case XDP_UMEM_FILL_RING:
+ case XDP_UMEM_COMPLETION_RING:
+ {
+ struct xsk_queue **q;
+ int entries;
+
+ if (copy_from_sockptr(&entries, optval, sizeof(entries)))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
+
+ q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
+ &xs->cq_tmp;
+ err = xsk_init_queue(entries, q, true);
+ mutex_unlock(&xs->mutex);
+ return err;
+ }
+ default:
+ break;
+ }
+
+ return -ENOPROTOOPT;
+}
+
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
+struct xdp_statistics_v1 {
+ __u64 rx_dropped;
+ __u64 rx_invalid_descs;
+ __u64 tx_invalid_descs;
+};
+
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ int len;
+
+ if (level != SOL_XDP)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case XDP_STATISTICS:
+ {
+ struct xdp_statistics stats = {};
+ bool extra_stats = true;
+ size_t stats_size;
+
+ if (len < sizeof(struct xdp_statistics_v1)) {
+ return -EINVAL;
+ } else if (len < sizeof(stats)) {
+ extra_stats = false;
+ stats_size = sizeof(struct xdp_statistics_v1);
+ } else {
+ stats_size = sizeof(stats);
+ }
+
+ mutex_lock(&xs->mutex);
+ stats.rx_dropped = xs->rx_dropped;
+ if (extra_stats) {
+ stats.rx_ring_full = xs->rx_queue_full;
+ stats.rx_fill_ring_empty_descs =
+ xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
+ stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
+ } else {
+ stats.rx_dropped += xs->rx_queue_full;
+ }
+ stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+ stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+ mutex_unlock(&xs->mutex);
+
+ if (copy_to_user(optval, &stats, stats_size))
+ return -EFAULT;
+ if (put_user(stats_size, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
+ case XDP_MMAP_OFFSETS:
+ {
+ struct xdp_mmap_offsets off;
+ struct xdp_mmap_offsets_v1 off_v1;
+ bool flags_supported = true;
+ void *to_copy;
+
+ if (len < sizeof(off_v1))
+ return -EINVAL;
+ else if (len < sizeof(off))
+ flags_supported = false;
+
+ if (flags_supported) {
+ /* xdp_ring_offset is identical to xdp_ring_offset_v1
+ * except for the flags field added to the end.
+ */
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.rx);
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.tx);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.fr);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.cr);
+ off.rx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.tx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.fr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+ off.cr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+
+ len = sizeof(off);
+ to_copy = &off;
+ } else {
+ xsk_enter_rxtx_offsets(&off_v1.rx);
+ xsk_enter_rxtx_offsets(&off_v1.tx);
+ xsk_enter_umem_offsets(&off_v1.fr);
+ xsk_enter_umem_offsets(&off_v1.cr);
+
+ len = sizeof(off_v1);
+ to_copy = &off_v1;
+ }
+
+ if (copy_to_user(optval, to_copy, len))
+ return -EFAULT;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
+ case XDP_OPTIONS:
+ {
+ struct xdp_options opts = {};
+
+ if (len < sizeof(opts))
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ if (xs->zc)
+ opts.flags |= XDP_OPTIONS_ZEROCOPY;
+ mutex_unlock(&xs->mutex);
+
+ len = sizeof(opts);
+ if (copy_to_user(optval, &opts, len))
+ return -EFAULT;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
+ default:
+ break;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int xsk_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct xdp_sock *xs = xdp_sk(sock->sk);
+ int state = READ_ONCE(xs->state);
+ struct xsk_queue *q = NULL;
+
+ if (state != XSK_READY && state != XSK_BOUND)
+ return -EBUSY;
+
+ if (offset == XDP_PGOFF_RX_RING) {
+ q = READ_ONCE(xs->rx);
+ } else if (offset == XDP_PGOFF_TX_RING) {
+ q = READ_ONCE(xs->tx);
+ } else {
+ /* Matches the smp_wmb() in XDP_UMEM_REG */
+ smp_rmb();
+ if (offset == XDP_UMEM_PGOFF_FILL_RING)
+ q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
+ READ_ONCE(xs->pool->fq);
+ else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+ q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
+ READ_ONCE(xs->pool->cq);
+ }
+
+ if (!q)
+ return -EINVAL;
+
+ /* Matches the smp_wmb() in xsk_init_queue */
+ smp_rmb();
+ if (size > q->ring_vmalloc_size)
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, q->ring, 0);
+}
+
+static int xsk_notifier(struct notifier_block *this,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct sock *sk;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ mutex_lock(&net->xdp.lock);
+ sk_for_each(sk, &net->xdp.list) {
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ mutex_lock(&xs->mutex);
+ if (xs->dev == dev) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ xsk_unbind_dev(xs);
+
+ /* Clear device references. */
+ xp_clear_dev(xs->pool);
+ }
+ mutex_unlock(&xs->mutex);
+ }
+ mutex_unlock(&net->xdp.lock);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct proto xsk_proto = {
+ .name = "XDP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+ .family = PF_XDP,
+ .owner = THIS_MODULE,
+ .release = xsk_release,
+ .bind = xsk_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = xsk_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = xsk_setsockopt,
+ .getsockopt = xsk_getsockopt,
+ .sendmsg = xsk_sendmsg,
+ .recvmsg = xsk_recvmsg,
+ .mmap = xsk_mmap,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ return;
+
+ if (!xp_put_pool(xs->pool))
+ xdp_put_umem(xs->umem, !xs->pool);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct xdp_sock *xs;
+ struct sock *sk;
+
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+ if (!sk)
+ return -ENOBUFS;
+
+ sock->ops = &xsk_proto_ops;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_family = PF_XDP;
+
+ sk->sk_destruct = xsk_destruct;
+
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
+ xs = xdp_sk(sk);
+ xs->state = XSK_READY;
+ mutex_init(&xs->mutex);
+ spin_lock_init(&xs->rx_lock);
+
+ INIT_LIST_HEAD(&xs->map_list);
+ spin_lock_init(&xs->map_list_lock);
+
+ mutex_lock(&net->xdp.lock);
+ sk_add_node_rcu(sk, &net->xdp.list);
+ mutex_unlock(&net->xdp.lock);
+
+ sock_prot_inuse_add(net, &xsk_proto, 1);
+
+ return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+ .family = PF_XDP,
+ .create = xsk_create,
+ .owner = THIS_MODULE,
+};
+
+static struct notifier_block xsk_netdev_notifier = {
+ .notifier_call = xsk_notifier,
+};
+
+static int __net_init xsk_net_init(struct net *net)
+{
+ mutex_init(&net->xdp.lock);
+ INIT_HLIST_HEAD(&net->xdp.list);
+ return 0;
+}
+
+static void __net_exit xsk_net_exit(struct net *net)
+{
+ WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
+}
+
+static struct pernet_operations xsk_net_ops = {
+ .init = xsk_net_init,
+ .exit = xsk_net_exit,
+};
+
+static int __init xsk_init(void)
+{
+ int err, cpu;
+
+ err = proto_register(&xsk_proto, 0 /* no slab */);
+ if (err)
+ goto out;
+
+ err = sock_register(&xsk_family_ops);
+ if (err)
+ goto out_proto;
+
+ err = register_pernet_subsys(&xsk_net_ops);
+ if (err)
+ goto out_sk;
+
+ err = register_netdevice_notifier(&xsk_netdev_notifier);
+ if (err)
+ goto out_pernet;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
+ return 0;
+
+out_pernet:
+ unregister_pernet_subsys(&xsk_net_ops);
+out_sk:
+ sock_unregister(PF_XDP);
+out_proto:
+ proto_unregister(&xsk_proto);
+out:
+ return err;
+}
+
+fs_initcall(xsk_init);
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
new file mode 100644
index 0000000000..a4bc4749fa
--- /dev/null
+++ b/net/xdp/xsk.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019 Intel Corporation. */
+
+#ifndef XSK_H_
+#define XSK_H_
+
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+struct xdp_ring_offset_v1 {
+ __u64 producer;
+ __u64 consumer;
+ __u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+ struct xdp_ring_offset_v1 rx;
+ struct xdp_ring_offset_v1 tx;
+ struct xdp_ring_offset_v1 fr;
+ struct xdp_ring_offset_v1 cr;
+};
+
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket reside in.
+ */
+
+struct xsk_map_node {
+ struct list_head node;
+ struct xsk_map *map;
+ struct xdp_sock __rcu **map_entry;
+};
+
+static inline struct xdp_sock *xdp_sk(struct sock *sk)
+{
+ return (struct xdp_sock *)sk;
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock __rcu **map_entry);
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id);
+
+#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
new file mode 100644
index 0000000000..b0a6116778
--- /dev/null
+++ b/net/xdp/xsk_buff_pool.c
@@ -0,0 +1,696 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <net/xsk_buff_pool.h>
+#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
+
+void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+}
+
+void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_del_rcu(&xs->tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+}
+
+void xp_destroy(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return;
+
+ kvfree(pool->tx_descs);
+ kvfree(pool->heads);
+ kvfree(pool);
+}
+
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+ pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
+ GFP_KERNEL);
+ if (!pool->tx_descs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
+ struct xdp_umem *umem)
+{
+ bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ struct xsk_buff_pool *pool;
+ struct xdp_buff_xsk *xskb;
+ u32 i, entries;
+
+ entries = unaligned ? umem->chunks : 0;
+ pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL);
+ if (!pool)
+ goto out;
+
+ pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
+ if (!pool->heads)
+ goto out;
+
+ if (xs->tx)
+ if (xp_alloc_tx_descs(pool, xs))
+ goto out;
+
+ pool->chunk_mask = ~((u64)umem->chunk_size - 1);
+ pool->addrs_cnt = umem->size;
+ pool->heads_cnt = umem->chunks;
+ pool->free_heads_cnt = umem->chunks;
+ pool->headroom = umem->headroom;
+ pool->chunk_size = umem->chunk_size;
+ pool->chunk_shift = ffs(umem->chunk_size) - 1;
+ pool->unaligned = unaligned;
+ pool->frame_len = umem->chunk_size - umem->headroom -
+ XDP_PACKET_HEADROOM;
+ pool->umem = umem;
+ pool->addrs = umem->addrs;
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->xskb_list);
+ INIT_LIST_HEAD(&pool->xsk_tx_list);
+ spin_lock_init(&pool->xsk_tx_list_lock);
+ spin_lock_init(&pool->cq_lock);
+ refcount_set(&pool->users, 1);
+
+ pool->fq = xs->fq_tmp;
+ pool->cq = xs->cq_tmp;
+
+ for (i = 0; i < pool->free_heads_cnt; i++) {
+ xskb = &pool->heads[i];
+ xskb->pool = pool;
+ xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
+ INIT_LIST_HEAD(&xskb->free_list_node);
+ INIT_LIST_HEAD(&xskb->xskb_list_node);
+ if (pool->unaligned)
+ pool->free_heads[i] = xskb;
+ else
+ xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
+ }
+
+ return pool;
+
+out:
+ xp_destroy(pool);
+ return NULL;
+}
+
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
+{
+ u32 i;
+
+ for (i = 0; i < pool->heads_cnt; i++)
+ pool->heads[i].xdp.rxq = rxq;
+}
+EXPORT_SYMBOL(xp_set_rxq_info);
+
+static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
+{
+ struct netdev_bpf bpf;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (pool->umem->zc) {
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = NULL;
+ bpf.xsk.queue_id = pool->queue_id;
+
+ err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
+
+ if (err)
+ WARN(1, "Failed to disable zero-copy!\n");
+ }
+}
+
+#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \
+ NETDEV_XDP_ACT_REDIRECT | \
+ NETDEV_XDP_ACT_XSK_ZEROCOPY)
+
+int xp_assign_dev(struct xsk_buff_pool *pool,
+ struct net_device *netdev, u16 queue_id, u16 flags)
+{
+ bool force_zc, force_copy;
+ struct netdev_bpf bpf;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ force_zc = flags & XDP_ZEROCOPY;
+ force_copy = flags & XDP_COPY;
+
+ if (force_zc && force_copy)
+ return -EINVAL;
+
+ if (xsk_get_pool_from_qid(netdev, queue_id))
+ return -EBUSY;
+
+ pool->netdev = netdev;
+ pool->queue_id = queue_id;
+ err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
+ if (err)
+ return err;
+
+ if (flags & XDP_USE_SG)
+ pool->umem->flags |= XDP_UMEM_SG_FLAG;
+
+ if (flags & XDP_USE_NEED_WAKEUP)
+ pool->uses_need_wakeup = true;
+ /* Tx needs to be explicitly woken up the first time. Also
+ * for supporting drivers that do not implement this
+ * feature. They will always have to call sendto() or poll().
+ */
+ pool->cached_need_wakeup = XDP_WAKEUP_TX;
+
+ dev_hold(netdev);
+
+ if (force_copy)
+ /* For copy-mode, we are done. */
+ return 0;
+
+ if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+
+ if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = pool;
+ bpf.xsk.queue_id = queue_id;
+
+ err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
+ if (err)
+ goto err_unreg_pool;
+
+ if (!pool->dma_pages) {
+ WARN(1, "Driver did not DMA map zero-copy buffers");
+ err = -EINVAL;
+ goto err_unreg_xsk;
+ }
+ pool->umem->zc = true;
+ return 0;
+
+err_unreg_xsk:
+ xp_disable_drv_zc(pool);
+err_unreg_pool:
+ if (!force_zc)
+ err = 0; /* fallback to copy mode */
+ if (err) {
+ xsk_clear_pool_at_qid(netdev, queue_id);
+ dev_put(netdev);
+ }
+ return err;
+}
+
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
+ struct net_device *dev, u16 queue_id)
+{
+ u16 flags;
+ struct xdp_umem *umem = umem_xs->umem;
+
+ /* One fill and completion ring required for each queue id. */
+ if (!pool->fq || !pool->cq)
+ return -EINVAL;
+
+ flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
+ if (umem_xs->pool->uses_need_wakeup)
+ flags |= XDP_USE_NEED_WAKEUP;
+
+ return xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+void xp_clear_dev(struct xsk_buff_pool *pool)
+{
+ if (!pool->netdev)
+ return;
+
+ xp_disable_drv_zc(pool);
+ xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
+ dev_put(pool->netdev);
+ pool->netdev = NULL;
+}
+
+static void xp_release_deferred(struct work_struct *work)
+{
+ struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
+ work);
+
+ rtnl_lock();
+ xp_clear_dev(pool);
+ rtnl_unlock();
+
+ if (pool->fq) {
+ xskq_destroy(pool->fq);
+ pool->fq = NULL;
+ }
+
+ if (pool->cq) {
+ xskq_destroy(pool->cq);
+ pool->cq = NULL;
+ }
+
+ xdp_put_umem(pool->umem, false);
+ xp_destroy(pool);
+}
+
+void xp_get_pool(struct xsk_buff_pool *pool)
+{
+ refcount_inc(&pool->users);
+}
+
+bool xp_put_pool(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return false;
+
+ if (refcount_dec_and_test(&pool->users)) {
+ INIT_WORK(&pool->work, xp_release_deferred);
+ schedule_work(&pool->work);
+ return true;
+ }
+
+ return false;
+}
+
+static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
+{
+ struct xsk_dma_map *dma_map;
+
+ list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
+ if (dma_map->netdev == pool->netdev)
+ return dma_map;
+ }
+
+ return NULL;
+}
+
+static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev,
+ u32 nr_pages, struct xdp_umem *umem)
+{
+ struct xsk_dma_map *dma_map;
+
+ dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+ if (!dma_map)
+ return NULL;
+
+ dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);
+ if (!dma_map->dma_pages) {
+ kfree(dma_map);
+ return NULL;
+ }
+
+ dma_map->netdev = netdev;
+ dma_map->dev = dev;
+ dma_map->dma_need_sync = false;
+ dma_map->dma_pages_cnt = nr_pages;
+ refcount_set(&dma_map->users, 1);
+ list_add(&dma_map->list, &umem->xsk_dma_list);
+ return dma_map;
+}
+
+static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
+{
+ list_del(&dma_map->list);
+ kvfree(dma_map->dma_pages);
+ kfree(dma_map);
+}
+
+static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
+{
+ dma_addr_t *dma;
+ u32 i;
+
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+ dma = &dma_map->dma_pages[i];
+ if (*dma) {
+ *dma &= ~XSK_NEXT_PG_CONTIG_MASK;
+ dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ *dma = 0;
+ }
+ }
+
+ xp_destroy_dma_map(dma_map);
+}
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+ struct xsk_dma_map *dma_map;
+
+ if (!pool->dma_pages)
+ return;
+
+ dma_map = xp_find_dma_map(pool);
+ if (!dma_map) {
+ WARN(1, "Could not find dma_map for device");
+ return;
+ }
+
+ if (!refcount_dec_and_test(&dma_map->users))
+ return;
+
+ __xp_dma_unmap(dma_map, attrs);
+ kvfree(pool->dma_pages);
+ pool->dma_pages = NULL;
+ pool->dma_pages_cnt = 0;
+ pool->dev = NULL;
+}
+EXPORT_SYMBOL(xp_dma_unmap);
+
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
+{
+ u32 i;
+
+ for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
+ if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+ dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+ else
+ dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+ }
+}
+
+static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
+{
+ if (!pool->unaligned) {
+ u32 i;
+
+ for (i = 0; i < pool->heads_cnt; i++) {
+ struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+ xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
+ }
+ }
+
+ pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
+ if (!pool->dma_pages)
+ return -ENOMEM;
+
+ pool->dev = dma_map->dev;
+ pool->dma_pages_cnt = dma_map->dma_pages_cnt;
+ pool->dma_need_sync = dma_map->dma_need_sync;
+ memcpy(pool->dma_pages, dma_map->dma_pages,
+ pool->dma_pages_cnt * sizeof(*pool->dma_pages));
+
+ return 0;
+}
+
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+ unsigned long attrs, struct page **pages, u32 nr_pages)
+{
+ struct xsk_dma_map *dma_map;
+ dma_addr_t dma;
+ int err;
+ u32 i;
+
+ dma_map = xp_find_dma_map(pool);
+ if (dma_map) {
+ err = xp_init_dma_info(pool, dma_map);
+ if (err)
+ return err;
+
+ refcount_inc(&dma_map->users);
+ return 0;
+ }
+
+ dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem);
+ if (!dma_map)
+ return -ENOMEM;
+
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+ dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ if (dma_mapping_error(dev, dma)) {
+ __xp_dma_unmap(dma_map, attrs);
+ return -ENOMEM;
+ }
+ if (dma_need_sync(dev, dma))
+ dma_map->dma_need_sync = true;
+ dma_map->dma_pages[i] = dma;
+ }
+
+ if (pool->unaligned)
+ xp_check_dma_contiguity(dma_map);
+
+ err = xp_init_dma_info(pool, dma_map);
+ if (err) {
+ __xp_dma_unmap(dma_map, attrs);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xp_dma_map);
+
+static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+ u64 addr)
+{
+ return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
+}
+
+static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_unaligned_extract_addr(*addr);
+ if (*addr >= pool->addrs_cnt ||
+ *addr + pool->chunk_size > pool->addrs_cnt ||
+ xp_addr_crosses_non_contig_pg(pool, *addr))
+ return false;
+ return true;
+}
+
+static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_aligned_extract_addr(pool, *addr);
+ return *addr < pool->addrs_cnt;
+}
+
+static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+ u64 addr;
+ bool ok;
+
+ if (pool->free_heads_cnt == 0)
+ return NULL;
+
+ for (;;) {
+ if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
+ pool->fq->queue_empty_descs++;
+ return NULL;
+ }
+
+ ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+ xp_check_aligned(pool, &addr);
+ if (!ok) {
+ pool->fq->invalid_descs++;
+ xskq_cons_release(pool->fq);
+ continue;
+ }
+ break;
+ }
+
+ if (pool->unaligned) {
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+ xp_init_xskb_addr(xskb, pool, addr);
+ if (pool->dma_pages)
+ xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+ } else {
+ xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+ }
+
+ xskq_cons_release(pool->fq);
+ return xskb;
+}
+
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+
+ if (!pool->free_list_cnt) {
+ xskb = __xp_alloc(pool);
+ if (!xskb)
+ return NULL;
+ } else {
+ pool->free_list_cnt--;
+ xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
+ free_list_node);
+ list_del_init(&xskb->free_list_node);
+ }
+
+ xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+ xskb->xdp.data_meta = xskb->xdp.data;
+ xskb->xdp.flags = 0;
+
+ if (pool->dma_need_sync) {
+ dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+ pool->frame_len,
+ DMA_BIDIRECTIONAL);
+ }
+ return &xskb->xdp;
+}
+EXPORT_SYMBOL(xp_alloc);
+
+static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+ u32 i, cached_cons, nb_entries;
+
+ if (max > pool->free_heads_cnt)
+ max = pool->free_heads_cnt;
+ max = xskq_cons_nb_entries(pool->fq, max);
+
+ cached_cons = pool->fq->cached_cons;
+ nb_entries = max;
+ i = max;
+ while (i--) {
+ struct xdp_buff_xsk *xskb;
+ u64 addr;
+ bool ok;
+
+ __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr);
+
+ ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+ xp_check_aligned(pool, &addr);
+ if (unlikely(!ok)) {
+ pool->fq->invalid_descs++;
+ nb_entries--;
+ continue;
+ }
+
+ if (pool->unaligned) {
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+ xp_init_xskb_addr(xskb, pool, addr);
+ if (pool->dma_pages)
+ xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+ } else {
+ xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+ }
+
+ *xdp = &xskb->xdp;
+ xdp++;
+ }
+
+ xskq_cons_release_n(pool->fq, max);
+ return nb_entries;
+}
+
+static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries)
+{
+ struct xdp_buff_xsk *xskb;
+ u32 i;
+
+ nb_entries = min_t(u32, nb_entries, pool->free_list_cnt);
+
+ i = nb_entries;
+ while (i--) {
+ xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node);
+ list_del_init(&xskb->free_list_node);
+
+ *xdp = &xskb->xdp;
+ xdp++;
+ }
+ pool->free_list_cnt -= nb_entries;
+
+ return nb_entries;
+}
+
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+ u32 nb_entries1 = 0, nb_entries2;
+
+ if (unlikely(pool->dma_need_sync)) {
+ struct xdp_buff *buff;
+
+ /* Slow path */
+ buff = xp_alloc(pool);
+ if (buff)
+ *xdp = buff;
+ return !!buff;
+ }
+
+ if (unlikely(pool->free_list_cnt)) {
+ nb_entries1 = xp_alloc_reused(pool, xdp, max);
+ if (nb_entries1 == max)
+ return nb_entries1;
+
+ max -= nb_entries1;
+ xdp += nb_entries1;
+ }
+
+ nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max);
+ if (!nb_entries2)
+ pool->fq->queue_empty_descs++;
+
+ return nb_entries1 + nb_entries2;
+}
+EXPORT_SYMBOL(xp_alloc_batch);
+
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
+{
+ if (pool->free_list_cnt >= count)
+ return true;
+ return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+}
+EXPORT_SYMBOL(xp_can_alloc);
+
+void xp_free(struct xdp_buff_xsk *xskb)
+{
+ if (!list_empty(&xskb->free_list_node))
+ return;
+
+ xskb->pool->free_list_cnt++;
+ list_add(&xskb->free_list_node, &xskb->pool->free_list);
+}
+EXPORT_SYMBOL(xp_free);
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return pool->addrs + addr;
+}
+EXPORT_SYMBOL(xp_raw_get_data);
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return (pool->dma_pages[addr >> PAGE_SHIFT] &
+ ~XSK_NEXT_PG_CONTIG_MASK) +
+ (addr & ~PAGE_MASK);
+}
+EXPORT_SYMBOL(xp_raw_get_dma);
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
+{
+ dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+ xskb->pool->frame_len, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+ size_t size)
+{
+ dma_sync_single_range_for_device(pool->dev, dma, 0,
+ size, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
new file mode 100644
index 0000000000..22b36c8143
--- /dev/null
+++ b/net/xdp/xsk_diag.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets monitoring support
+ *
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ * Author: Björn Töpel <bjorn.topel@intel.com>
+ */
+
+#include <linux/module.h>
+#include <net/xdp_sock.h>
+#include <linux/xdp_diag.h>
+#include <linux/sock_diag.h>
+
+#include "xsk_queue.h"
+#include "xsk.h"
+
+static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+ struct xdp_diag_info di = {};
+
+ di.ifindex = xs->dev ? xs->dev->ifindex : 0;
+ di.queue_id = xs->queue_id;
+ return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di);
+}
+
+static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type,
+ struct sk_buff *nlskb)
+{
+ struct xdp_diag_ring dr = {};
+
+ dr.entries = queue->nentries;
+ return nla_put(nlskb, nl_type, sizeof(dr), &dr);
+}
+
+static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs,
+ struct sk_buff *nlskb)
+{
+ int err = 0;
+
+ if (xs->rx)
+ err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb);
+ if (!err && xs->tx)
+ err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb);
+ return err;
+}
+
+static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+ struct xsk_buff_pool *pool = xs->pool;
+ struct xdp_umem *umem = xs->umem;
+ struct xdp_diag_umem du = {};
+ int err;
+
+ if (!umem)
+ return 0;
+
+ du.id = umem->id;
+ du.size = umem->size;
+ du.num_pages = umem->npgs;
+ du.chunk_size = umem->chunk_size;
+ du.headroom = umem->headroom;
+ du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0;
+ du.queue_id = pool ? pool->queue_id : 0;
+ du.flags = 0;
+ if (umem->zc)
+ du.flags |= XDP_DU_F_ZEROCOPY;
+ du.refs = refcount_read(&umem->users);
+
+ err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
+ if (!err && pool && pool->fq)
+ err = xsk_diag_put_ring(pool->fq,
+ XDP_DIAG_UMEM_FILL_RING, nlskb);
+ if (!err && pool && pool->cq)
+ err = xsk_diag_put_ring(pool->cq,
+ XDP_DIAG_UMEM_COMPLETION_RING, nlskb);
+ return err;
+}
+
+static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+ struct xdp_diag_stats du = {};
+
+ du.n_rx_dropped = xs->rx_dropped;
+ du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx);
+ du.n_rx_full = xs->rx_queue_full;
+ du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
+ du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx);
+ du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx);
+ return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du);
+}
+
+static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
+ struct xdp_diag_req *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct xdp_diag_msg *msg;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ msg = nlmsg_data(nlh);
+ memset(msg, 0, sizeof(*msg));
+ msg->xdiag_family = AF_XDP;
+ msg->xdiag_type = sk->sk_type;
+ msg->xdiag_ino = sk_ino;
+ sock_diag_save_cookie(sk, msg->xdiag_cookie);
+
+ mutex_lock(&xs->mutex);
+ if (READ_ONCE(xs->state) == XSK_UNBOUND)
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_INFO) &&
+ nla_put_u32(nlskb, XDP_DIAG_UID,
+ from_kuid_munged(user_ns, sock_i_uid(sk))))
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_RING_CFG) &&
+ xsk_diag_put_rings_cfg(xs, nlskb))
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_UMEM) &&
+ xsk_diag_put_umem(xs, nlskb))
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ if ((req->xdiag_show & XDP_SHOW_STATS) &&
+ xsk_diag_put_stats(xs, nlskb))
+ goto out_nlmsg_trim;
+
+ mutex_unlock(&xs->mutex);
+ nlmsg_end(nlskb, nlh);
+ return 0;
+
+out_nlmsg_trim:
+ mutex_unlock(&xs->mutex);
+ nlmsg_cancel(nlskb, nlh);
+ return -EMSGSIZE;
+}
+
+static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb)
+{
+ struct xdp_diag_req *req = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(nlskb->sk);
+ int num = 0, s_num = cb->args[0];
+ struct sock *sk;
+
+ mutex_lock(&net->xdp.lock);
+
+ sk_for_each(sk, &net->xdp.list) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num++ < s_num)
+ continue;
+
+ if (xsk_diag_fill(sk, nlskb, req,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ num--;
+ break;
+ }
+ }
+
+ mutex_unlock(&net->xdp.lock);
+ cb->args[0] = num;
+ return nlskb->len;
+}
+
+static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
+{
+ struct netlink_dump_control c = { .dump = xsk_diag_dump };
+ int hdrlen = sizeof(struct xdp_diag_req);
+ struct net *net = sock_net(nlskb->sk);
+
+ if (nlmsg_len(hdr) < hdrlen)
+ return -EINVAL;
+
+ if (!(hdr->nlmsg_flags & NLM_F_DUMP))
+ return -EOPNOTSUPP;
+
+ return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c);
+}
+
+static const struct sock_diag_handler xsk_diag_handler = {
+ .family = AF_XDP,
+ .dump = xsk_diag_handler_dump,
+};
+
+static int __init xsk_diag_init(void)
+{
+ return sock_diag_register(&xsk_diag_handler);
+}
+
+static void __exit xsk_diag_exit(void)
+{
+ sock_diag_unregister(&xsk_diag_handler);
+}
+
+module_init(xsk_diag_init);
+module_exit(xsk_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 0000000000..d2c2640300
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/overflow.h>
+#include <linux/vmalloc.h>
+#include <net/xdp_sock_drv.h>
+
+#include "xsk_queue.h"
+
+static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
+{
+ struct xdp_umem_ring *umem_ring;
+ struct xdp_rxtx_ring *rxtx_ring;
+
+ if (umem_queue)
+ return struct_size(umem_ring, desc, q->nentries);
+ return struct_size(rxtx_ring, desc, q->nentries);
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
+{
+ struct xsk_queue *q;
+ size_t size;
+
+ q = kzalloc(sizeof(*q), GFP_KERNEL);
+ if (!q)
+ return NULL;
+
+ q->nentries = nentries;
+ q->ring_mask = nentries - 1;
+
+ size = xskq_get_ring_size(q, umem_queue);
+
+ /* size which is overflowing or close to SIZE_MAX will become 0 in
+ * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous
+ * is_power_of_2(), the rest will be handled by vmalloc_user()
+ */
+ if (unlikely(size == SIZE_MAX)) {
+ kfree(q);
+ return NULL;
+ }
+
+ size = PAGE_ALIGN(size);
+
+ q->ring = vmalloc_user(size);
+ if (!q->ring) {
+ kfree(q);
+ return NULL;
+ }
+
+ q->ring_vmalloc_size = size;
+ return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+ if (!q)
+ return;
+
+ vfree(q->ring);
+ kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 0000000000..13354a1e42
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#include "xsk.h"
+
+struct xdp_ring {
+ u32 producer ____cacheline_aligned_in_smp;
+ /* Hinder the adjacent cache prefetcher to prefetch the consumer
+ * pointer if the producer pointer is touched and vice versa.
+ */
+ u32 pad1 ____cacheline_aligned_in_smp;
+ u32 consumer ____cacheline_aligned_in_smp;
+ u32 pad2 ____cacheline_aligned_in_smp;
+ u32 flags;
+ u32 pad3 ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+ struct xdp_ring ptrs;
+ struct xdp_desc desc[] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+ struct xdp_ring ptrs;
+ u64 desc[] ____cacheline_aligned_in_smp;
+};
+
+struct xsk_queue {
+ u32 ring_mask;
+ u32 nentries;
+ u32 cached_prod;
+ u32 cached_cons;
+ struct xdp_ring *ring;
+ u64 invalid_descs;
+ u64 queue_empty_descs;
+ size_t ring_vmalloc_size;
+};
+
+struct parsed_desc {
+ u32 mb;
+ u32 valid;
+};
+
+/* The structure of the shared state of the rings are a simple
+ * circular buffer, as outlined in
+ * Documentation/core-api/circular-buffers.rst. For the Rx and
+ * completion ring, the kernel is the producer and user space is the
+ * consumer. For the Tx and fill rings, the kernel is the consumer and
+ * user space is the producer.
+ *
+ * producer consumer
+ *
+ * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C)
+ * STORE $data LOAD $data
+ * STORE.rel ->producer (B) STORE.rel ->consumer (D)
+ * }
+ *
+ * (A) pairs with (D), and (B) pairs with (C).
+ *
+ * Starting with (B), it protects the data from being written after
+ * the producer pointer. If this barrier was missing, the consumer
+ * could observe the producer pointer being set and thus load the data
+ * before the producer has written the new data. The consumer would in
+ * this case load the old data.
+ *
+ * (C) protects the consumer from speculatively loading the data before
+ * the producer pointer actually has been read. If we do not have this
+ * barrier, some architectures could load old data as speculative loads
+ * are not discarded as the CPU does not know there is a dependency
+ * between ->producer and data.
+ *
+ * (A) is a control dependency that separates the load of ->consumer
+ * from the stores of $data. In case ->consumer indicates there is no
+ * room in the buffer to store $data we do not. The dependency will
+ * order both of the stores after the loads. So no barrier is needed.
+ *
+ * (D) protects the load of the data to be observed to happen after the
+ * store of the consumer pointer. If we did not have this memory
+ * barrier, the producer could observe the consumer pointer being set
+ * and overwrite the data with a new value before the consumer got the
+ * chance to read the old value. The consumer would thus miss reading
+ * the old entry and very likely read the new entry twice, once right
+ * now and again after circling through the ring.
+ */
+
+/* The operations on the rings are the following:
+ *
+ * producer consumer
+ *
+ * RESERVE entries PEEK in the ring for entries
+ * WRITE data into the ring READ data from the ring
+ * SUBMIT entries RELEASE entries
+ *
+ * The producer reserves one or more entries in the ring. It can then
+ * fill in these entries and finally submit them so that they can be
+ * seen and read by the consumer.
+ *
+ * The consumer peeks into the ring to see if the producer has written
+ * any new entries. If so, the consumer can then read these entries
+ * and when it is done reading them release them back to the producer
+ * so that the producer can use these slots to fill in new entries.
+ *
+ * The function names below reflect these operations.
+ */
+
+/* Functions that read and validate content from consumer rings. */
+
+static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ u32 idx = cached_cons & q->ring_mask;
+
+ *addr = ring->desc[idx];
+}
+
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+ if (q->cached_cons != q->cached_prod) {
+ __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool xp_unused_options_set(u32 options)
+{
+ return options & ~XDP_PKT_CONTD;
+}
+
+static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ u64 offset = desc->addr & (pool->chunk_size - 1);
+
+ if (!desc->len)
+ return false;
+
+ if (offset + desc->len > pool->chunk_size)
+ return false;
+
+ if (desc->addr >= pool->addrs_cnt)
+ return false;
+
+ if (xp_unused_options_set(desc->options))
+ return false;
+ return true;
+}
+
+static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+
+ if (!desc->len)
+ return false;
+
+ if (desc->len > pool->chunk_size)
+ return false;
+
+ if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
+ xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+ return false;
+
+ if (xp_unused_options_set(desc->options))
+ return false;
+ return true;
+}
+
+static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
+ xp_aligned_validate_desc(pool, desc);
+}
+
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+ return q->cached_cons != q->cached_prod;
+}
+
+static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
+ struct xdp_desc *d,
+ struct xsk_buff_pool *pool)
+{
+ if (!xp_validate_desc(pool, d)) {
+ q->invalid_descs++;
+ return false;
+ }
+ return true;
+}
+
+static inline bool xskq_cons_read_desc(struct xsk_queue *q,
+ struct xdp_desc *desc,
+ struct xsk_buff_pool *pool)
+{
+ if (q->cached_cons != q->cached_prod) {
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ u32 idx = q->cached_cons & q->ring_mask;
+
+ *desc = ring->desc[idx];
+ return xskq_cons_is_valid_desc(q, desc, pool);
+ }
+
+ q->queue_empty_descs++;
+ return false;
+}
+
+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+ q->cached_cons += cnt;
+}
+
+static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ struct xdp_desc *desc, struct parsed_desc *parsed)
+{
+ parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
+ parsed->mb = xp_mb_desc(desc);
+}
+
+static inline
+u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ u32 max)
+{
+ u32 cached_cons = q->cached_cons, nb_entries = 0;
+ struct xdp_desc *descs = pool->tx_descs;
+ u32 total_descs = 0, nr_frags = 0;
+
+ /* track first entry, if stumble upon *any* invalid descriptor, rewind
+ * current packet that consists of frags and stop the processing
+ */
+ while (cached_cons != q->cached_prod && nb_entries < max) {
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ u32 idx = cached_cons & q->ring_mask;
+ struct parsed_desc parsed;
+
+ descs[nb_entries] = ring->desc[idx];
+ cached_cons++;
+ parse_desc(q, pool, &descs[nb_entries], &parsed);
+ if (unlikely(!parsed.valid))
+ break;
+
+ if (likely(!parsed.mb)) {
+ total_descs += (nr_frags + 1);
+ nr_frags = 0;
+ } else {
+ nr_frags++;
+ if (nr_frags == pool->netdev->xdp_zc_max_segs) {
+ nr_frags = 0;
+ break;
+ }
+ }
+ nb_entries++;
+ }
+
+ cached_cons -= nr_frags;
+ /* Release valid plus any invalid entries */
+ xskq_cons_release_n(q, cached_cons - q->cached_cons);
+ return total_descs;
+}
+
+/* Functions for consumers */
+
+static inline void __xskq_cons_release(struct xsk_queue *q)
+{
+ smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */
+}
+
+static inline void __xskq_cons_peek(struct xsk_queue *q)
+{
+ /* Refresh the local pointer */
+ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */
+}
+
+static inline void xskq_cons_get_entries(struct xsk_queue *q)
+{
+ __xskq_cons_release(q);
+ __xskq_cons_peek(q);
+}
+
+static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
+{
+ u32 entries = q->cached_prod - q->cached_cons;
+
+ if (entries >= max)
+ return max;
+
+ __xskq_cons_peek(q);
+ entries = q->cached_prod - q->cached_cons;
+
+ return entries >= max ? max : entries;
+}
+
+static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
+{
+ return xskq_cons_nb_entries(q, cnt) >= cnt;
+}
+
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+ if (q->cached_prod == q->cached_cons)
+ xskq_cons_get_entries(q);
+ return xskq_cons_read_addr_unchecked(q, addr);
+}
+
+static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
+ struct xdp_desc *desc,
+ struct xsk_buff_pool *pool)
+{
+ if (q->cached_prod == q->cached_cons)
+ xskq_cons_get_entries(q);
+ return xskq_cons_read_desc(q, desc, pool);
+}
+
+/* To improve performance in the xskq_cons_release functions, only update local state here.
+ * Reflect this to global state when we get new entries from the ring in
+ * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop.
+ */
+static inline void xskq_cons_release(struct xsk_queue *q)
+{
+ q->cached_cons++;
+}
+
+static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+ q->cached_cons -= cnt;
+}
+
+static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
+/* Functions for producers */
+
+static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
+{
+ u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
+
+ if (free_entries >= max)
+ return max;
+
+ /* Refresh the local tail pointer */
+ q->cached_cons = READ_ONCE(q->ring->consumer);
+ free_entries = q->nentries - (q->cached_prod - q->cached_cons);
+
+ return free_entries >= max ? max : free_entries;
+}
+
+static inline bool xskq_prod_is_full(struct xsk_queue *q)
+{
+ return xskq_prod_nb_free(q, 1) ? false : true;
+}
+
+static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+ q->cached_prod -= cnt;
+}
+
+static inline int xskq_prod_reserve(struct xsk_queue *q)
+{
+ if (xskq_prod_is_full(q))
+ return -ENOSPC;
+
+ /* A, matches D */
+ q->cached_prod++;
+ return 0;
+}
+
+static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ if (xskq_prod_is_full(q))
+ return -ENOSPC;
+
+ /* A, matches D */
+ ring->desc[q->cached_prod++ & q->ring_mask] = addr;
+ return 0;
+}
+
+static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
+ u32 nb_entries)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ u32 i, cached_prod;
+
+ /* A, matches D */
+ cached_prod = q->cached_prod;
+ for (i = 0; i < nb_entries; i++)
+ ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
+ q->cached_prod = cached_prod;
+}
+
+static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
+ u64 addr, u32 len, u32 flags)
+{
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ u32 idx;
+
+ if (xskq_prod_is_full(q))
+ return -ENOBUFS;
+
+ /* A, matches D */
+ idx = q->cached_prod++ & q->ring_mask;
+ ring->desc[idx].addr = addr;
+ ring->desc[idx].len = len;
+ ring->desc[idx].options = flags;
+
+ return 0;
+}
+
+static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
+{
+ smp_store_release(&q->ring->producer, idx); /* B, matches C */
+}
+
+static inline void xskq_prod_submit(struct xsk_queue *q)
+{
+ __xskq_prod_submit(q, q->cached_prod);
+}
+
+static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
+{
+ __xskq_prod_submit(q, q->ring->producer + nb_entries);
+}
+
+static inline bool xskq_prod_is_empty(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer);
+}
+
+/* For both producers and consumers */
+
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+ return q ? q->invalid_descs : 0;
+}
+
+static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q)
+{
+ return q ? q->queue_empty_descs : 0;
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
+void xskq_destroy(struct xsk_queue *q_ops);
+
+#endif /* _LINUX_XSK_QUEUE_H */
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
new file mode 100644
index 0000000000..e1c526f97c
--- /dev/null
+++ b/net/xdp/xskmap.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/btf_ids.h>
+
+#include "xsk.h"
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+ struct xdp_sock __rcu **map_entry)
+{
+ struct xsk_map_node *node;
+
+ node = bpf_map_kzalloc(&map->map, sizeof(*node),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_inc(&map->map);
+ atomic_inc(&map->count);
+
+ node->map = map;
+ node->map_entry = map_entry;
+ return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+ struct xsk_map *map = node->map;
+
+ bpf_map_put(&node->map->map);
+ kfree(node);
+ atomic_dec(&map->count);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+ spin_lock_bh(&xs->map_list_lock);
+ list_add_tail(&node->node, &xs->map_list);
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+ struct xdp_sock __rcu **map_entry)
+{
+ struct xsk_map_node *n, *tmp;
+
+ spin_lock_bh(&xs->map_list_lock);
+ list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+ if (map_entry == n->map_entry) {
+ list_del(&n->node);
+ xsk_map_node_free(n);
+ }
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+ struct xsk_map *m;
+ int numa_node;
+ u64 size;
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+ return ERR_PTR(-EINVAL);
+
+ numa_node = bpf_map_attr_numa_node(attr);
+ size = struct_size(m, xsk_map, attr->max_entries);
+
+ m = bpf_map_area_alloc(size, numa_node);
+ if (!m)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&m->map, attr);
+ spin_lock_init(&m->lock);
+
+ return &m->map;
+}
+
+static u64 xsk_map_mem_usage(const struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+ return struct_size(m, xsk_map, map->max_entries) +
+ (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+ synchronize_net();
+ bpf_map_area_free(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= m->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == m->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
+ struct bpf_insn *insn = insn_buf;
+
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
+}
+
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock __rcu **map_entry;
+ struct xdp_sock *xs, *old_xs;
+ u32 i = *(u32 *)key, fd = *(u32 *)value;
+ struct xsk_map_node *node;
+ struct socket *sock;
+ int err;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= m->map.max_entries))
+ return -E2BIG;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ xs = (struct xdp_sock *)sock->sk;
+
+ map_entry = &m->xsk_map[i];
+ node = xsk_map_node_alloc(m, map_entry);
+ if (IS_ERR(node)) {
+ sockfd_put(sock);
+ return PTR_ERR(node);
+ }
+
+ spin_lock_bh(&m->lock);
+ old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
+ if (old_xs == xs) {
+ err = 0;
+ goto out;
+ } else if (old_xs && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out;
+ } else if (!old_xs && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out;
+ }
+ xsk_map_sock_add(xs, node);
+ rcu_assign_pointer(*map_entry, xs);
+ if (old_xs)
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ return 0;
+
+out:
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ xsk_map_node_free(node);
+ return err;
+}
+
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock __rcu **map_entry;
+ struct xdp_sock *old_xs;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ spin_lock_bh(&m->lock);
+ map_entry = &m->xsk_map[k];
+ old_xs = unrcu_pointer(xchg(map_entry, NULL));
+ if (old_xs)
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
+
+ return 0;
+}
+
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
+ __xsk_map_lookup_elem);
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock __rcu **map_entry)
+{
+ spin_lock_bh(&map->lock);
+ if (rcu_access_pointer(*map_entry) == xs) {
+ rcu_assign_pointer(*map_entry, NULL);
+ xsk_map_sock_delete(xs, map_entry);
+ }
+ spin_unlock_bh(&map->lock);
+}
+
+static bool xsk_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ return meta0->max_entries == meta1->max_entries &&
+ bpf_map_meta_equal(meta0, meta1);
+}
+
+BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map)
+const struct bpf_map_ops xsk_map_ops = {
+ .map_meta_equal = xsk_map_meta_equal,
+ .map_alloc = xsk_map_alloc,
+ .map_free = xsk_map_free,
+ .map_get_next_key = xsk_map_get_next_key,
+ .map_lookup_elem = xsk_map_lookup_elem,
+ .map_gen_lookup = xsk_map_gen_lookup,
+ .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
+ .map_update_elem = xsk_map_update_elem,
+ .map_delete_elem = xsk_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = xsk_map_mem_usage,
+ .map_btf_id = &xsk_map_btf_ids[0],
+ .map_redirect = xsk_map_redirect,
+};