Adding upstream version 6.6.15.upstream/6.6.15

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
commit: ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree: b2d64bc10158fdd5497876388cd68142ca374ed3 /net/xdp
parent: Initial commit. (diff)
download: linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
11 files changed, 3783 insertions, 0 deletions
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 0000000000..71af2febe7
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XDP_SOCKETS
+	bool "XDP sockets"
+	depends on BPF_SYSCALL
+	default n
+	help
+	  XDP sockets allows a channel between XDP programs and
+	  userspace applications.
+
+config XDP_SOCKETS_DIAG
+	tristate "XDP sockets: monitoring interface"
+	depends on XDP_SOCKETS
+	default n
+	help
+	  Support for PF_XDP sockets monitoring interface used by the ss tool.
+	  If unsure, say Y.
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 0000000000..30cdc4315f
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
+obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 0000000000..06cead2b8e
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h>
+
+#include "xdp_umem.h"
+#include "xsk_queue.h"
+
+static DEFINE_IDA(umem_ida);
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
+
+	kvfree(umem->pgs);
+	umem->pgs = NULL;
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+	if (umem->user) {
+		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+		free_uid(umem->user);
+	}
+}
+
+static void xdp_umem_addr_unmap(struct xdp_umem *umem)
+{
+	vunmap(umem->addrs);
+	umem->addrs = NULL;
+}
+
+static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
+			     u32 nr_pages)
+{
+	umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!umem->addrs)
+		return -ENOMEM;
+	return 0;
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+	umem->zc = false;
+	ida_free(&umem_ida, umem->id);
+
+	xdp_umem_addr_unmap(umem);
+	xdp_umem_unpin_pages(umem);
+
+	xdp_umem_unaccount_pages(umem);
+	kfree(umem);
+}
+
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
+
+	xdp_umem_release(umem);
+}
+
+void xdp_get_umem(struct xdp_umem *umem)
+{
+	refcount_inc(&umem->users);
+}
+
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
+{
+	if (!umem)
+		return;
+
+	if (refcount_dec_and_test(&umem->users)) {
+		if (defer_cleanup) {
+			INIT_WORK(&umem->work, xdp_umem_release_deferred);
+			schedule_work(&umem->work);
+		} else {
+			xdp_umem_release(umem);
+		}
+	}
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	mmap_read_lock(current->mm);
+	npgs = pin_user_pages(address, umem->npgs,
+			      gup_flags | FOLL_LONGTERM, &umem->pgs[0]);
+	mmap_read_unlock(current->mm);
+
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			umem->npgs = npgs;
+			err = -ENOMEM;
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	xdp_umem_unpin_pages(umem);
+out_pgs:
+	kvfree(umem->pgs);
+	umem->pgs = NULL;
+	return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+	unsigned long lock_limit, new_npgs, old_npgs;
+
+	if (capable(CAP_IPC_LOCK))
+		return 0;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	umem->user = get_uid(current_user());
+
+	do {
+		old_npgs = atomic_long_read(&umem->user->locked_vm);
+		new_npgs = old_npgs + umem->npgs;
+		if (new_npgs > lock_limit) {
+			free_uid(umem->user);
+			umem->user = NULL;
+			return -ENOBUFS;
+		}
+	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+				     new_npgs) != old_npgs);
+	return 0;
+}
+
+static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	u64 addr = mr->addr, size = mr->len;
+	u32 chunks_rem, npgs_rem;
+	u64 chunks, npgs;
+	int err;
+
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or*
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return -EINVAL;
+	}
+
+	if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+		return -EINVAL;
+
+	if (!unaligned_chunks && !is_power_of_2(chunk_size))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return -EINVAL;
+	}
+
+	if ((addr + size) < addr)
+		return -EINVAL;
+
+	npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem);
+	if (npgs_rem)
+		npgs++;
+	if (npgs > U32_MAX)
+		return -EINVAL;
+
+	chunks = div_u64_rem(size, chunk_size, &chunks_rem);
+	if (!chunks || chunks > U32_MAX)
+		return -EINVAL;
+
+	if (!unaligned_chunks && chunks_rem)
+		return -EINVAL;
+
+	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
+		return -EINVAL;
+
+	umem->size = size;
+	umem->headroom = headroom;
+	umem->chunk_size = chunk_size;
+	umem->chunks = chunks;
+	umem->npgs = npgs;
+	umem->pgs = NULL;
+	umem->user = NULL;
+	umem->flags = mr->flags;
+
+	INIT_LIST_HEAD(&umem->xsk_dma_list);
+	refcount_set(&umem->users, 1);
+
+	err = xdp_umem_account_pages(umem);
+	if (err)
+		return err;
+
+	err = xdp_umem_pin_pages(umem, (unsigned long)addr);
+	if (err)
+		goto out_account;
+
+	err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs);
+	if (err)
+		goto out_unpin;
+
+	return 0;
+
+out_unpin:
+	xdp_umem_unpin_pages(umem);
+out_account:
+	xdp_umem_unaccount_pages(umem);
+	return err;
+}
+
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
+{
+	struct xdp_umem *umem;
+	int err;
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	err = ida_alloc(&umem_ida, GFP_KERNEL);
+	if (err < 0) {
+		kfree(umem);
+		return ERR_PTR(err);
+	}
+	umem->id = err;
+
+	err = xdp_umem_reg(umem, mr);
+	if (err) {
+		ida_free(&umem_ida, umem->id);
+		kfree(umem);
+		return ERR_PTR(err);
+	}
+
+	return umem;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 0000000000..aa9fe27804
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <net/xdp_sock_drv.h>
+
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup);
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 0000000000..d849dc04a3
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,1722 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <net/xdp_sock_drv.h>
+#include <net/busy_poll.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
+
+#define TX_BATCH_SIZE 32
+
+static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
+
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
+{
+	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
+		return;
+
+	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
+{
+	struct xdp_sock *xs;
+
+	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+	}
+	rcu_read_unlock();
+
+	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
+{
+	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
+		return;
+
+	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
+{
+	struct xdp_sock *xs;
+
+	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+	}
+	rcu_read_unlock();
+
+	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
+{
+	return pool->uses_need_wakeup;
+}
+EXPORT_SYMBOL(xsk_uses_need_wakeup);
+
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+					    u16 queue_id)
+{
+	if (queue_id < dev->real_num_rx_queues)
+		return dev->_rx[queue_id].pool;
+	if (queue_id < dev->real_num_tx_queues)
+		return dev->_tx[queue_id].pool;
+
+	return NULL;
+}
+EXPORT_SYMBOL(xsk_get_pool_from_qid);
+
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+{
+	if (queue_id < dev->num_rx_queues)
+		dev->_rx[queue_id].pool = NULL;
+	if (queue_id < dev->num_tx_queues)
+		dev->_tx[queue_id].pool = NULL;
+}
+
+/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+			u16 queue_id)
+{
+	if (queue_id >= max_t(unsigned int,
+			      dev->real_num_rx_queues,
+			      dev->real_num_tx_queues))
+		return -EINVAL;
+
+	if (queue_id < dev->real_num_rx_queues)
+		dev->_rx[queue_id].pool = pool;
+	if (queue_id < dev->real_num_tx_queues)
+		dev->_tx[queue_id].pool = pool;
+
+	return 0;
+}
+
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
+			u32 flags)
+{
+	u64 addr;
+	int err;
+
+	addr = xp_get_handle(xskb);
+	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
+	if (err) {
+		xs->rx_queue_full++;
+		return err;
+	}
+
+	xp_release(xskb);
+	return 0;
+}
+
+static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+	u32 frags = xdp_buff_has_frags(xdp);
+	struct xdp_buff_xsk *pos, *tmp;
+	struct list_head *xskb_list;
+	u32 contd = 0;
+	int err;
+
+	if (frags)
+		contd = XDP_PKT_CONTD;
+
+	err = __xsk_rcv_zc(xs, xskb, len, contd);
+	if (err)
+		goto err;
+	if (likely(!frags))
+		return 0;
+
+	xskb_list = &xskb->pool->xskb_list;
+	list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+		if (list_is_singular(xskb_list))
+			contd = 0;
+		len = pos->xdp.data_end - pos->xdp.data;
+		err = __xsk_rcv_zc(xs, pos, len, contd);
+		if (err)
+			goto err;
+		list_del(&pos->xskb_list_node);
+	}
+
+	return 0;
+err:
+	xsk_buff_free(xdp);
+	return err;
+}
+
+static void *xsk_copy_xdp_start(struct xdp_buff *from)
+{
+	if (unlikely(xdp_data_meta_unsupported(from)))
+		return from->data;
+	else
+		return from->data_meta;
+}
+
+static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
+			u32 *from_len, skb_frag_t **frag, u32 rem)
+{
+	u32 copied = 0;
+
+	while (1) {
+		u32 copy_len = min_t(u32, *from_len, to_len);
+
+		memcpy(to, *from, copy_len);
+		copied += copy_len;
+		if (rem == copied)
+			return copied;
+
+		if (*from_len == copy_len) {
+			*from = skb_frag_address(*frag);
+			*from_len = skb_frag_size((*frag)++);
+		} else {
+			*from += copy_len;
+			*from_len -= copy_len;
+		}
+		if (to_len == copy_len)
+			return copied;
+
+		to_len -= copy_len;
+		to += copy_len;
+	}
+}
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
+	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
+	u32 from_len, meta_len, rem, num_desc;
+	struct xdp_buff_xsk *xskb;
+	struct xdp_buff *xsk_xdp;
+	skb_frag_t *frag;
+
+	from_len = xdp->data_end - copy_from;
+	meta_len = xdp->data - copy_from;
+	rem = len + meta_len;
+
+	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
+		int err;
+
+		xsk_xdp = xsk_buff_alloc(xs->pool);
+		if (!xsk_xdp) {
+			xs->rx_dropped++;
+			return -ENOMEM;
+		}
+		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
+		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+		err = __xsk_rcv_zc(xs, xskb, len, 0);
+		if (err) {
+			xsk_buff_free(xsk_xdp);
+			return err;
+		}
+
+		return 0;
+	}
+
+	num_desc = (len - 1) / frame_size + 1;
+
+	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
+		xs->rx_dropped++;
+		return -ENOMEM;
+	}
+	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
+		xs->rx_queue_full++;
+		return -ENOBUFS;
+	}
+
+	if (xdp_buff_has_frags(xdp)) {
+		struct skb_shared_info *sinfo;
+
+		sinfo = xdp_get_shared_info_from_buff(xdp);
+		frag =  &sinfo->frags[0];
+	}
+
+	do {
+		u32 to_len = frame_size + meta_len;
+		u32 copied;
+
+		xsk_xdp = xsk_buff_alloc(xs->pool);
+		copy_to = xsk_xdp->data - meta_len;
+
+		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
+		rem -= copied;
+
+		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
+		meta_len = 0;
+	} while (rem);
+
+	return 0;
+}
+
+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+		return false;
+
+	return true;
+}
+
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+	if (READ_ONCE(xs->state) == XSK_BOUND) {
+		/* Matches smp_wmb() in bind(). */
+		smp_rmb();
+		return true;
+	}
+	return false;
+}
+
+static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	if (!xsk_is_bound(xs))
+		return -ENXIO;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+		xs->rx_dropped++;
+		return -ENOSPC;
+	}
+
+	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
+	return 0;
+}
+
+static void xsk_flush(struct xdp_sock *xs)
+{
+	xskq_prod_submit(xs->rx);
+	__xskq_cons_release(xs->pool->fq);
+	sock_def_readable(&xs->sk);
+}
+
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len = xdp_get_buff_len(xdp);
+	int err;
+
+	spin_lock_bh(&xs->rx_lock);
+	err = xsk_rcv_check(xs, xdp, len);
+	if (!err) {
+		err = __xsk_rcv(xs, xdp, len);
+		xsk_flush(xs);
+	}
+	spin_unlock_bh(&xs->rx_lock);
+	return err;
+}
+
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len = xdp_get_buff_len(xdp);
+	int err;
+
+	err = xsk_rcv_check(xs, xdp, len);
+	if (err)
+		return err;
+
+	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
+		len = xdp->data_end - xdp->data;
+		return xsk_rcv_zc(xs, xdp, len);
+	}
+
+	err = __xsk_rcv(xs, xdp, len);
+	if (!err)
+		xdp_return_buff(xdp);
+	return err;
+}
+
+int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+
+	return 0;
+}
+
+void __xsk_map_flush(void)
+{
+	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+	struct xdp_sock *xs, *tmp;
+
+	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+		xsk_flush(xs);
+		__list_del_clearprev(&xs->flush_node);
+	}
+}
+
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
+{
+	xskq_prod_submit_n(pool->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_tx_completed);
+
+void xsk_tx_release(struct xsk_buff_pool *pool)
+{
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		__xskq_cons_release(xs->tx);
+		if (xsk_tx_writeable(xs))
+			xs->sk.sk_write_space(&xs->sk);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_tx_release);
+
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
+{
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
+			if (xskq_has_descs(xs->tx))
+				xskq_cons_release(xs->tx);
+			continue;
+		}
+
+		/* This is the backpressure mechanism for the Tx path.
+		 * Reserve space in the completion queue and only proceed
+		 * if there is space in it. This avoids having to implement
+		 * any buffering in the Tx path.
+		 */
+		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
+			goto out;
+
+		xskq_cons_release(xs->tx);
+		rcu_read_unlock();
+		return true;
+	}
+
+out:
+	rcu_read_unlock();
+	return false;
+}
+EXPORT_SYMBOL(xsk_tx_peek_desc);
+
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
+{
+	struct xdp_desc *descs = pool->tx_descs;
+	u32 nb_pkts = 0;
+
+	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
+		nb_pkts++;
+
+	xsk_tx_release(pool);
+	return nb_pkts;
+}
+
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
+{
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	if (!list_is_singular(&pool->xsk_tx_list)) {
+		/* Fallback to the non-batched version */
+		rcu_read_unlock();
+		return xsk_tx_peek_release_fallback(pool, nb_pkts);
+	}
+
+	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
+	if (!xs) {
+		nb_pkts = 0;
+		goto out;
+	}
+
+	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
+
+	/* This is the backpressure mechanism for the Tx path. Try to
+	 * reserve space in the completion queue for all packets, but
+	 * if there are fewer slots available, just process that many
+	 * packets. This avoids having to implement any buffering in
+	 * the Tx path.
+	 */
+	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
+	if (!nb_pkts)
+		goto out;
+
+	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
+	if (!nb_pkts) {
+		xs->tx->queue_empty_descs++;
+		goto out;
+	}
+
+	__xskq_cons_release(xs->tx);
+	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+	xs->sk.sk_write_space(&xs->sk);
+
+out:
+	rcu_read_unlock();
+	return nb_pkts;
+}
+EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
+
+static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
+{
+	struct net_device *dev = xs->dev;
+
+	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
+}
+
+static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&xs->pool->cq_lock, flags);
+	ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
+	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+
+	return ret;
+}
+
+static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&xs->pool->cq_lock, flags);
+	xskq_prod_submit_n(xs->pool->cq, n);
+	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&xs->pool->cq_lock, flags);
+	xskq_prod_cancel_n(xs->pool->cq, n);
+	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+}
+
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
+	sock_wfree(skb);
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb)
+{
+	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+
+	skb_shinfo(skb)->destructor_arg = (void *)num;
+}
+
+static void xsk_consume_skb(struct sk_buff *skb)
+{
+	struct xdp_sock *xs = xdp_sk(skb->sk);
+
+	skb->destructor = sock_wfree;
+	xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
+	/* Free skb without triggering the perf drop trace */
+	consume_skb(skb);
+	xs->skb = NULL;
+}
+
+static void xsk_drop_skb(struct sk_buff *skb)
+{
+	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
+	xsk_consume_skb(skb);
+}
+
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+					      struct xdp_desc *desc)
+{
+	struct xsk_buff_pool *pool = xs->pool;
+	u32 hr, len, ts, offset, copy, copied;
+	struct sk_buff *skb = xs->skb;
+	struct page *page;
+	void *buffer;
+	int err, i;
+	u64 addr;
+
+	if (!skb) {
+		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+		if (unlikely(!skb))
+			return ERR_PTR(err);
+
+		skb_reserve(skb, hr);
+	}
+
+	addr = desc->addr;
+	len = desc->len;
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	buffer = xsk_buff_raw_get_data(pool, addr);
+	offset = offset_in_page(buffer);
+	addr = buffer - pool->addrs;
+
+	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
+		if (unlikely(i >= MAX_SKB_FRAGS))
+			return ERR_PTR(-EOVERFLOW);
+
+		page = pool->umem->pgs[addr >> PAGE_SHIFT];
+		get_page(page);
+
+		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+		skb_fill_page_desc(skb, i, page, offset, copy);
+
+		copied += copy;
+		addr += copy;
+		offset = 0;
+	}
+
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += ts;
+
+	refcount_add(ts, &xs->sk.sk_wmem_alloc);
+
+	return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+				     struct xdp_desc *desc)
+{
+	struct net_device *dev = xs->dev;
+	struct sk_buff *skb = xs->skb;
+	int err;
+
+	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+		skb = xsk_build_skb_zerocopy(xs, desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			goto free_err;
+		}
+	} else {
+		u32 hr, tr, len;
+		void *buffer;
+
+		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+		len = desc->len;
+
+		if (!skb) {
+			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+			tr = dev->needed_tailroom;
+			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			if (unlikely(!skb))
+				goto free_err;
+
+			skb_reserve(skb, hr);
+			skb_put(skb, len);
+
+			err = skb_store_bits(skb, 0, buffer, len);
+			if (unlikely(err)) {
+				kfree_skb(skb);
+				goto free_err;
+			}
+		} else {
+			int nr_frags = skb_shinfo(skb)->nr_frags;
+			struct page *page;
+			u8 *vaddr;
+
+			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
+				err = -EOVERFLOW;
+				goto free_err;
+			}
+
+			page = alloc_page(xs->sk.sk_allocation);
+			if (unlikely(!page)) {
+				err = -EAGAIN;
+				goto free_err;
+			}
+
+			vaddr = kmap_local_page(page);
+			memcpy(vaddr, buffer, len);
+			kunmap_local(vaddr);
+
+			skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
+		}
+	}
+
+	skb->dev = dev;
+	skb->priority = xs->sk.sk_priority;
+	skb->mark = READ_ONCE(xs->sk.sk_mark);
+	skb->destructor = xsk_destruct_skb;
+	xsk_set_destructor_arg(skb);
+
+	return skb;
+
+free_err:
+	if (err == -EOVERFLOW) {
+		/* Drop the packet */
+		xsk_set_destructor_arg(xs->skb);
+		xsk_drop_skb(xs->skb);
+		xskq_cons_release(xs->tx);
+	} else {
+		/* Let application retry */
+		xsk_cq_cancel_locked(xs, 1);
+	}
+
+	return ERR_PTR(err);
+}
+
+static int __xsk_generic_xmit(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+	u32 max_batch = TX_BATCH_SIZE;
+	bool sent_frame = false;
+	struct xdp_desc desc;
+	struct sk_buff *skb;
+	int err = 0;
+
+	mutex_lock(&xs->mutex);
+
+	/* Since we dropped the RCU read lock, the socket state might have changed. */
+	if (unlikely(!xsk_is_bound(xs))) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	if (xs->queue_id >= xs->dev->real_num_tx_queues)
+		goto out;
+
+	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
+		if (max_batch-- == 0) {
+			err = -EAGAIN;
+			goto out;
+		}
+
+		/* This is the backpressure mechanism for the Tx path.
+		 * Reserve space in the completion queue and only proceed
+		 * if there is space in it. This avoids having to implement
+		 * any buffering in the Tx path.
+		 */
+		if (xsk_cq_reserve_addr_locked(xs, desc.addr))
+			goto out;
+
+		skb = xsk_build_skb(xs, &desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			if (err != -EOVERFLOW)
+				goto out;
+			err = 0;
+			continue;
+		}
+
+		xskq_cons_release(xs->tx);
+
+		if (xp_mb_desc(&desc)) {
+			xs->skb = skb;
+			continue;
+		}
+
+		err = __dev_direct_xmit(skb, xs->queue_id);
+		if  (err == NETDEV_TX_BUSY) {
+			/* Tell user-space to retry the send */
+			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
+			xsk_consume_skb(skb);
+			err = -EAGAIN;
+			goto out;
+		}
+
+		/* Ignore NET_XMIT_CN as packet might have been sent */
+		if (err == NET_XMIT_DROP) {
+			/* SKB completed but not sent */
+			err = -EBUSY;
+			xs->skb = NULL;
+			goto out;
+		}
+
+		sent_frame = true;
+		xs->skb = NULL;
+	}
+
+	if (xskq_has_descs(xs->tx)) {
+		if (xs->skb)
+			xsk_drop_skb(xs->skb);
+		xskq_cons_release(xs->tx);
+	}
+
+out:
+	if (sent_frame)
+		if (xsk_tx_writeable(xs))
+			sk->sk_write_space(sk);
+
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
+static int xsk_generic_xmit(struct sock *sk)
+{
+	int ret;
+
+	/* Drop the RCU lock since the SKB path might sleep. */
+	rcu_read_unlock();
+	ret = __xsk_generic_xmit(sk);
+	/* Reaquire RCU lock before going into common code. */
+	rcu_read_lock();
+
+	return ret;
+}
+
+static bool xsk_no_wakeup(struct sock *sk)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* Prefer busy-polling, skip the wakeup. */
+	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
+		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
+#else
+	return false;
+#endif
+}
+
+static int xsk_check_common(struct xdp_sock *xs)
+{
+	if (unlikely(!xsk_is_bound(xs)))
+		return -ENXIO;
+	if (unlikely(!(xs->dev->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	return 0;
+}
+
+static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xsk_buff_pool *pool;
+	int err;
+
+	err = xsk_check_common(xs);
+	if (err)
+		return err;
+	if (unlikely(need_wait))
+		return -EOPNOTSUPP;
+	if (unlikely(!xs->tx))
+		return -ENOBUFS;
+
+	if (sk_can_busy_loop(sk)) {
+		if (xs->zc)
+			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
+		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+	}
+
+	if (xs->zc && xsk_no_wakeup(sk))
+		return 0;
+
+	pool = xs->pool;
+	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
+		if (xs->zc)
+			return xsk_wakeup(xs, XDP_WAKEUP_TX);
+		return xsk_generic_xmit(sk);
+	}
+	return 0;
+}
+
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __xsk_sendmsg(sock, m, total_len);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+	bool need_wait = !(flags & MSG_DONTWAIT);
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	err = xsk_check_common(xs);
+	if (err)
+		return err;
+	if (unlikely(!xs->rx))
+		return -ENOBUFS;
+	if (unlikely(need_wait))
+		return -EOPNOTSUPP;
+
+	if (sk_can_busy_loop(sk))
+		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+	if (xsk_no_wakeup(sk))
+		return 0;
+
+	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
+		return xsk_wakeup(xs, XDP_WAKEUP_RX);
+	return 0;
+}
+
+static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __xsk_recvmsg(sock, m, len, flags);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static __poll_t xsk_poll(struct file *file, struct socket *sock,
+			     struct poll_table_struct *wait)
+{
+	__poll_t mask = 0;
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xsk_buff_pool *pool;
+
+	sock_poll_wait(file, sock, wait);
+
+	rcu_read_lock();
+	if (xsk_check_common(xs))
+		goto out;
+
+	pool = xs->pool;
+
+	if (pool->cached_need_wakeup) {
+		if (xs->zc)
+			xsk_wakeup(xs, pool->cached_need_wakeup);
+		else if (xs->tx)
+			/* Poll needs to drive Tx also in copy mode */
+			xsk_generic_xmit(sk);
+	}
+
+	if (xs->rx && !xskq_prod_is_empty(xs->rx))
+		mask |= EPOLLIN | EPOLLRDNORM;
+	if (xs->tx && xsk_tx_writeable(xs))
+		mask |= EPOLLOUT | EPOLLWRNORM;
+out:
+	rcu_read_unlock();
+	return mask;
+}
+
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+			  bool umem_queue)
+{
+	struct xsk_queue *q;
+
+	if (entries == 0 || *queue || !is_power_of_2(entries))
+		return -EINVAL;
+
+	q = xskq_create(entries, umem_queue);
+	if (!q)
+		return -ENOMEM;
+
+	/* Make sure queue is ready before it can be seen by others */
+	smp_wmb();
+	WRITE_ONCE(*queue, q);
+	return 0;
+}
+
+static void xsk_unbind_dev(struct xdp_sock *xs)
+{
+	struct net_device *dev = xs->dev;
+
+	if (xs->state != XSK_BOUND)
+		return;
+	WRITE_ONCE(xs->state, XSK_UNBOUND);
+
+	/* Wait for driver to stop using the xdp socket. */
+	xp_del_xsk(xs->pool, xs);
+	synchronize_net();
+	dev_put(dev);
+}
+
+static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
+					      struct xdp_sock __rcu ***map_entry)
+{
+	struct xsk_map *map = NULL;
+	struct xsk_map_node *node;
+
+	*map_entry = NULL;
+
+	spin_lock_bh(&xs->map_list_lock);
+	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
+					node);
+	if (node) {
+		bpf_map_inc(&node->map->map);
+		map = node->map;
+		*map_entry = node->map_entry;
+	}
+	spin_unlock_bh(&xs->map_list_lock);
+	return map;
+}
+
+static void xsk_delete_from_maps(struct xdp_sock *xs)
+{
+	/* This function removes the current XDP socket from all the
+	 * maps it resides in. We need to take extra care here, due to
+	 * the two locks involved. Each map has a lock synchronizing
+	 * updates to the entries, and each socket has a lock that
+	 * synchronizes access to the list of maps (map_list). For
+	 * deadlock avoidance the locks need to be taken in the order
+	 * "map lock"->"socket map list lock". We start off by
+	 * accessing the socket map list, and take a reference to the
+	 * map to guarantee existence between the
+	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
+	 * calls. Then we ask the map to remove the socket, which
+	 * tries to remove the socket from the map. Note that there
+	 * might be updates to the map between
+	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
+	 */
+	struct xdp_sock __rcu **map_entry = NULL;
+	struct xsk_map *map;
+
+	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
+		xsk_map_try_sock_delete(map, xs, map_entry);
+		bpf_map_put(&map->map);
+	}
+}
+
+static int xsk_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net *net;
+
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+
+	if (xs->skb)
+		xsk_drop_skb(xs->skb);
+
+	mutex_lock(&net->xdp.lock);
+	sk_del_node_init_rcu(sk);
+	mutex_unlock(&net->xdp.lock);
+
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+
+	xsk_delete_from_maps(xs);
+	mutex_lock(&xs->mutex);
+	xsk_unbind_dev(xs);
+	mutex_unlock(&xs->mutex);
+
+	xskq_destroy(xs->rx);
+	xskq_destroy(xs->tx);
+	xskq_destroy(xs->fq_tmp);
+	xskq_destroy(xs->cq_tmp);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	sock_put(sk);
+
+	return 0;
+}
+
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+	struct socket *sock;
+	int err;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return ERR_PTR(-ENOTSOCK);
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return ERR_PTR(-ENOPROTOOPT);
+	}
+
+	return sock;
+}
+
+static bool xsk_validate_queues(struct xdp_sock *xs)
+{
+	return xs->fq_tmp && xs->cq_tmp;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net_device *dev;
+	int bound_dev_if;
+	u32 flags, qid;
+	int err = 0;
+
+	if (addr_len < sizeof(struct sockaddr_xdp))
+		return -EINVAL;
+	if (sxdp->sxdp_family != AF_XDP)
+		return -EINVAL;
+
+	flags = sxdp->sxdp_flags;
+	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
+		return -EINVAL;
+
+	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
+		return -EINVAL;
+
+	rtnl_lock();
+	mutex_lock(&xs->mutex);
+	if (xs->state != XSK_READY) {
+		err = -EBUSY;
+		goto out_release;
+	}
+
+	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+	if (!dev) {
+		err = -ENODEV;
+		goto out_release;
+	}
+
+	if (!xs->rx && !xs->tx) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	qid = sxdp->sxdp_queue_id;
+
+	if (flags & XDP_SHARED_UMEM) {
+		struct xdp_sock *umem_xs;
+		struct socket *sock;
+
+		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
+			/* Cannot specify flags for shared sockets. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		if (xs->umem) {
+			/* We have already our own. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+		if (IS_ERR(sock)) {
+			err = PTR_ERR(sock);
+			goto out_unlock;
+		}
+
+		umem_xs = xdp_sk(sock->sk);
+		if (!xsk_is_bound(umem_xs)) {
+			err = -EBADF;
+			sockfd_put(sock);
+			goto out_unlock;
+		}
+
+		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+			/* Share the umem with another socket on another qid
+			 * and/or device.
+			 */
+			xs->pool = xp_create_and_assign_umem(xs,
+							     umem_xs->umem);
+			if (!xs->pool) {
+				err = -ENOMEM;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
+			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
+						   qid);
+			if (err) {
+				xp_destroy(xs->pool);
+				xs->pool = NULL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+		} else {
+			/* Share the buffer pool with the other socket. */
+			if (xs->fq_tmp || xs->cq_tmp) {
+				/* Do not allow setting your own fq or cq. */
+				err = -EINVAL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
+			xp_get_pool(umem_xs->pool);
+			xs->pool = umem_xs->pool;
+
+			/* If underlying shared umem was created without Tx
+			 * ring, allocate Tx descs array that Tx batching API
+			 * utilizes
+			 */
+			if (xs->tx && !xs->pool->tx_descs) {
+				err = xp_alloc_tx_descs(xs->pool, xs);
+				if (err) {
+					xp_put_pool(xs->pool);
+					xs->pool = NULL;
+					sockfd_put(sock);
+					goto out_unlock;
+				}
+			}
+		}
+
+		xdp_get_umem(umem_xs->umem);
+		WRITE_ONCE(xs->umem, umem_xs->umem);
+		sockfd_put(sock);
+	} else if (!xs->umem || !xsk_validate_queues(xs)) {
+		err = -EINVAL;
+		goto out_unlock;
+	} else {
+		/* This xsk has its own umem. */
+		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
+		if (!xs->pool) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+
+		err = xp_assign_dev(xs->pool, dev, qid, flags);
+		if (err) {
+			xp_destroy(xs->pool);
+			xs->pool = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
+	xs->fq_tmp = NULL;
+	xs->cq_tmp = NULL;
+
+	xs->dev = dev;
+	xs->zc = xs->umem->zc;
+	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
+	xs->queue_id = qid;
+	xp_add_xsk(xs->pool, xs);
+
+out_unlock:
+	if (err) {
+		dev_put(dev);
+	} else {
+		/* Matches smp_rmb() in bind() for shared umem
+		 * sockets, and xsk_is_bound().
+		 */
+		smp_wmb();
+		WRITE_ONCE(xs->state, XSK_BOUND);
+	}
+out_release:
+	mutex_unlock(&xs->mutex);
+	rtnl_unlock();
+	return err;
+}
+
+struct xdp_umem_reg_v1 {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 chunk_size;
+	__u32 headroom;
+};
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+			  sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_RX_RING:
+	case XDP_TX_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
+		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
+		err = xsk_init_queue(entries, q, false);
+		if (!err && optname == XDP_TX_RING)
+			/* Tx needs to be explicitly woken up the first time */
+			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
+	case XDP_UMEM_REG:
+	{
+		size_t mr_size = sizeof(struct xdp_umem_reg);
+		struct xdp_umem_reg mr = {};
+		struct xdp_umem *umem;
+
+		if (optlen < sizeof(struct xdp_umem_reg_v1))
+			return -EINVAL;
+		else if (optlen < sizeof(mr))
+			mr_size = sizeof(struct xdp_umem_reg_v1);
+
+		if (copy_from_sockptr(&mr, optval, mr_size))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY || xs->umem) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
+
+		umem = xdp_umem_create(&mr);
+		if (IS_ERR(umem)) {
+			mutex_unlock(&xs->mutex);
+			return PTR_ERR(umem);
+		}
+
+		/* Make sure umem is ready before it can be seen by others */
+		smp_wmb();
+		WRITE_ONCE(xs->umem, umem);
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
+	case XDP_UMEM_FILL_RING:
+	case XDP_UMEM_COMPLETION_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
+
+		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
+			&xs->cq_tmp;
+		err = xsk_init_queue(entries, q, true);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
+	default:
+		break;
+	}
+
+	return -ENOPROTOOPT;
+}
+
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+	ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
+struct xdp_statistics_v1 {
+	__u64 rx_dropped;
+	__u64 rx_invalid_descs;
+	__u64 tx_invalid_descs;
+};
+
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int len;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case XDP_STATISTICS:
+	{
+		struct xdp_statistics stats = {};
+		bool extra_stats = true;
+		size_t stats_size;
+
+		if (len < sizeof(struct xdp_statistics_v1)) {
+			return -EINVAL;
+		} else if (len < sizeof(stats)) {
+			extra_stats = false;
+			stats_size = sizeof(struct xdp_statistics_v1);
+		} else {
+			stats_size = sizeof(stats);
+		}
+
+		mutex_lock(&xs->mutex);
+		stats.rx_dropped = xs->rx_dropped;
+		if (extra_stats) {
+			stats.rx_ring_full = xs->rx_queue_full;
+			stats.rx_fill_ring_empty_descs =
+				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
+			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
+		} else {
+			stats.rx_dropped += xs->rx_queue_full;
+		}
+		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+		mutex_unlock(&xs->mutex);
+
+		if (copy_to_user(optval, &stats, stats_size))
+			return -EFAULT;
+		if (put_user(stats_size, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+	case XDP_MMAP_OFFSETS:
+	{
+		struct xdp_mmap_offsets off;
+		struct xdp_mmap_offsets_v1 off_v1;
+		bool flags_supported = true;
+		void *to_copy;
+
+		if (len < sizeof(off_v1))
+			return -EINVAL;
+		else if (len < sizeof(off))
+			flags_supported = false;
+
+		if (flags_supported) {
+			/* xdp_ring_offset is identical to xdp_ring_offset_v1
+			 * except for the flags field added to the end.
+			 */
+			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+					       &off.rx);
+			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+					       &off.tx);
+			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+					       &off.fr);
+			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+					       &off.cr);
+			off.rx.flags = offsetof(struct xdp_rxtx_ring,
+						ptrs.flags);
+			off.tx.flags = offsetof(struct xdp_rxtx_ring,
+						ptrs.flags);
+			off.fr.flags = offsetof(struct xdp_umem_ring,
+						ptrs.flags);
+			off.cr.flags = offsetof(struct xdp_umem_ring,
+						ptrs.flags);
+
+			len = sizeof(off);
+			to_copy = &off;
+		} else {
+			xsk_enter_rxtx_offsets(&off_v1.rx);
+			xsk_enter_rxtx_offsets(&off_v1.tx);
+			xsk_enter_umem_offsets(&off_v1.fr);
+			xsk_enter_umem_offsets(&off_v1.cr);
+
+			len = sizeof(off_v1);
+			to_copy = &off_v1;
+		}
+
+		if (copy_to_user(optval, to_copy, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+	case XDP_OPTIONS:
+	{
+		struct xdp_options opts = {};
+
+		if (len < sizeof(opts))
+			return -EINVAL;
+
+		mutex_lock(&xs->mutex);
+		if (xs->zc)
+			opts.flags |= XDP_OPTIONS_ZEROCOPY;
+		mutex_unlock(&xs->mutex);
+
+		len = sizeof(opts);
+		if (copy_to_user(optval, &opts, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	int state = READ_ONCE(xs->state);
+	struct xsk_queue *q = NULL;
+
+	if (state != XSK_READY && state != XSK_BOUND)
+		return -EBUSY;
+
+	if (offset == XDP_PGOFF_RX_RING) {
+		q = READ_ONCE(xs->rx);
+	} else if (offset == XDP_PGOFF_TX_RING) {
+		q = READ_ONCE(xs->tx);
+	} else {
+		/* Matches the smp_wmb() in XDP_UMEM_REG */
+		smp_rmb();
+		if (offset == XDP_UMEM_PGOFF_FILL_RING)
+			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
+						 READ_ONCE(xs->pool->fq);
+		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
+						 READ_ONCE(xs->pool->cq);
+	}
+
+	if (!q)
+		return -EINVAL;
+
+	/* Matches the smp_wmb() in xsk_init_queue */
+	smp_rmb();
+	if (size > q->ring_vmalloc_size)
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, q->ring, 0);
+}
+
+static int xsk_notifier(struct notifier_block *this,
+			unsigned long msg, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	struct sock *sk;
+
+	switch (msg) {
+	case NETDEV_UNREGISTER:
+		mutex_lock(&net->xdp.lock);
+		sk_for_each(sk, &net->xdp.list) {
+			struct xdp_sock *xs = xdp_sk(sk);
+
+			mutex_lock(&xs->mutex);
+			if (xs->dev == dev) {
+				sk->sk_err = ENETDOWN;
+				if (!sock_flag(sk, SOCK_DEAD))
+					sk_error_report(sk);
+
+				xsk_unbind_dev(xs);
+
+				/* Clear device references. */
+				xp_clear_dev(xs->pool);
+			}
+			mutex_unlock(&xs->mutex);
+		}
+		mutex_unlock(&net->xdp.lock);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct proto xsk_proto = {
+	.name =		"XDP",
+	.owner =	THIS_MODULE,
+	.obj_size =	sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+	.family		= PF_XDP,
+	.owner		= THIS_MODULE,
+	.release	= xsk_release,
+	.bind		= xsk_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= sock_no_getname,
+	.poll		= xsk_poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= xsk_setsockopt,
+	.getsockopt	= xsk_getsockopt,
+	.sendmsg	= xsk_sendmsg,
+	.recvmsg	= xsk_recvmsg,
+	.mmap		= xsk_mmap,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	if (!xp_put_pool(xs->pool))
+		xdp_put_umem(xs->umem, !xs->pool);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct xdp_sock *xs;
+	struct sock *sk;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (!sk)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+
+	sk->sk_destruct = xsk_destruct;
+
+	sock_set_flag(sk, SOCK_RCU_FREE);
+
+	xs = xdp_sk(sk);
+	xs->state = XSK_READY;
+	mutex_init(&xs->mutex);
+	spin_lock_init(&xs->rx_lock);
+
+	INIT_LIST_HEAD(&xs->map_list);
+	spin_lock_init(&xs->map_list_lock);
+
+	mutex_lock(&net->xdp.lock);
+	sk_add_node_rcu(sk, &net->xdp.list);
+	mutex_unlock(&net->xdp.lock);
+
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+
+	return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+	.family = PF_XDP,
+	.create = xsk_create,
+	.owner	= THIS_MODULE,
+};
+
+static struct notifier_block xsk_netdev_notifier = {
+	.notifier_call	= xsk_notifier,
+};
+
+static int __net_init xsk_net_init(struct net *net)
+{
+	mutex_init(&net->xdp.lock);
+	INIT_HLIST_HEAD(&net->xdp.list);
+	return 0;
+}
+
+static void __net_exit xsk_net_exit(struct net *net)
+{
+	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
+}
+
+static struct pernet_operations xsk_net_ops = {
+	.init = xsk_net_init,
+	.exit = xsk_net_exit,
+};
+
+static int __init xsk_init(void)
+{
+	int err, cpu;
+
+	err = proto_register(&xsk_proto, 0 /* no slab */);
+	if (err)
+		goto out;
+
+	err = sock_register(&xsk_family_ops);
+	if (err)
+		goto out_proto;
+
+	err = register_pernet_subsys(&xsk_net_ops);
+	if (err)
+		goto out_sk;
+
+	err = register_netdevice_notifier(&xsk_netdev_notifier);
+	if (err)
+		goto out_pernet;
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
+	return 0;
+
+out_pernet:
+	unregister_pernet_subsys(&xsk_net_ops);
+out_sk:
+	sock_unregister(PF_XDP);
+out_proto:
+	proto_unregister(&xsk_proto);
+out:
+	return err;
+}
+
+fs_initcall(xsk_init);
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
new file mode 100644
index 0000000000..a4bc4749fa
--- /dev/null
+++ b/net/xdp/xsk.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019 Intel Corporation. */
+
+#ifndef XSK_H_
+#define XSK_H_
+
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+struct xdp_ring_offset_v1 {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+	struct xdp_ring_offset_v1 rx;
+	struct xdp_ring_offset_v1 tx;
+	struct xdp_ring_offset_v1 fr;
+	struct xdp_ring_offset_v1 cr;
+};
+
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket reside in.
+ */
+
+struct xsk_map_node {
+	struct list_head node;
+	struct xsk_map *map;
+	struct xdp_sock __rcu **map_entry;
+};
+
+static inline struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return (struct xdp_sock *)sk;
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+			     struct xdp_sock __rcu **map_entry);
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+			u16 queue_id);
+
+#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
new file mode 100644
index 0000000000..b0a6116778
--- /dev/null
+++ b/net/xdp/xsk_buff_pool.c
@@ -0,0 +1,696 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <net/xsk_buff_pool.h>
+#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
+
+void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	if (!xs->tx)
+		return;
+
+	spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+	list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
+	spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+}
+
+void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	if (!xs->tx)
+		return;
+
+	spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+	list_del_rcu(&xs->tx_list);
+	spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+}
+
+void xp_destroy(struct xsk_buff_pool *pool)
+{
+	if (!pool)
+		return;
+
+	kvfree(pool->tx_descs);
+	kvfree(pool->heads);
+	kvfree(pool);
+}
+
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+	pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
+				  GFP_KERNEL);
+	if (!pool->tx_descs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
+						struct xdp_umem *umem)
+{
+	bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+	struct xsk_buff_pool *pool;
+	struct xdp_buff_xsk *xskb;
+	u32 i, entries;
+
+	entries = unaligned ? umem->chunks : 0;
+	pool = kvzalloc(struct_size(pool, free_heads, entries),	GFP_KERNEL);
+	if (!pool)
+		goto out;
+
+	pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
+	if (!pool->heads)
+		goto out;
+
+	if (xs->tx)
+		if (xp_alloc_tx_descs(pool, xs))
+			goto out;
+
+	pool->chunk_mask = ~((u64)umem->chunk_size - 1);
+	pool->addrs_cnt = umem->size;
+	pool->heads_cnt = umem->chunks;
+	pool->free_heads_cnt = umem->chunks;
+	pool->headroom = umem->headroom;
+	pool->chunk_size = umem->chunk_size;
+	pool->chunk_shift = ffs(umem->chunk_size) - 1;
+	pool->unaligned = unaligned;
+	pool->frame_len = umem->chunk_size - umem->headroom -
+		XDP_PACKET_HEADROOM;
+	pool->umem = umem;
+	pool->addrs = umem->addrs;
+	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->xskb_list);
+	INIT_LIST_HEAD(&pool->xsk_tx_list);
+	spin_lock_init(&pool->xsk_tx_list_lock);
+	spin_lock_init(&pool->cq_lock);
+	refcount_set(&pool->users, 1);
+
+	pool->fq = xs->fq_tmp;
+	pool->cq = xs->cq_tmp;
+
+	for (i = 0; i < pool->free_heads_cnt; i++) {
+		xskb = &pool->heads[i];
+		xskb->pool = pool;
+		xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
+		INIT_LIST_HEAD(&xskb->free_list_node);
+		INIT_LIST_HEAD(&xskb->xskb_list_node);
+		if (pool->unaligned)
+			pool->free_heads[i] = xskb;
+		else
+			xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
+	}
+
+	return pool;
+
+out:
+	xp_destroy(pool);
+	return NULL;
+}
+
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
+{
+	u32 i;
+
+	for (i = 0; i < pool->heads_cnt; i++)
+		pool->heads[i].xdp.rxq = rxq;
+}
+EXPORT_SYMBOL(xp_set_rxq_info);
+
+static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (pool->umem->zc) {
+		bpf.command = XDP_SETUP_XSK_POOL;
+		bpf.xsk.pool = NULL;
+		bpf.xsk.queue_id = pool->queue_id;
+
+		err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
+
+		if (err)
+			WARN(1, "Failed to disable zero-copy!\n");
+	}
+}
+
+#define NETDEV_XDP_ACT_ZC	(NETDEV_XDP_ACT_BASIC |		\
+				 NETDEV_XDP_ACT_REDIRECT |	\
+				 NETDEV_XDP_ACT_XSK_ZEROCOPY)
+
+int xp_assign_dev(struct xsk_buff_pool *pool,
+		  struct net_device *netdev, u16 queue_id, u16 flags)
+{
+	bool force_zc, force_copy;
+	struct netdev_bpf bpf;
+	int err = 0;
+
+	ASSERT_RTNL();
+
+	force_zc = flags & XDP_ZEROCOPY;
+	force_copy = flags & XDP_COPY;
+
+	if (force_zc && force_copy)
+		return -EINVAL;
+
+	if (xsk_get_pool_from_qid(netdev, queue_id))
+		return -EBUSY;
+
+	pool->netdev = netdev;
+	pool->queue_id = queue_id;
+	err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
+	if (err)
+		return err;
+
+	if (flags & XDP_USE_SG)
+		pool->umem->flags |= XDP_UMEM_SG_FLAG;
+
+	if (flags & XDP_USE_NEED_WAKEUP)
+		pool->uses_need_wakeup = true;
+	/* Tx needs to be explicitly woken up the first time.  Also
+	 * for supporting drivers that do not implement this
+	 * feature. They will always have to call sendto() or poll().
+	 */
+	pool->cached_need_wakeup = XDP_WAKEUP_TX;
+
+	dev_hold(netdev);
+
+	if (force_copy)
+		/* For copy-mode, we are done. */
+		return 0;
+
+	if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) {
+		err = -EOPNOTSUPP;
+		goto err_unreg_pool;
+	}
+
+	if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) {
+		err = -EOPNOTSUPP;
+		goto err_unreg_pool;
+	}
+
+	bpf.command = XDP_SETUP_XSK_POOL;
+	bpf.xsk.pool = pool;
+	bpf.xsk.queue_id = queue_id;
+
+	err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
+	if (err)
+		goto err_unreg_pool;
+
+	if (!pool->dma_pages) {
+		WARN(1, "Driver did not DMA map zero-copy buffers");
+		err = -EINVAL;
+		goto err_unreg_xsk;
+	}
+	pool->umem->zc = true;
+	return 0;
+
+err_unreg_xsk:
+	xp_disable_drv_zc(pool);
+err_unreg_pool:
+	if (!force_zc)
+		err = 0; /* fallback to copy mode */
+	if (err) {
+		xsk_clear_pool_at_qid(netdev, queue_id);
+		dev_put(netdev);
+	}
+	return err;
+}
+
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
+			 struct net_device *dev, u16 queue_id)
+{
+	u16 flags;
+	struct xdp_umem *umem = umem_xs->umem;
+
+	/* One fill and completion ring required for each queue id. */
+	if (!pool->fq || !pool->cq)
+		return -EINVAL;
+
+	flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
+	if (umem_xs->pool->uses_need_wakeup)
+		flags |= XDP_USE_NEED_WAKEUP;
+
+	return xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+void xp_clear_dev(struct xsk_buff_pool *pool)
+{
+	if (!pool->netdev)
+		return;
+
+	xp_disable_drv_zc(pool);
+	xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
+	dev_put(pool->netdev);
+	pool->netdev = NULL;
+}
+
+static void xp_release_deferred(struct work_struct *work)
+{
+	struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
+						  work);
+
+	rtnl_lock();
+	xp_clear_dev(pool);
+	rtnl_unlock();
+
+	if (pool->fq) {
+		xskq_destroy(pool->fq);
+		pool->fq = NULL;
+	}
+
+	if (pool->cq) {
+		xskq_destroy(pool->cq);
+		pool->cq = NULL;
+	}
+
+	xdp_put_umem(pool->umem, false);
+	xp_destroy(pool);
+}
+
+void xp_get_pool(struct xsk_buff_pool *pool)
+{
+	refcount_inc(&pool->users);
+}
+
+bool xp_put_pool(struct xsk_buff_pool *pool)
+{
+	if (!pool)
+		return false;
+
+	if (refcount_dec_and_test(&pool->users)) {
+		INIT_WORK(&pool->work, xp_release_deferred);
+		schedule_work(&pool->work);
+		return true;
+	}
+
+	return false;
+}
+
+static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
+{
+	struct xsk_dma_map *dma_map;
+
+	list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
+		if (dma_map->netdev == pool->netdev)
+			return dma_map;
+	}
+
+	return NULL;
+}
+
+static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev,
+					     u32 nr_pages, struct xdp_umem *umem)
+{
+	struct xsk_dma_map *dma_map;
+
+	dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+	if (!dma_map)
+		return NULL;
+
+	dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);
+	if (!dma_map->dma_pages) {
+		kfree(dma_map);
+		return NULL;
+	}
+
+	dma_map->netdev = netdev;
+	dma_map->dev = dev;
+	dma_map->dma_need_sync = false;
+	dma_map->dma_pages_cnt = nr_pages;
+	refcount_set(&dma_map->users, 1);
+	list_add(&dma_map->list, &umem->xsk_dma_list);
+	return dma_map;
+}
+
+static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
+{
+	list_del(&dma_map->list);
+	kvfree(dma_map->dma_pages);
+	kfree(dma_map);
+}
+
+static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
+{
+	dma_addr_t *dma;
+	u32 i;
+
+	for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+		dma = &dma_map->dma_pages[i];
+		if (*dma) {
+			*dma &= ~XSK_NEXT_PG_CONTIG_MASK;
+			dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE,
+					     DMA_BIDIRECTIONAL, attrs);
+			*dma = 0;
+		}
+	}
+
+	xp_destroy_dma_map(dma_map);
+}
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+	struct xsk_dma_map *dma_map;
+
+	if (!pool->dma_pages)
+		return;
+
+	dma_map = xp_find_dma_map(pool);
+	if (!dma_map) {
+		WARN(1, "Could not find dma_map for device");
+		return;
+	}
+
+	if (!refcount_dec_and_test(&dma_map->users))
+		return;
+
+	__xp_dma_unmap(dma_map, attrs);
+	kvfree(pool->dma_pages);
+	pool->dma_pages = NULL;
+	pool->dma_pages_cnt = 0;
+	pool->dev = NULL;
+}
+EXPORT_SYMBOL(xp_dma_unmap);
+
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
+{
+	u32 i;
+
+	for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
+		if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+			dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+		else
+			dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+	}
+}
+
+static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
+{
+	if (!pool->unaligned) {
+		u32 i;
+
+		for (i = 0; i < pool->heads_cnt; i++) {
+			struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+			xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
+		}
+	}
+
+	pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
+	if (!pool->dma_pages)
+		return -ENOMEM;
+
+	pool->dev = dma_map->dev;
+	pool->dma_pages_cnt = dma_map->dma_pages_cnt;
+	pool->dma_need_sync = dma_map->dma_need_sync;
+	memcpy(pool->dma_pages, dma_map->dma_pages,
+	       pool->dma_pages_cnt * sizeof(*pool->dma_pages));
+
+	return 0;
+}
+
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+	       unsigned long attrs, struct page **pages, u32 nr_pages)
+{
+	struct xsk_dma_map *dma_map;
+	dma_addr_t dma;
+	int err;
+	u32 i;
+
+	dma_map = xp_find_dma_map(pool);
+	if (dma_map) {
+		err = xp_init_dma_info(pool, dma_map);
+		if (err)
+			return err;
+
+		refcount_inc(&dma_map->users);
+		return 0;
+	}
+
+	dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem);
+	if (!dma_map)
+		return -ENOMEM;
+
+	for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+		dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+					 DMA_BIDIRECTIONAL, attrs);
+		if (dma_mapping_error(dev, dma)) {
+			__xp_dma_unmap(dma_map, attrs);
+			return -ENOMEM;
+		}
+		if (dma_need_sync(dev, dma))
+			dma_map->dma_need_sync = true;
+		dma_map->dma_pages[i] = dma;
+	}
+
+	if (pool->unaligned)
+		xp_check_dma_contiguity(dma_map);
+
+	err = xp_init_dma_info(pool, dma_map);
+	if (err) {
+		__xp_dma_unmap(dma_map, attrs);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xp_dma_map);
+
+static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+					  u64 addr)
+{
+	return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
+}
+
+static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+	*addr = xp_unaligned_extract_addr(*addr);
+	if (*addr >= pool->addrs_cnt ||
+	    *addr + pool->chunk_size > pool->addrs_cnt ||
+	    xp_addr_crosses_non_contig_pg(pool, *addr))
+		return false;
+	return true;
+}
+
+static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+	*addr = xp_aligned_extract_addr(pool, *addr);
+	return *addr < pool->addrs_cnt;
+}
+
+static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
+{
+	struct xdp_buff_xsk *xskb;
+	u64 addr;
+	bool ok;
+
+	if (pool->free_heads_cnt == 0)
+		return NULL;
+
+	for (;;) {
+		if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
+			pool->fq->queue_empty_descs++;
+			return NULL;
+		}
+
+		ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+		     xp_check_aligned(pool, &addr);
+		if (!ok) {
+			pool->fq->invalid_descs++;
+			xskq_cons_release(pool->fq);
+			continue;
+		}
+		break;
+	}
+
+	if (pool->unaligned) {
+		xskb = pool->free_heads[--pool->free_heads_cnt];
+		xp_init_xskb_addr(xskb, pool, addr);
+		if (pool->dma_pages)
+			xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+	} else {
+		xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+	}
+
+	xskq_cons_release(pool->fq);
+	return xskb;
+}
+
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+{
+	struct xdp_buff_xsk *xskb;
+
+	if (!pool->free_list_cnt) {
+		xskb = __xp_alloc(pool);
+		if (!xskb)
+			return NULL;
+	} else {
+		pool->free_list_cnt--;
+		xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
+					free_list_node);
+		list_del_init(&xskb->free_list_node);
+	}
+
+	xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+	xskb->xdp.data_meta = xskb->xdp.data;
+	xskb->xdp.flags = 0;
+
+	if (pool->dma_need_sync) {
+		dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+						 pool->frame_len,
+						 DMA_BIDIRECTIONAL);
+	}
+	return &xskb->xdp;
+}
+EXPORT_SYMBOL(xp_alloc);
+
+static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	u32 i, cached_cons, nb_entries;
+
+	if (max > pool->free_heads_cnt)
+		max = pool->free_heads_cnt;
+	max = xskq_cons_nb_entries(pool->fq, max);
+
+	cached_cons = pool->fq->cached_cons;
+	nb_entries = max;
+	i = max;
+	while (i--) {
+		struct xdp_buff_xsk *xskb;
+		u64 addr;
+		bool ok;
+
+		__xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr);
+
+		ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+			xp_check_aligned(pool, &addr);
+		if (unlikely(!ok)) {
+			pool->fq->invalid_descs++;
+			nb_entries--;
+			continue;
+		}
+
+		if (pool->unaligned) {
+			xskb = pool->free_heads[--pool->free_heads_cnt];
+			xp_init_xskb_addr(xskb, pool, addr);
+			if (pool->dma_pages)
+				xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+		} else {
+			xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+		}
+
+		*xdp = &xskb->xdp;
+		xdp++;
+	}
+
+	xskq_cons_release_n(pool->fq, max);
+	return nb_entries;
+}
+
+static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries)
+{
+	struct xdp_buff_xsk *xskb;
+	u32 i;
+
+	nb_entries = min_t(u32, nb_entries, pool->free_list_cnt);
+
+	i = nb_entries;
+	while (i--) {
+		xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node);
+		list_del_init(&xskb->free_list_node);
+
+		*xdp = &xskb->xdp;
+		xdp++;
+	}
+	pool->free_list_cnt -= nb_entries;
+
+	return nb_entries;
+}
+
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	u32 nb_entries1 = 0, nb_entries2;
+
+	if (unlikely(pool->dma_need_sync)) {
+		struct xdp_buff *buff;
+
+		/* Slow path */
+		buff = xp_alloc(pool);
+		if (buff)
+			*xdp = buff;
+		return !!buff;
+	}
+
+	if (unlikely(pool->free_list_cnt)) {
+		nb_entries1 = xp_alloc_reused(pool, xdp, max);
+		if (nb_entries1 == max)
+			return nb_entries1;
+
+		max -= nb_entries1;
+		xdp += nb_entries1;
+	}
+
+	nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max);
+	if (!nb_entries2)
+		pool->fq->queue_empty_descs++;
+
+	return nb_entries1 + nb_entries2;
+}
+EXPORT_SYMBOL(xp_alloc_batch);
+
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
+{
+	if (pool->free_list_cnt >= count)
+		return true;
+	return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+}
+EXPORT_SYMBOL(xp_can_alloc);
+
+void xp_free(struct xdp_buff_xsk *xskb)
+{
+	if (!list_empty(&xskb->free_list_node))
+		return;
+
+	xskb->pool->free_list_cnt++;
+	list_add(&xskb->free_list_node, &xskb->pool->free_list);
+}
+EXPORT_SYMBOL(xp_free);
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+	addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+	return pool->addrs + addr;
+}
+EXPORT_SYMBOL(xp_raw_get_data);
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+	addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+	return (pool->dma_pages[addr >> PAGE_SHIFT] &
+		~XSK_NEXT_PG_CONTIG_MASK) +
+		(addr & ~PAGE_MASK);
+}
+EXPORT_SYMBOL(xp_raw_get_dma);
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
+{
+	dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+				      xskb->pool->frame_len, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+				 size_t size)
+{
+	dma_sync_single_range_for_device(pool->dev, dma, 0,
+					 size, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
new file mode 100644
index 0000000000..22b36c8143
--- /dev/null
+++ b/net/xdp/xsk_diag.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets monitoring support
+ *
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ * Author: Björn Töpel <bjorn.topel@intel.com>
+ */
+
+#include <linux/module.h>
+#include <net/xdp_sock.h>
+#include <linux/xdp_diag.h>
+#include <linux/sock_diag.h>
+
+#include "xsk_queue.h"
+#include "xsk.h"
+
+static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+	struct xdp_diag_info di = {};
+
+	di.ifindex = xs->dev ? xs->dev->ifindex : 0;
+	di.queue_id = xs->queue_id;
+	return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di);
+}
+
+static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type,
+			     struct sk_buff *nlskb)
+{
+	struct xdp_diag_ring dr = {};
+
+	dr.entries = queue->nentries;
+	return nla_put(nlskb, nl_type, sizeof(dr), &dr);
+}
+
+static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs,
+				  struct sk_buff *nlskb)
+{
+	int err = 0;
+
+	if (xs->rx)
+		err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb);
+	if (!err && xs->tx)
+		err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb);
+	return err;
+}
+
+static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+	struct xsk_buff_pool *pool = xs->pool;
+	struct xdp_umem *umem = xs->umem;
+	struct xdp_diag_umem du = {};
+	int err;
+
+	if (!umem)
+		return 0;
+
+	du.id = umem->id;
+	du.size = umem->size;
+	du.num_pages = umem->npgs;
+	du.chunk_size = umem->chunk_size;
+	du.headroom = umem->headroom;
+	du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0;
+	du.queue_id = pool ? pool->queue_id : 0;
+	du.flags = 0;
+	if (umem->zc)
+		du.flags |= XDP_DU_F_ZEROCOPY;
+	du.refs = refcount_read(&umem->users);
+
+	err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
+	if (!err && pool && pool->fq)
+		err = xsk_diag_put_ring(pool->fq,
+					XDP_DIAG_UMEM_FILL_RING, nlskb);
+	if (!err && pool && pool->cq)
+		err = xsk_diag_put_ring(pool->cq,
+					XDP_DIAG_UMEM_COMPLETION_RING, nlskb);
+	return err;
+}
+
+static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+	struct xdp_diag_stats du = {};
+
+	du.n_rx_dropped = xs->rx_dropped;
+	du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx);
+	du.n_rx_full = xs->rx_queue_full;
+	du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
+	du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx);
+	du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx);
+	return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du);
+}
+
+static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
+			 struct xdp_diag_req *req,
+			 struct user_namespace *user_ns,
+			 u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xdp_diag_msg *msg;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(nlh);
+	memset(msg, 0, sizeof(*msg));
+	msg->xdiag_family = AF_XDP;
+	msg->xdiag_type = sk->sk_type;
+	msg->xdiag_ino = sk_ino;
+	sock_diag_save_cookie(sk, msg->xdiag_cookie);
+
+	mutex_lock(&xs->mutex);
+	if (READ_ONCE(xs->state) == XSK_UNBOUND)
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_INFO) &&
+	    nla_put_u32(nlskb, XDP_DIAG_UID,
+			from_kuid_munged(user_ns, sock_i_uid(sk))))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_RING_CFG) &&
+	    xsk_diag_put_rings_cfg(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_UMEM) &&
+	    xsk_diag_put_umem(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_MEMINFO) &&
+	    sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_STATS) &&
+	    xsk_diag_put_stats(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	mutex_unlock(&xs->mutex);
+	nlmsg_end(nlskb, nlh);
+	return 0;
+
+out_nlmsg_trim:
+	mutex_unlock(&xs->mutex);
+	nlmsg_cancel(nlskb, nlh);
+	return -EMSGSIZE;
+}
+
+static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb)
+{
+	struct xdp_diag_req *req = nlmsg_data(cb->nlh);
+	struct net *net = sock_net(nlskb->sk);
+	int num = 0, s_num = cb->args[0];
+	struct sock *sk;
+
+	mutex_lock(&net->xdp.lock);
+
+	sk_for_each(sk, &net->xdp.list) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (num++ < s_num)
+			continue;
+
+		if (xsk_diag_fill(sk, nlskb, req,
+				  sk_user_ns(NETLINK_CB(cb->skb).sk),
+				  NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				  sock_i_ino(sk)) < 0) {
+			num--;
+			break;
+		}
+	}
+
+	mutex_unlock(&net->xdp.lock);
+	cb->args[0] = num;
+	return nlskb->len;
+}
+
+static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
+{
+	struct netlink_dump_control c = { .dump = xsk_diag_dump };
+	int hdrlen = sizeof(struct xdp_diag_req);
+	struct net *net = sock_net(nlskb->sk);
+
+	if (nlmsg_len(hdr) < hdrlen)
+		return -EINVAL;
+
+	if (!(hdr->nlmsg_flags & NLM_F_DUMP))
+		return -EOPNOTSUPP;
+
+	return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c);
+}
+
+static const struct sock_diag_handler xsk_diag_handler = {
+	.family = AF_XDP,
+	.dump = xsk_diag_handler_dump,
+};
+
+static int __init xsk_diag_init(void)
+{
+	return sock_diag_register(&xsk_diag_handler);
+}
+
+static void __exit xsk_diag_exit(void)
+{
+	sock_diag_unregister(&xsk_diag_handler);
+}
+
+module_init(xsk_diag_init);
+module_exit(xsk_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 0000000000..d2c2640300
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/overflow.h>
+#include <linux/vmalloc.h>
+#include <net/xdp_sock_drv.h>
+
+#include "xsk_queue.h"
+
+static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
+{
+	struct xdp_umem_ring *umem_ring;
+	struct xdp_rxtx_ring *rxtx_ring;
+
+	if (umem_queue)
+		return struct_size(umem_ring, desc, q->nentries);
+	return struct_size(rxtx_ring, desc, q->nentries);
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
+{
+	struct xsk_queue *q;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	size = xskq_get_ring_size(q, umem_queue);
+
+	/* size which is overflowing or close to SIZE_MAX will become 0 in
+	 * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous
+	 * is_power_of_2(), the rest will be handled by vmalloc_user()
+	 */
+	if (unlikely(size == SIZE_MAX)) {
+		kfree(q);
+		return NULL;
+	}
+
+	size = PAGE_ALIGN(size);
+
+	q->ring = vmalloc_user(size);
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	q->ring_vmalloc_size = size;
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	vfree(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 0000000000..13354a1e42
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#include "xsk.h"
+
+struct xdp_ring {
+	u32 producer ____cacheline_aligned_in_smp;
+	/* Hinder the adjacent cache prefetcher to prefetch the consumer
+	 * pointer if the producer pointer is touched and vice versa.
+	 */
+	u32 pad1 ____cacheline_aligned_in_smp;
+	u32 consumer ____cacheline_aligned_in_smp;
+	u32 pad2 ____cacheline_aligned_in_smp;
+	u32 flags;
+	u32 pad3 ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	struct xdp_desc desc[] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	u64 desc[] ____cacheline_aligned_in_smp;
+};
+
+struct xsk_queue {
+	u32 ring_mask;
+	u32 nentries;
+	u32 cached_prod;
+	u32 cached_cons;
+	struct xdp_ring *ring;
+	u64 invalid_descs;
+	u64 queue_empty_descs;
+	size_t ring_vmalloc_size;
+};
+
+struct parsed_desc {
+	u32 mb;
+	u32 valid;
+};
+
+/* The structure of the shared state of the rings are a simple
+ * circular buffer, as outlined in
+ * Documentation/core-api/circular-buffers.rst. For the Rx and
+ * completion ring, the kernel is the producer and user space is the
+ * consumer. For the Tx and fill rings, the kernel is the consumer and
+ * user space is the producer.
+ *
+ * producer                         consumer
+ *
+ * if (LOAD ->consumer) {  (A)      LOAD.acq ->producer  (C)
+ *    STORE $data                   LOAD $data
+ *    STORE.rel ->producer (B)      STORE.rel ->consumer (D)
+ * }
+ *
+ * (A) pairs with (D), and (B) pairs with (C).
+ *
+ * Starting with (B), it protects the data from being written after
+ * the producer pointer. If this barrier was missing, the consumer
+ * could observe the producer pointer being set and thus load the data
+ * before the producer has written the new data. The consumer would in
+ * this case load the old data.
+ *
+ * (C) protects the consumer from speculatively loading the data before
+ * the producer pointer actually has been read. If we do not have this
+ * barrier, some architectures could load old data as speculative loads
+ * are not discarded as the CPU does not know there is a dependency
+ * between ->producer and data.
+ *
+ * (A) is a control dependency that separates the load of ->consumer
+ * from the stores of $data. In case ->consumer indicates there is no
+ * room in the buffer to store $data we do not. The dependency will
+ * order both of the stores after the loads. So no barrier is needed.
+ *
+ * (D) protects the load of the data to be observed to happen after the
+ * store of the consumer pointer. If we did not have this memory
+ * barrier, the producer could observe the consumer pointer being set
+ * and overwrite the data with a new value before the consumer got the
+ * chance to read the old value. The consumer would thus miss reading
+ * the old entry and very likely read the new entry twice, once right
+ * now and again after circling through the ring.
+ */
+
+/* The operations on the rings are the following:
+ *
+ * producer                           consumer
+ *
+ * RESERVE entries                    PEEK in the ring for entries
+ * WRITE data into the ring           READ data from the ring
+ * SUBMIT entries                     RELEASE entries
+ *
+ * The producer reserves one or more entries in the ring. It can then
+ * fill in these entries and finally submit them so that they can be
+ * seen and read by the consumer.
+ *
+ * The consumer peeks into the ring to see if the producer has written
+ * any new entries. If so, the consumer can then read these entries
+ * and when it is done reading them release them back to the producer
+ * so that the producer can use these slots to fill in new entries.
+ *
+ * The function names below reflect these operations.
+ */
+
+/* Functions that read and validate content from consumer rings. */
+
+static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+	u32 idx = cached_cons & q->ring_mask;
+
+	*addr = ring->desc[idx];
+}
+
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cached_cons != q->cached_prod) {
+		__xskq_cons_read_addr_unchecked(q, q->cached_cons, addr);
+		return true;
+	}
+
+	return false;
+}
+
+static inline bool xp_unused_options_set(u32 options)
+{
+	return options & ~XDP_PKT_CONTD;
+}
+
+static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
+					    struct xdp_desc *desc)
+{
+	u64 offset = desc->addr & (pool->chunk_size - 1);
+
+	if (!desc->len)
+		return false;
+
+	if (offset + desc->len > pool->chunk_size)
+		return false;
+
+	if (desc->addr >= pool->addrs_cnt)
+		return false;
+
+	if (xp_unused_options_set(desc->options))
+		return false;
+	return true;
+}
+
+static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
+					      struct xdp_desc *desc)
+{
+	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+
+	if (!desc->len)
+		return false;
+
+	if (desc->len > pool->chunk_size)
+		return false;
+
+	if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
+	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+		return false;
+
+	if (xp_unused_options_set(desc->options))
+		return false;
+	return true;
+}
+
+static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
+				    struct xdp_desc *desc)
+{
+	return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
+		xp_aligned_validate_desc(pool, desc);
+}
+
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+	return q->cached_cons != q->cached_prod;
+}
+
+static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
+					   struct xdp_desc *d,
+					   struct xsk_buff_pool *pool)
+{
+	if (!xp_validate_desc(pool, d)) {
+		q->invalid_descs++;
+		return false;
+	}
+	return true;
+}
+
+static inline bool xskq_cons_read_desc(struct xsk_queue *q,
+				       struct xdp_desc *desc,
+				       struct xsk_buff_pool *pool)
+{
+	if (q->cached_cons != q->cached_prod) {
+		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+		u32 idx = q->cached_cons & q->ring_mask;
+
+		*desc = ring->desc[idx];
+		return xskq_cons_is_valid_desc(q, desc, pool);
+	}
+
+	q->queue_empty_descs++;
+	return false;
+}
+
+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_cons += cnt;
+}
+
+static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      struct xdp_desc *desc, struct parsed_desc *parsed)
+{
+	parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
+	parsed->mb = xp_mb_desc(desc);
+}
+
+static inline
+u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      u32 max)
+{
+	u32 cached_cons = q->cached_cons, nb_entries = 0;
+	struct xdp_desc *descs = pool->tx_descs;
+	u32 total_descs = 0, nr_frags = 0;
+
+	/* track first entry, if stumble upon *any* invalid descriptor, rewind
+	 * current packet that consists of frags and stop the processing
+	 */
+	while (cached_cons != q->cached_prod && nb_entries < max) {
+		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+		u32 idx = cached_cons & q->ring_mask;
+		struct parsed_desc parsed;
+
+		descs[nb_entries] = ring->desc[idx];
+		cached_cons++;
+		parse_desc(q, pool, &descs[nb_entries], &parsed);
+		if (unlikely(!parsed.valid))
+			break;
+
+		if (likely(!parsed.mb)) {
+			total_descs += (nr_frags + 1);
+			nr_frags = 0;
+		} else {
+			nr_frags++;
+			if (nr_frags == pool->netdev->xdp_zc_max_segs) {
+				nr_frags = 0;
+				break;
+			}
+		}
+		nb_entries++;
+	}
+
+	cached_cons -= nr_frags;
+	/* Release valid plus any invalid entries */
+	xskq_cons_release_n(q, cached_cons - q->cached_cons);
+	return total_descs;
+}
+
+/* Functions for consumers */
+
+static inline void __xskq_cons_release(struct xsk_queue *q)
+{
+	smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */
+}
+
+static inline void __xskq_cons_peek(struct xsk_queue *q)
+{
+	/* Refresh the local pointer */
+	q->cached_prod = smp_load_acquire(&q->ring->producer);  /* C, matches B */
+}
+
+static inline void xskq_cons_get_entries(struct xsk_queue *q)
+{
+	__xskq_cons_release(q);
+	__xskq_cons_peek(q);
+}
+
+static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
+{
+	u32 entries = q->cached_prod - q->cached_cons;
+
+	if (entries >= max)
+		return max;
+
+	__xskq_cons_peek(q);
+	entries = q->cached_prod - q->cached_cons;
+
+	return entries >= max ? max : entries;
+}
+
+static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
+{
+	return xskq_cons_nb_entries(q, cnt) >= cnt;
+}
+
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cached_prod == q->cached_cons)
+		xskq_cons_get_entries(q);
+	return xskq_cons_read_addr_unchecked(q, addr);
+}
+
+static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
+				       struct xdp_desc *desc,
+				       struct xsk_buff_pool *pool)
+{
+	if (q->cached_prod == q->cached_cons)
+		xskq_cons_get_entries(q);
+	return xskq_cons_read_desc(q, desc, pool);
+}
+
+/* To improve performance in the xskq_cons_release functions, only update local state here.
+ * Reflect this to global state when we get new entries from the ring in
+ * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop.
+ */
+static inline void xskq_cons_release(struct xsk_queue *q)
+{
+	q->cached_cons++;
+}
+
+static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_cons -= cnt;
+}
+
+static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
+{
+	/* No barriers needed since data is not accessed */
+	return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
+/* Functions for producers */
+
+static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
+{
+	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
+
+	if (free_entries >= max)
+		return max;
+
+	/* Refresh the local tail pointer */
+	q->cached_cons = READ_ONCE(q->ring->consumer);
+	free_entries = q->nentries - (q->cached_prod - q->cached_cons);
+
+	return free_entries >= max ? max : free_entries;
+}
+
+static inline bool xskq_prod_is_full(struct xsk_queue *q)
+{
+	return xskq_prod_nb_free(q, 1) ? false : true;
+}
+
+static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_prod -= cnt;
+}
+
+static inline int xskq_prod_reserve(struct xsk_queue *q)
+{
+	if (xskq_prod_is_full(q))
+		return -ENOSPC;
+
+	/* A, matches D */
+	q->cached_prod++;
+	return 0;
+}
+
+static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	if (xskq_prod_is_full(q))
+		return -ENOSPC;
+
+	/* A, matches D */
+	ring->desc[q->cached_prod++ & q->ring_mask] = addr;
+	return 0;
+}
+
+static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
+					      u32 nb_entries)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+	u32 i, cached_prod;
+
+	/* A, matches D */
+	cached_prod = q->cached_prod;
+	for (i = 0; i < nb_entries; i++)
+		ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
+	q->cached_prod = cached_prod;
+}
+
+static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
+					 u64 addr, u32 len, u32 flags)
+{
+	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+	u32 idx;
+
+	if (xskq_prod_is_full(q))
+		return -ENOBUFS;
+
+	/* A, matches D */
+	idx = q->cached_prod++ & q->ring_mask;
+	ring->desc[idx].addr = addr;
+	ring->desc[idx].len = len;
+	ring->desc[idx].options = flags;
+
+	return 0;
+}
+
+static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
+{
+	smp_store_release(&q->ring->producer, idx); /* B, matches C */
+}
+
+static inline void xskq_prod_submit(struct xsk_queue *q)
+{
+	__xskq_prod_submit(q, q->cached_prod);
+}
+
+static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
+{
+	__xskq_prod_submit(q, q->ring->producer + nb_entries);
+}
+
+static inline bool xskq_prod_is_empty(struct xsk_queue *q)
+{
+	/* No barriers needed since data is not accessed */
+	return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer);
+}
+
+/* For both producers and consumers */
+
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+	return q ? q->invalid_descs : 0;
+}
+
+static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q)
+{
+	return q ? q->queue_empty_descs : 0;
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
+void xskq_destroy(struct xsk_queue *q_ops);
+
+#endif /* _LINUX_XSK_QUEUE_H */
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
new file mode 100644
index 0000000000..e1c526f97c
--- /dev/null
+++ b/net/xdp/xskmap.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/btf_ids.h>
+
+#include "xsk.h"
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+					       struct xdp_sock __rcu **map_entry)
+{
+	struct xsk_map_node *node;
+
+	node = bpf_map_kzalloc(&map->map, sizeof(*node),
+			       GFP_ATOMIC | __GFP_NOWARN);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_inc(&map->map);
+	atomic_inc(&map->count);
+
+	node->map = map;
+	node->map_entry = map_entry;
+	return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+	struct xsk_map *map = node->map;
+
+	bpf_map_put(&node->map->map);
+	kfree(node);
+	atomic_dec(&map->count);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+	spin_lock_bh(&xs->map_list_lock);
+	list_add_tail(&node->node, &xs->map_list);
+	spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+				struct xdp_sock __rcu **map_entry)
+{
+	struct xsk_map_node *n, *tmp;
+
+	spin_lock_bh(&xs->map_list_lock);
+	list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+		if (map_entry == n->map_entry) {
+			list_del(&n->node);
+			xsk_map_node_free(n);
+		}
+	}
+	spin_unlock_bh(&xs->map_list_lock);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	struct xsk_map *m;
+	int numa_node;
+	u64 size;
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	numa_node = bpf_map_attr_numa_node(attr);
+	size = struct_size(m, xsk_map, attr->max_entries);
+
+	m = bpf_map_area_alloc(size, numa_node);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+	spin_lock_init(&m->lock);
+
+	return &m->map;
+}
+
+static u64 xsk_map_mem_usage(const struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	return struct_size(m, xsk_map, map->max_entries) +
+		   (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	synchronize_net();
+	bpf_map_area_free(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+	const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
+	struct bpf_insn *insn = insn_buf;
+
+	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+	*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
+	*insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
+	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
+	*insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
+	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+	*insn++ = BPF_MOV64_IMM(ret, 0);
+	return insn - insn_buf;
+}
+
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *xs, *old_xs;
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct xsk_map_node *node;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	xs = (struct xdp_sock *)sock->sk;
+
+	map_entry = &m->xsk_map[i];
+	node = xsk_map_node_alloc(m, map_entry);
+	if (IS_ERR(node)) {
+		sockfd_put(sock);
+		return PTR_ERR(node);
+	}
+
+	spin_lock_bh(&m->lock);
+	old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
+	if (old_xs == xs) {
+		err = 0;
+		goto out;
+	} else if (old_xs && map_flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto out;
+	} else if (!old_xs && map_flags == BPF_EXIST) {
+		err = -ENOENT;
+		goto out;
+	}
+	xsk_map_sock_add(xs, node);
+	rcu_assign_pointer(*map_entry, xs);
+	if (old_xs)
+		xsk_map_sock_delete(old_xs, map_entry);
+	spin_unlock_bh(&m->lock);
+	sockfd_put(sock);
+	return 0;
+
+out:
+	spin_unlock_bh(&m->lock);
+	sockfd_put(sock);
+	xsk_map_node_free(node);
+	return err;
+}
+
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *old_xs;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	spin_lock_bh(&m->lock);
+	map_entry = &m->xsk_map[k];
+	old_xs = unrcu_pointer(xchg(map_entry, NULL));
+	if (old_xs)
+		xsk_map_sock_delete(old_xs, map_entry);
+	spin_unlock_bh(&m->lock);
+
+	return 0;
+}
+
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+{
+	return __bpf_xdp_redirect_map(map, index, flags, 0,
+				      __xsk_map_lookup_elem);
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+			     struct xdp_sock __rcu **map_entry)
+{
+	spin_lock_bh(&map->lock);
+	if (rcu_access_pointer(*map_entry) == xs) {
+		rcu_assign_pointer(*map_entry, NULL);
+		xsk_map_sock_delete(xs, map_entry);
+	}
+	spin_unlock_bh(&map->lock);
+}
+
+static bool xsk_map_meta_equal(const struct bpf_map *meta0,
+			       const struct bpf_map *meta1)
+{
+	return meta0->max_entries == meta1->max_entries &&
+		bpf_map_meta_equal(meta0, meta1);
+}
+
+BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map)
+const struct bpf_map_ops xsk_map_ops = {
+	.map_meta_equal = xsk_map_meta_equal,
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_gen_lookup = xsk_map_gen_lookup,
+	.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+	.map_check_btf = map_check_no_btf,
+	.map_mem_usage = xsk_map_mem_usage,
+	.map_btf_id = &xsk_map_btf_ids[0],
+	.map_redirect = xsk_map_redirect,
+};
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
commit	ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree	b2d64bc10158fdd5497876388cd68142ca374ed3 /net/xdp
parent	Initial commit. (diff)
download	linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip