Diffstat (limited to 'net/xdp')
-rw-r--r--  net/xdp/Kconfig          |   16
-rw-r--r--  net/xdp/Makefile         |    4
-rw-r--r--  net/xdp/xdp_umem.c       |  259
-rw-r--r--  net/xdp/xdp_umem.h       |   15
-rw-r--r--  net/xdp/xsk.c            | 1722
-rw-r--r--  net/xdp/xsk.h            |   48
-rw-r--r--  net/xdp/xsk_buff_pool.c  |  696
-rw-r--r--  net/xdp/xsk_diag.c       |  214
-rw-r--r--  net/xdp/xsk_queue.c      |   66
-rw-r--r--  net/xdp/xsk_queue.h      |  462
-rw-r--r--  net/xdp/xskmap.c         |  281
11 files changed, 3783 insertions, 0 deletions
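
For orientation, the setsockopt()/bind()/mmap() sequence implemented by xsk_setsockopt(), xsk_bind() and xsk_mmap() in the diff below is driven from userspace roughly as sketched here. This is a minimal illustration, not code from the patch: the xsk_setup() helper name, the copy-mode flag, the 2048-byte chunk size and the ring size of 2048 entries are assumptions chosen for the example, and umem_area must be a page-aligned buffer supplied by the caller.

/* Minimal AF_XDP userspace sketch (illustrative, not part of the patch). */
#include <linux/if_xdp.h>
#include <net/if.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

#define NUM_FRAMES	4096
#define FRAME_SIZE	2048	/* one chunk per frame, power of two */

/* umem_area: NUM_FRAMES * FRAME_SIZE bytes, page aligned (e.g. posix_memalign). */
int xsk_setup(const char *ifname, __u32 queue_id, void *umem_area)
{
	struct xdp_umem_reg mr = {
		.addr = (__u64)(unsigned long)umem_area,
		.len = (__u64)NUM_FRAMES * FRAME_SIZE,
		.chunk_size = FRAME_SIZE,
		.headroom = 0,
	};
	struct sockaddr_xdp sxdp = {
		.sxdp_family = AF_XDP,
		.sxdp_ifindex = if_nametoindex(ifname),
		.sxdp_queue_id = queue_id,
		.sxdp_flags = XDP_COPY,		/* assumption: copy mode */
	};
	struct xdp_mmap_offsets off;
	socklen_t optlen = sizeof(off);
	int ring_sz = 2048;			/* must be a power of two */
	void *rx_ring;
	int fd;

	fd = socket(AF_XDP, SOCK_RAW, 0);
	if (fd < 0)
		return -1;

	/* Register the packet buffer area; handled by xdp_umem_create(). */
	if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)))
		goto err;

	/* Create fill, completion and RX rings (xsk_init_queue()). */
	if (setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_sz, sizeof(ring_sz)) ||
	    setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_sz, sizeof(ring_sz)) ||
	    setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_sz, sizeof(ring_sz)))
		goto err;

	/* Learn the ring layout, then map the RX ring via xsk_mmap(). */
	if (getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen))
		goto err;

	rx_ring = mmap(NULL, off.rx.desc + ring_sz * sizeof(struct xdp_desc),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       fd, XDP_PGOFF_RX_RING);
	if (rx_ring == MAP_FAILED)
		goto err;

	/* Attach to <ifname, queue_id>; handled by xsk_bind() below. */
	if (bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)))
		goto err;

	return fd;	/* RX descriptors are now consumed off rx_ring */
err:
	close(fd);
	return -1;
}
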
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig new file mode 100644 index 0000000000..71af2febe7 --- /dev/null +++ b/net/xdp/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +config XDP_SOCKETS + bool "XDP sockets" + depends on BPF_SYSCALL + default n + help + XDP sockets allows a channel between XDP programs and + userspace applications. + +config XDP_SOCKETS_DIAG + tristate "XDP sockets: monitoring interface" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets monitoring interface used by the ss tool. + If unsure, say Y. diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 0000000000..30cdc4315f --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 0000000000..06cead2b8e --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> + +#include "xdp_umem.h" +#include "xsk_queue.h" + +static DEFINE_IDA(umem_ida); + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); + + kvfree(umem->pgs); + umem->pgs = NULL; +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_addr_unmap(struct xdp_umem *umem) +{ + vunmap(umem->addrs); + umem->addrs = NULL; +} + +static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages, + u32 nr_pages) +{ + umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!umem->addrs) + return -ENOMEM; + return 0; +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + umem->zc = false; + ida_free(&umem_ida, umem->id); + + xdp_umem_addr_unmap(umem); + xdp_umem_unpin_pages(umem); + + xdp_umem_unaccount_pages(umem); + kfree(umem); +} + +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + +void xdp_get_umem(struct xdp_umem *umem) +{ + refcount_inc(&umem->users); +} + +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) +{ + if (!umem) + return; + + if (refcount_dec_and_test(&umem->users)) { + if (defer_cleanup) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } else { + xdp_umem_release(umem); + } + } +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); + if (!umem->pgs) + return -ENOMEM; + + mmap_read_lock(current->mm); + npgs = pin_user_pages(address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0]); + mmap_read_unlock(current->mm); + + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto 
out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kvfree(umem->pgs); + umem->pgs = NULL; + return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + u64 addr = mr->addr, size = mr->len; + u32 chunks_rem, npgs_rem; + u64 chunks, npgs; + int err; + + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return -EINVAL; + + if (!unaligned_chunks && !is_power_of_2(chunk_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. For + * simplicity, this might change. + */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem); + if (npgs_rem) + npgs++; + if (npgs > U32_MAX) + return -EINVAL; + + chunks = div_u64_rem(size, chunk_size, &chunks_rem); + if (!chunks || chunks > U32_MAX) + return -EINVAL; + + if (!unaligned_chunks && chunks_rem) + return -EINVAL; + + if (headroom >= chunk_size - XDP_PACKET_HEADROOM) + return -EINVAL; + + umem->size = size; + umem->headroom = headroom; + umem->chunk_size = chunk_size; + umem->chunks = chunks; + umem->npgs = npgs; + umem->pgs = NULL; + umem->user = NULL; + umem->flags = mr->flags; + + INIT_LIST_HEAD(&umem->xsk_dma_list); + refcount_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + return err; + + err = xdp_umem_pin_pages(umem, (unsigned long)addr); + if (err) + goto out_account; + + err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs); + if (err) + goto out_unpin; + + return 0; + +out_unpin: + xdp_umem_unpin_pages(umem); +out_account: + xdp_umem_unaccount_pages(umem); + return err; +} + +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) +{ + struct xdp_umem *umem; + int err; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + err = ida_alloc(&umem_ida, GFP_KERNEL); + if (err < 0) { + kfree(umem); + return ERR_PTR(err); + } + umem->id = err; + + err = xdp_umem_reg(umem, mr); + if (err) { + ida_free(&umem_ida, umem->id); + kfree(umem); + return ERR_PTR(err); + } + + return umem; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 0000000000..aa9fe27804 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. 
+ */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include <net/xdp_sock_drv.h> + +void xdp_get_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup); +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 0000000000..d849dc04a3 --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,1722 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. + * + * Author(s): Björn Töpel <bjorn.topel@intel.com> + * Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include <linux/if_xdp.h> +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/uaccess.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/vmalloc.h> +#include <net/xdp_sock_drv.h> +#include <net/busy_poll.h> +#include <net/netdev_rx_queue.h> +#include <net/xdp.h> + +#include "xsk_queue.h" +#include "xdp_umem.h" +#include "xsk.h" + +#define TX_BATCH_SIZE 32 + +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) +{ + if (pool->cached_need_wakeup & XDP_WAKEUP_RX) + return; + + pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; + pool->cached_need_wakeup |= XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_set_rx_need_wakeup); + +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) +{ + struct xdp_sock *xs; + + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + pool->cached_need_wakeup |= XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_set_tx_need_wakeup); + +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) +{ + if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) + return; + + pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; + pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); + +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) +{ + struct xdp_sock *xs; + + if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { + xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); + +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) +{ + return pool->uses_need_wakeup; +} +EXPORT_SYMBOL(xsk_uses_need_wakeup); + +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, + u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + return dev->_rx[queue_id].pool; + if (queue_id < dev->real_num_tx_queues) + return dev->_tx[queue_id].pool; + + return NULL; +} +EXPORT_SYMBOL(xsk_get_pool_from_qid); + +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) +{ + if (queue_id < dev->num_rx_queues) + dev->_rx[queue_id].pool = NULL; + if (queue_id < dev->num_tx_queues) + dev->_tx[queue_id].pool = NULL; +} + +/* The buffer pool is stored both in the _rx struct and the _tx struct as we do + * not know if the device has more tx queues than rx, or the opposite. + * This might also change during run time. 
+ */ +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, + u16 queue_id) +{ + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) + return -EINVAL; + + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; +} + +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) +{ + u64 addr; + int err; + + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); + if (err) { + xs->rx_queue_full++; + return err; + } + + xp_release(xskb); + return 0; +} + +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; + + if (frags) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err) + goto err; + if (likely(!frags)) + return 0; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + goto err; + list_del(&pos->xskb_list_node); + } + + return 0; +err: + xsk_buff_free(xdp); + return err; +} + +static void *xsk_copy_xdp_start(struct xdp_buff *from) +{ + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; + struct xdp_buff *xsk_xdp; + skb_frag_t *frag; + + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; + } + + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { + xs->rx_dropped++; + return -ENOMEM; + } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } + + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; + } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, 
©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + + return 0; +} + +static bool xsk_tx_writeable(struct xdp_sock *xs) +{ + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) + return false; + + return true; +} + +static bool xsk_is_bound(struct xdp_sock *xs) +{ + if (READ_ONCE(xs->state) == XSK_BOUND) { + /* Matches smp_wmb() in bind(). */ + smp_rmb(); + return true; + } + return false; +} + +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + if (!xsk_is_bound(xs)) + return -ENXIO; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + + sk_mark_napi_id_once_xdp(&xs->sk, xdp); + return 0; +} + +static void xsk_flush(struct xdp_sock *xs) +{ + xskq_prod_submit(xs->rx); + __xskq_cons_release(xs->pool->fq); + sock_def_readable(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len = xdp_get_buff_len(xdp); + int err; + + spin_lock_bh(&xs->rx_lock); + err = xsk_rcv_check(xs, xdp, len); + if (!err) { + err = __xsk_rcv(xs, xdp, len); + xsk_flush(xs); + } + spin_unlock_bh(&xs->rx_lock); + return err; +} + +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len = xdp_get_buff_len(xdp); + int err; + + err = xsk_rcv_check(xs, xdp, len); + if (err) + return err; + + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + len = xdp->data_end - xdp->data; + return xsk_rcv_zc(xs, xdp, len); + } + + err = __xsk_rcv(xs, xdp, len); + if (!err) + xdp_return_buff(xdp); + return err; +} + +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(void) +{ + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del_clearprev(&xs->flush_node); + } +} + +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) +{ + xskq_prod_submit_n(pool->cq, nb_entries); +} +EXPORT_SYMBOL(xsk_tx_completed); + +void xsk_tx_release(struct xsk_buff_pool *pool) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { + __xskq_cons_release(xs->tx); + if (xsk_tx_writeable(xs)) + xs->sk.sk_write_space(&xs->sk); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(xsk_tx_release); + +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { + if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); + continue; + } + + /* This is the backpressure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. 
+ */ + if (xskq_prod_reserve_addr(pool->cq, desc->addr)) + goto out; + + xskq_cons_release(xs->tx); + rcu_read_unlock(); + return true; + } + +out: + rcu_read_unlock(); + return false; +} +EXPORT_SYMBOL(xsk_tx_peek_desc); + +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) +{ + struct xdp_desc *descs = pool->tx_descs; + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, nb_pkts); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. + */ + nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); + if (!nb_pkts) + goto out; + + nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + __xskq_cons_release(xs->tx); + xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + +static int xsk_wakeup(struct xdp_sock *xs, u8 flags) +{ + struct net_device *dev = xs->dev; + + return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); +} + +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_submit_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? 
(long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); + sock_wfree(skb); +} + +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct xsk_buff_pool *pool = xs->pool; + u32 hr, len, ts, offset, copy, copied; + struct sk_buff *skb = xs->skb; + struct page *page; + void *buffer; + int err, i; + u64 addr; + + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + } + + addr = desc->addr; + len = desc->len; + ts = pool->unaligned ? len : pool->chunk_size; + + buffer = xsk_buff_raw_get_data(pool, addr); + offset = offset_in_page(buffer); + addr = buffer - pool->addrs; + + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EOVERFLOW); + + page = pool->umem->pgs[addr >> PAGE_SHIFT]; + get_page(page); + + copy = min_t(u32, PAGE_SIZE - offset, len - copied); + skb_fill_page_desc(skb, i, page, offset, copy); + + copied += copy; + addr += copy; + offset = 0; + } + + skb->len += len; + skb->data_len += len; + skb->truesize += ts; + + refcount_add(ts, &xs->sk.sk_wmem_alloc); + + return skb; +} + +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct net_device *dev = xs->dev; + struct sk_buff *skb = xs->skb; + int err; + + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { + skb = xsk_build_skb_zerocopy(xs, desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } + } else { + u32 hr, tr, len; + void *buffer; + + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + len = desc->len; + + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; + + skb_reserve(skb, hr); + skb_put(skb, len); + + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto free_err; + } + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EOVERFLOW; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); + } + } + + skb->dev = dev; + skb->priority = xs->sk.sk_priority; + skb->mark = READ_ONCE(xs->sk.sk_mark); + skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); + + return skb; + +free_err: + if (err == -EOVERFLOW) { + /* Drop the packet */ + xsk_set_destructor_arg(xs->skb); + xsk_drop_skb(xs->skb); + 
xskq_cons_release(xs->tx); + } else { + /* Let application retry */ + xsk_cq_cancel_locked(xs, 1); + } + + return ERR_PTR(err); +} + +static int __xsk_generic_xmit(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + u32 max_batch = TX_BATCH_SIZE; + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + mutex_lock(&xs->mutex); + + /* Since we dropped the RCU read lock, the socket state might have changed. */ + if (unlikely(!xsk_is_bound(xs))) { + err = -ENXIO; + goto out; + } + + if (xs->queue_id >= xs->dev->real_num_tx_queues) + goto out; + + while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) + goto out; + + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + if (err != -EOVERFLOW) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; + } + + err = __dev_direct_xmit(skb, xs->queue_id); + if (err == NETDEV_TX_BUSY) { + /* Tell user-space to retry the send */ + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); + err = -EAGAIN; + goto out; + } + + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP) { + /* SKB completed but not sent */ + err = -EBUSY; + xs->skb = NULL; + goto out; + } + + sent_frame = true; + xs->skb = NULL; + } + + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } + +out: + if (sent_frame) + if (xsk_tx_writeable(xs)) + sk->sk_write_space(sk); + + mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_generic_xmit(struct sock *sk) +{ + int ret; + + /* Drop the RCU lock since the SKB path might sleep. */ + rcu_read_unlock(); + ret = __xsk_generic_xmit(sk); + /* Reaquire RCU lock before going into common code. */ + rcu_read_lock(); + + return ret; +} + +static bool xsk_no_wakeup(struct sock *sk) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + /* Prefer busy-polling, skip the wakeup. 
*/ + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; +#else + return false; +#endif +} + +static int xsk_check_common(struct xdp_sock *xs) +{ + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + + return 0; +} + +static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; + int err; + + err = xsk_check_common(xs); + if (err) + return err; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + if (unlikely(!xs->tx)) + return -ENOBUFS; + + if (sk_can_busy_loop(sk)) { + if (xs->zc) + __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool)); + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + } + + if (xs->zc && xsk_no_wakeup(sk)) + return 0; + + pool = xs->pool; + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { + if (xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_TX); + return xsk_generic_xmit(sk); + } + return 0; +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + int ret; + + rcu_read_lock(); + ret = __xsk_sendmsg(sock, m, total_len); + rcu_read_unlock(); + + return ret; +} + +static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + bool need_wait = !(flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + err = xsk_check_common(xs); + if (err) + return err; + if (unlikely(!xs->rx)) + return -ENOBUFS; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_RX); + return 0; +} + +static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + int ret; + + rcu_read_lock(); + ret = __xsk_recvmsg(sock, m, len, flags); + rcu_read_unlock(); + + return ret; +} + +static __poll_t xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + __poll_t mask = 0; + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; + + sock_poll_wait(file, sock, wait); + + rcu_read_lock(); + if (xsk_check_common(xs)) + goto out; + + pool = xs->pool; + + if (pool->cached_need_wakeup) { + if (xs->zc) + xsk_wakeup(xs, pool->cached_need_wakeup); + else if (xs->tx) + /* Poll needs to drive Tx also in copy mode */ + xsk_generic_xmit(sk); + } + + if (xs->rx && !xskq_prod_is_empty(xs->rx)) + mask |= EPOLLIN | EPOLLRDNORM; + if (xs->tx && xsk_tx_writeable(xs)) + mask |= EPOLLOUT | EPOLLWRNORM; +out: + rcu_read_unlock(); + return mask; +} + +static int xsk_init_queue(u32 entries, struct xsk_queue **queue, + bool umem_queue) +{ + struct xsk_queue *q; + + if (entries == 0 || *queue || !is_power_of_2(entries)) + return -EINVAL; + + q = xskq_create(entries, umem_queue); + if (!q) + return -ENOMEM; + + /* Make sure queue is ready before it can be seen by others */ + smp_wmb(); + WRITE_ONCE(*queue, q); + return 0; +} + +static void xsk_unbind_dev(struct xdp_sock *xs) +{ + struct net_device *dev = xs->dev; + + if (xs->state != XSK_BOUND) + return; + WRITE_ONCE(xs->state, XSK_UNBOUND); + + /* Wait for driver to stop using the xdp socket. 
*/ + xp_del_xsk(xs->pool, xs); + synchronize_net(); + dev_put(dev); +} + +static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, + struct xdp_sock __rcu ***map_entry) +{ + struct xsk_map *map = NULL; + struct xsk_map_node *node; + + *map_entry = NULL; + + spin_lock_bh(&xs->map_list_lock); + node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, + node); + if (node) { + bpf_map_inc(&node->map->map); + map = node->map; + *map_entry = node->map_entry; + } + spin_unlock_bh(&xs->map_list_lock); + return map; +} + +static void xsk_delete_from_maps(struct xdp_sock *xs) +{ + /* This function removes the current XDP socket from all the + * maps it resides in. We need to take extra care here, due to + * the two locks involved. Each map has a lock synchronizing + * updates to the entries, and each socket has a lock that + * synchronizes access to the list of maps (map_list). For + * deadlock avoidance the locks need to be taken in the order + * "map lock"->"socket map list lock". We start off by + * accessing the socket map list, and take a reference to the + * map to guarantee existence between the + * xsk_get_map_list_entry() and xsk_map_try_sock_delete() + * calls. Then we ask the map to remove the socket, which + * tries to remove the socket from the map. Note that there + * might be updates to the map between + * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). + */ + struct xdp_sock __rcu **map_entry = NULL; + struct xsk_map *map; + + while ((map = xsk_get_map_list_entry(xs, &map_entry))) { + xsk_map_try_sock_delete(map, xs, map_entry); + bpf_map_put(&map->map); + } +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + if (xs->skb) + xsk_drop_skb(xs->skb); + + mutex_lock(&net->xdp.lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->xdp.lock); + + sock_prot_inuse_add(net, sk->sk_prot, -1); + + xsk_delete_from_maps(xs); + mutex_lock(&xs->mutex); + xsk_unbind_dev(xs); + mutex_unlock(&xs->mutex); + + xskq_destroy(xs->rx); + xskq_destroy(xs->tx); + xskq_destroy(xs->fq_tmp); + xskq_destroy(xs->cq_tmp); + + sock_orphan(sk); + sock->sk = NULL; + + sock_put(sk); + + return 0; +} + +static struct socket *xsk_lookup_xsk_from_fd(int fd) +{ + struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return ERR_PTR(-ENOPROTOOPT); + } + + return sock; +} + +static bool xsk_validate_queues(struct xdp_sock *xs) +{ + return xs->fq_tmp && xs->cq_tmp; +} + +static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + struct net_device *dev; + int bound_dev_if; + u32 flags, qid; + int err = 0; + + if (addr_len < sizeof(struct sockaddr_xdp)) + return -EINVAL; + if (sxdp->sxdp_family != AF_XDP) + return -EINVAL; + + flags = sxdp->sxdp_flags; + if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) + return -EINVAL; + + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) + return -EINVAL; + + rtnl_lock(); + mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + err = -EBUSY; + goto out_release; + } + + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); + if (!dev) { + err = -ENODEV; + goto 
out_release; + } + + if (!xs->rx && !xs->tx) { + err = -EINVAL; + goto out_unlock; + } + + qid = sxdp->sxdp_queue_id; + + if (flags & XDP_SHARED_UMEM) { + struct xdp_sock *umem_xs; + struct socket *sock; + + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { + /* Cannot specify flags for shared sockets. */ + err = -EINVAL; + goto out_unlock; + } + + if (xs->umem) { + /* We have already our own. */ + err = -EINVAL; + goto out_unlock; + } + + sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); + if (IS_ERR(sock)) { + err = PTR_ERR(sock); + goto out_unlock; + } + + umem_xs = xdp_sk(sock->sk); + if (!xsk_is_bound(umem_xs)) { + err = -EBADF; + sockfd_put(sock); + goto out_unlock; + } + + if (umem_xs->queue_id != qid || umem_xs->dev != dev) { + /* Share the umem with another socket on another qid + * and/or device. + */ + xs->pool = xp_create_and_assign_umem(xs, + umem_xs->umem); + if (!xs->pool) { + err = -ENOMEM; + sockfd_put(sock); + goto out_unlock; + } + + err = xp_assign_dev_shared(xs->pool, umem_xs, dev, + qid); + if (err) { + xp_destroy(xs->pool); + xs->pool = NULL; + sockfd_put(sock); + goto out_unlock; + } + } else { + /* Share the buffer pool with the other socket. */ + if (xs->fq_tmp || xs->cq_tmp) { + /* Do not allow setting your own fq or cq. */ + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xp_get_pool(umem_xs->pool); + xs->pool = umem_xs->pool; + + /* If underlying shared umem was created without Tx + * ring, allocate Tx descs array that Tx batching API + * utilizes + */ + if (xs->tx && !xs->pool->tx_descs) { + err = xp_alloc_tx_descs(xs->pool, xs); + if (err) { + xp_put_pool(xs->pool); + xs->pool = NULL; + sockfd_put(sock); + goto out_unlock; + } + } + } + + xdp_get_umem(umem_xs->umem); + WRITE_ONCE(xs->umem, umem_xs->umem); + sockfd_put(sock); + } else if (!xs->umem || !xsk_validate_queues(xs)) { + err = -EINVAL; + goto out_unlock; + } else { + /* This xsk has its own umem. */ + xs->pool = xp_create_and_assign_umem(xs, xs->umem); + if (!xs->pool) { + err = -ENOMEM; + goto out_unlock; + } + + err = xp_assign_dev(xs->pool, dev, qid, flags); + if (err) { + xp_destroy(xs->pool); + xs->pool = NULL; + goto out_unlock; + } + } + + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + + xs->dev = dev; + xs->zc = xs->umem->zc; + xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); + xs->queue_id = qid; + xp_add_xsk(xs->pool, xs); + +out_unlock: + if (err) { + dev_put(dev); + } else { + /* Matches smp_rmb() in bind() for shared umem + * sockets, and xsk_is_bound(). 
+ */ + smp_wmb(); + WRITE_ONCE(xs->state, XSK_BOUND); + } +out_release: + mutex_unlock(&xs->mutex); + rtnl_unlock(); + return err; +} + +struct xdp_umem_reg_v1 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_RX_RING: + case XDP_TX_RING: + { + struct xsk_queue **q; + int entries; + + if (optlen < sizeof(entries)) + return -EINVAL; + if (copy_from_sockptr(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } + q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; + err = xsk_init_queue(entries, q, false); + if (!err && optname == XDP_TX_RING) + /* Tx needs to be explicitly woken up the first time */ + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; + mutex_unlock(&xs->mutex); + return err; + } + case XDP_UMEM_REG: + { + size_t mr_size = sizeof(struct xdp_umem_reg); + struct xdp_umem_reg mr = {}; + struct xdp_umem *umem; + + if (optlen < sizeof(struct xdp_umem_reg_v1)) + return -EINVAL; + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v1); + + if (copy_from_sockptr(&mr, optval, mr_size)) + return -EFAULT; + + mutex_lock(&xs->mutex); + if (xs->state != XSK_READY || xs->umem) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } + + umem = xdp_umem_create(&mr); + if (IS_ERR(umem)) { + mutex_unlock(&xs->mutex); + return PTR_ERR(umem); + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + WRITE_ONCE(xs->umem, umem); + mutex_unlock(&xs->mutex); + return 0; + } + case XDP_UMEM_FILL_RING: + case XDP_UMEM_COMPLETION_RING: + { + struct xsk_queue **q; + int entries; + + if (copy_from_sockptr(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } + + q = (optname == XDP_UMEM_FILL_RING) ? 
&xs->fq_tmp : + &xs->cq_tmp; + err = xsk_init_queue(entries, q, true); + mutex_unlock(&xs->mutex); + return err; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_rxtx_ring, desc); +} + +static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_umem_ring, desc); +} + +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + +static int xsk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int len; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case XDP_STATISTICS: + { + struct xdp_statistics stats = {}; + bool extra_stats = true; + size_t stats_size; + + if (len < sizeof(struct xdp_statistics_v1)) { + return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } + + mutex_lock(&xs->mutex); + stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } + stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); + stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); + mutex_unlock(&xs->mutex); + + if (copy_to_user(optval, &stats, stats_size)) + return -EFAULT; + if (put_user(stats_size, optlen)) + return -EFAULT; + + return 0; + } + case XDP_MMAP_OFFSETS: + { + struct xdp_mmap_offsets off; + struct xdp_mmap_offsets_v1 off_v1; + bool flags_supported = true; + void *to_copy; + + if (len < sizeof(off_v1)) + return -EINVAL; + else if (len < sizeof(off)) + flags_supported = false; + + if (flags_supported) { + /* xdp_ring_offset is identical to xdp_ring_offset_v1 + * except for the flags field added to the end. 
+ */ + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.rx); + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.tx); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.fr); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.cr); + off.rx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.tx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.fr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + off.cr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + + len = sizeof(off); + to_copy = &off; + } else { + xsk_enter_rxtx_offsets(&off_v1.rx); + xsk_enter_rxtx_offsets(&off_v1.tx); + xsk_enter_umem_offsets(&off_v1.fr); + xsk_enter_umem_offsets(&off_v1.cr); + + len = sizeof(off_v1); + to_copy = &off_v1; + } + + if (copy_to_user(optval, to_copy, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } + case XDP_OPTIONS: + { + struct xdp_options opts = {}; + + if (len < sizeof(opts)) + return -EINVAL; + + mutex_lock(&xs->mutex); + if (xs->zc) + opts.flags |= XDP_OPTIONS_ZEROCOPY; + mutex_unlock(&xs->mutex); + + len = sizeof(opts); + if (copy_to_user(optval, &opts, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } + default: + break; + } + + return -EOPNOTSUPP; +} + +static int xsk_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct xdp_sock *xs = xdp_sk(sock->sk); + int state = READ_ONCE(xs->state); + struct xsk_queue *q = NULL; + + if (state != XSK_READY && state != XSK_BOUND) + return -EBUSY; + + if (offset == XDP_PGOFF_RX_RING) { + q = READ_ONCE(xs->rx); + } else if (offset == XDP_PGOFF_TX_RING) { + q = READ_ONCE(xs->tx); + } else { + /* Matches the smp_wmb() in XDP_UMEM_REG */ + smp_rmb(); + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : + READ_ONCE(xs->pool->fq); + else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) + q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : + READ_ONCE(xs->pool->cq); + } + + if (!q) + return -EINVAL; + + /* Matches the smp_wmb() in xsk_init_queue */ + smp_rmb(); + if (size > q->ring_vmalloc_size) + return -EINVAL; + + return remap_vmalloc_range(vma, q->ring, 0); +} + +static int xsk_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct sock *sk; + + switch (msg) { + case NETDEV_UNREGISTER: + mutex_lock(&net->xdp.lock); + sk_for_each(sk, &net->xdp.list) { + struct xdp_sock *xs = xdp_sk(sk); + + mutex_lock(&xs->mutex); + if (xs->dev == dev) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); + + xsk_unbind_dev(xs); + + /* Clear device references. 
*/ + xp_clear_dev(xs->pool); + } + mutex_unlock(&xs->mutex); + } + mutex_unlock(&net->xdp.lock); + break; + } + return NOTIFY_DONE; +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = xsk_recvmsg, + .mmap = xsk_mmap, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem, !xs->pool); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct xdp_sock *xs; + struct sock *sk; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + + sock_set_flag(sk, SOCK_RCU_FREE); + + xs = xdp_sk(sk); + xs->state = XSK_READY; + mutex_init(&xs->mutex); + spin_lock_init(&xs->rx_lock); + + INIT_LIST_HEAD(&xs->map_list); + spin_lock_init(&xs->map_list_lock); + + mutex_lock(&net->xdp.lock); + sk_add_node_rcu(sk, &net->xdp.list); + mutex_unlock(&net->xdp.lock); + + sock_prot_inuse_add(net, &xsk_proto, 1); + + return 0; +} + +static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static struct notifier_block xsk_netdev_notifier = { + .notifier_call = xsk_notifier, +}; + +static int __net_init xsk_net_init(struct net *net) +{ + mutex_init(&net->xdp.lock); + INIT_HLIST_HEAD(&net->xdp.list); + return 0; +} + +static void __net_exit xsk_net_exit(struct net *net) +{ + WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); +} + +static struct pernet_operations xsk_net_ops = { + .init = xsk_net_init, + .exit = xsk_net_exit, +}; + +static int __init xsk_init(void) +{ + int err, cpu; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + err = register_pernet_subsys(&xsk_net_ops); + if (err) + goto out_sk; + + err = register_netdevice_notifier(&xsk_netdev_notifier); + if (err) + goto out_pernet; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); + return 0; + +out_pernet: + unregister_pernet_subsys(&xsk_net_ops); +out_sk: + sock_unregister(PF_XDP); +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init); diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h new file mode 100644 index 0000000000..a4bc4749fa --- /dev/null +++ b/net/xdp/xsk.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. */ + +#ifndef XSK_H_ +#define XSK_H_ + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. 
+ */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +struct xdp_ring_offset_v1 { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets_v1 { + struct xdp_ring_offset_v1 rx; + struct xdp_ring_offset_v1 tx; + struct xdp_ring_offset_v1 fr; + struct xdp_ring_offset_v1 cr; +}; + +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock __rcu **map_entry; +}; + +static inline struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock __rcu **map_entry); +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); +int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, + u16 queue_id); + +#endif /* XSK_H_ */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 0000000000..b0a6116778 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <net/xsk_buff_pool.h> +#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> + +#include "xsk_queue.h" +#include "xdp_umem.h" +#include "xsk.h" + +void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) +{ + unsigned long flags; + + if (!xs->tx) + return; + + spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); + spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); +} + +void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) +{ + unsigned long flags; + + if (!xs->tx) + return; + + spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + list_del_rcu(&xs->tx_list); + spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + kvfree(pool->tx_descs); + kvfree(pool->heads); + kvfree(pool); +} + +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) +{ + pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), + GFP_KERNEL); + if (!pool->tx_descs) + return -ENOMEM; + + return 0; +} + +struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, + struct xdp_umem *umem) +{ + bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + u32 i, entries; + + entries = unaligned ? 
umem->chunks : 0; + pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + if (xs->tx) + if (xp_alloc_tx_descs(pool, xs)) + goto out; + + pool->chunk_mask = ~((u64)umem->chunk_size - 1); + pool->addrs_cnt = umem->size; + pool->heads_cnt = umem->chunks; + pool->free_heads_cnt = umem->chunks; + pool->headroom = umem->headroom; + pool->chunk_size = umem->chunk_size; + pool->chunk_shift = ffs(umem->chunk_size) - 1; + pool->unaligned = unaligned; + pool->frame_len = umem->chunk_size - umem->headroom - + XDP_PACKET_HEADROOM; + pool->umem = umem; + pool->addrs = umem->addrs; + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); + INIT_LIST_HEAD(&pool->xsk_tx_list); + spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); + refcount_set(&pool->users, 1); + + pool->fq = xs->fq_tmp; + pool->cq = xs->cq_tmp; + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; + INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); + if (pool->unaligned) + pool->free_heads[i] = xskb; + else + xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); + } + + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +static void xp_disable_drv_zc(struct xsk_buff_pool *pool) +{ + struct netdev_bpf bpf; + int err; + + ASSERT_RTNL(); + + if (pool->umem->zc) { + bpf.command = XDP_SETUP_XSK_POOL; + bpf.xsk.pool = NULL; + bpf.xsk.queue_id = pool->queue_id; + + err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf); + + if (err) + WARN(1, "Failed to disable zero-copy!\n"); + } +} + +#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT | \ + NETDEV_XDP_ACT_XSK_ZEROCOPY) + +int xp_assign_dev(struct xsk_buff_pool *pool, + struct net_device *netdev, u16 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err = 0; + + ASSERT_RTNL(); + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (xsk_get_pool_from_qid(netdev, queue_id)) + return -EBUSY; + + pool->netdev = netdev; + pool->queue_id = queue_id; + err = xsk_reg_pool_at_qid(netdev, pool, queue_id); + if (err) + return err; + + if (flags & XDP_USE_SG) + pool->umem->flags |= XDP_UMEM_SG_FLAG; + + if (flags & XDP_USE_NEED_WAKEUP) + pool->uses_need_wakeup = true; + /* Tx needs to be explicitly woken up the first time. Also + * for supporting drivers that do not implement this + * feature. They will always have to call sendto() or poll(). + */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; + + dev_hold(netdev); + + if (force_copy) + /* For copy-mode, we are done. 
*/ + return 0; + + if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + + bpf.command = XDP_SETUP_XSK_POOL; + bpf.xsk.pool = pool; + bpf.xsk.queue_id = queue_id; + + err = netdev->netdev_ops->ndo_bpf(netdev, &bpf); + if (err) + goto err_unreg_pool; + + if (!pool->dma_pages) { + WARN(1, "Driver did not DMA map zero-copy buffers"); + err = -EINVAL; + goto err_unreg_xsk; + } + pool->umem->zc = true; + return 0; + +err_unreg_xsk: + xp_disable_drv_zc(pool); +err_unreg_pool: + if (!force_zc) + err = 0; /* fallback to copy mode */ + if (err) { + xsk_clear_pool_at_qid(netdev, queue_id); + dev_put(netdev); + } + return err; +} + +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, + struct net_device *dev, u16 queue_id) +{ + u16 flags; + struct xdp_umem *umem = umem_xs->umem; + + /* One fill and completion ring required for each queue id. */ + if (!pool->fq || !pool->cq) + return -EINVAL; + + flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + if (umem_xs->pool->uses_need_wakeup) + flags |= XDP_USE_NEED_WAKEUP; + + return xp_assign_dev(pool, dev, queue_id, flags); +} + +void xp_clear_dev(struct xsk_buff_pool *pool) +{ + if (!pool->netdev) + return; + + xp_disable_drv_zc(pool); + xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); + dev_put(pool->netdev); + pool->netdev = NULL; +} + +static void xp_release_deferred(struct work_struct *work) +{ + struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool, + work); + + rtnl_lock(); + xp_clear_dev(pool); + rtnl_unlock(); + + if (pool->fq) { + xskq_destroy(pool->fq); + pool->fq = NULL; + } + + if (pool->cq) { + xskq_destroy(pool->cq); + pool->cq = NULL; + } + + xdp_put_umem(pool->umem, false); + xp_destroy(pool); +} + +void xp_get_pool(struct xsk_buff_pool *pool) +{ + refcount_inc(&pool->users); +} + +bool xp_put_pool(struct xsk_buff_pool *pool) +{ + if (!pool) + return false; + + if (refcount_dec_and_test(&pool->users)) { + INIT_WORK(&pool->work, xp_release_deferred); + schedule_work(&pool->work); + return true; + } + + return false; +} + +static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) +{ + struct xsk_dma_map *dma_map; + + list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) { + if (dma_map->netdev == pool->netdev) + return dma_map; + } + + return NULL; +} + +static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev, + u32 nr_pages, struct xdp_umem *umem) +{ + struct xsk_dma_map *dma_map; + + dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL); + if (!dma_map) + return NULL; + + dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); + if (!dma_map->dma_pages) { + kfree(dma_map); + return NULL; + } + + dma_map->netdev = netdev; + dma_map->dev = dev; + dma_map->dma_need_sync = false; + dma_map->dma_pages_cnt = nr_pages; + refcount_set(&dma_map->users, 1); + list_add(&dma_map->list, &umem->xsk_dma_list); + return dma_map; +} + +static void xp_destroy_dma_map(struct xsk_dma_map *dma_map) +{ + list_del(&dma_map->list); + kvfree(dma_map->dma_pages); + kfree(dma_map); +} + +static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + for (i = 0; i < dma_map->dma_pages_cnt; i++) { + dma = &dma_map->dma_pages[i]; + if (*dma) { + *dma &= ~XSK_NEXT_PG_CONTIG_MASK; + dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE, 
+ DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + xp_destroy_dma_map(dma_map); +} + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + struct xsk_dma_map *dma_map; + + if (!pool->dma_pages) + return; + + dma_map = xp_find_dma_map(pool); + if (!dma_map) { + WARN(1, "Could not find dma_map for device"); + return; + } + + if (!refcount_dec_and_test(&dma_map->users)) + return; + + __xp_dma_unmap(dma_map, attrs); + kvfree(pool->dma_pages); + pool->dma_pages = NULL; + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map) +{ + u32 i; + + for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) { + if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1]) + dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map) +{ + if (!pool->unaligned) { + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } + } + + pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dma_map->dev; + pool->dma_pages_cnt = dma_map->dma_pages_cnt; + pool->dma_need_sync = dma_map->dma_need_sync; + memcpy(pool->dma_pages, dma_map->dma_pages, + pool->dma_pages_cnt * sizeof(*pool->dma_pages)); + + return 0; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + struct xsk_dma_map *dma_map; + dma_addr_t dma; + int err; + u32 i; + + dma_map = xp_find_dma_map(pool); + if (dma_map) { + err = xp_init_dma_info(pool, dma_map); + if (err) + return err; + + refcount_inc(&dma_map->users); + return 0; + } + + dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem); + if (!dma_map) + return -ENOMEM; + + for (i = 0; i < dma_map->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + __xp_dma_unmap(dma_map, attrs); + return -ENOMEM; + } + if (dma_need_sync(dev, dma)) + dma_map->dma_need_sync = true; + dma_map->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(dma_map); + + err = xp_init_dma_info(pool, dma_map); + if (err) { + __xp_dma_unmap(dma_map, attrs); + return err; + } + + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; + return NULL; + } + + ok = pool->unaligned ? 
xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + xskq_cons_release(pool->fq); + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del_init(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + xskb->xdp.flags = 0; + + if (pool->dma_need_sync) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 i, cached_cons, nb_entries; + + if (max > pool->free_heads_cnt) + max = pool->free_heads_cnt; + max = xskq_cons_nb_entries(pool->fq, max); + + cached_cons = pool->fq->cached_cons; + nb_entries = max; + i = max; + while (i--) { + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr); + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (unlikely(!ok)) { + pool->fq->invalid_descs++; + nb_entries--; + continue; + } + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + *xdp = &xskb->xdp; + xdp++; + } + + xskq_cons_release_n(pool->fq, max); + return nb_entries; +} + +static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries) +{ + struct xdp_buff_xsk *xskb; + u32 i; + + nb_entries = min_t(u32, nb_entries, pool->free_list_cnt); + + i = nb_entries; + while (i--) { + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); + list_del_init(&xskb->free_list_node); + + *xdp = &xskb->xdp; + xdp++; + } + pool->free_list_cnt -= nb_entries; + + return nb_entries; +} + +u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 nb_entries1 = 0, nb_entries2; + + if (unlikely(pool->dma_need_sync)) { + struct xdp_buff *buff; + + /* Slow path */ + buff = xp_alloc(pool); + if (buff) + *xdp = buff; + return !!buff; + } + + if (unlikely(pool->free_list_cnt)) { + nb_entries1 = xp_alloc_reused(pool, xdp, max); + if (nb_entries1 == max) + return nb_entries1; + + max -= nb_entries1; + xdp += nb_entries1; + } + + nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max); + if (!nb_entries2) + pool->fq->queue_empty_descs++; + + return nb_entries1 + nb_entries2; +} +EXPORT_SYMBOL(xp_alloc_batch); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + if 
(!list_empty(&xskb->free_list_node)) + return; + + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) +{ + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c new file mode 100644 index 0000000000..22b36c8143 --- /dev/null +++ b/net/xdp/xsk_diag.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets monitoring support + * + * Copyright(c) 2019 Intel Corporation. + * + * Author: Björn Töpel <bjorn.topel@intel.com> + */ + +#include <linux/module.h> +#include <net/xdp_sock.h> +#include <linux/xdp_diag.h> +#include <linux/sock_diag.h> + +#include "xsk_queue.h" +#include "xsk.h" + +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_info di = {}; + + di.ifindex = xs->dev ? xs->dev->ifindex : 0; + di.queue_id = xs->queue_id; + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); +} + +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, + struct sk_buff *nlskb) +{ + struct xdp_diag_ring dr = {}; + + dr.entries = queue->nentries; + return nla_put(nlskb, nl_type, sizeof(dr), &dr); +} + +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, + struct sk_buff *nlskb) +{ + int err = 0; + + if (xs->rx) + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); + if (!err && xs->tx) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); + return err; +} + +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xsk_buff_pool *pool = xs->pool; + struct xdp_umem *umem = xs->umem; + struct xdp_diag_umem du = {}; + int err; + + if (!umem) + return 0; + + du.id = umem->id; + du.size = umem->size; + du.num_pages = umem->npgs; + du.chunk_size = umem->chunk_size; + du.headroom = umem->headroom; + du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0; + du.queue_id = pool ? pool->queue_id : 0; + du.flags = 0; + if (umem->zc) + du.flags |= XDP_DU_F_ZEROCOPY; + du.refs = refcount_read(&umem->users); + + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); + if (!err && pool && pool->fq) + err = xsk_diag_put_ring(pool->fq, + XDP_DIAG_UMEM_FILL_RING, nlskb); + if (!err && pool && pool->cq) + err = xsk_diag_put_ring(pool->cq, + XDP_DIAG_UMEM_COMPLETION_RING, nlskb); + return err; +} + +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->pool ? 
xskq_nb_queue_empty_descs(xs->pool->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, + struct xdp_diag_req *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_diag_msg *msg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), + flags); + if (!nlh) + return -EMSGSIZE; + + msg = nlmsg_data(nlh); + memset(msg, 0, sizeof(*msg)); + msg->xdiag_family = AF_XDP; + msg->xdiag_type = sk->sk_type; + msg->xdiag_ino = sk_ino; + sock_diag_save_cookie(sk, msg->xdiag_cookie); + + mutex_lock(&xs->mutex); + if (READ_ONCE(xs->state) == XSK_UNBOUND) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_INFO) && + nla_put_u32(nlskb, XDP_DIAG_UID, + from_kuid_munged(user_ns, sock_i_uid(sk)))) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && + xsk_diag_put_rings_cfg(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_UMEM) && + xsk_diag_put_umem(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + + mutex_unlock(&xs->mutex); + nlmsg_end(nlskb, nlh); + return 0; + +out_nlmsg_trim: + mutex_unlock(&xs->mutex); + nlmsg_cancel(nlskb, nlh); + return -EMSGSIZE; +} + +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) +{ + struct xdp_diag_req *req = nlmsg_data(cb->nlh); + struct net *net = sock_net(nlskb->sk); + int num = 0, s_num = cb->args[0]; + struct sock *sk; + + mutex_lock(&net->xdp.lock); + + sk_for_each(sk, &net->xdp.list) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num++ < s_num) + continue; + + if (xsk_diag_fill(sk, nlskb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + num--; + break; + } + } + + mutex_unlock(&net->xdp.lock); + cb->args[0] = num; + return nlskb->len; +} + +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) +{ + struct netlink_dump_control c = { .dump = xsk_diag_dump }; + int hdrlen = sizeof(struct xdp_diag_req); + struct net *net = sock_net(nlskb->sk); + + if (nlmsg_len(hdr) < hdrlen) + return -EINVAL; + + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) + return -EOPNOTSUPP; + + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); +} + +static const struct sock_diag_handler xsk_diag_handler = { + .family = AF_XDP, + .dump = xsk_diag_handler_dump, +}; + +static int __init xsk_diag_init(void) +{ + return sock_diag_register(&xsk_diag_handler); +} + +static void __exit xsk_diag_exit(void) +{ + sock_diag_unregister(&xsk_diag_handler); +} + +module_init(xsk_diag_init); +module_exit(xsk_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c new file mode 100644 index 0000000000..d2c2640300 --- /dev/null +++ b/net/xdp/xsk_queue.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. 
+ */ + +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/overflow.h> +#include <linux/vmalloc.h> +#include <net/xdp_sock_drv.h> + +#include "xsk_queue.h" + +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) +{ + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; + + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) +{ + struct xsk_queue *q; + size_t size; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return NULL; + + q->nentries = nentries; + q->ring_mask = nentries - 1; + + size = xskq_get_ring_size(q, umem_queue); + + /* size which is overflowing or close to SIZE_MAX will become 0 in + * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous + * is_power_of_2(), the rest will be handled by vmalloc_user() + */ + if (unlikely(size == SIZE_MAX)) { + kfree(q); + return NULL; + } + + size = PAGE_ALIGN(size); + + q->ring = vmalloc_user(size); + if (!q->ring) { + kfree(q); + return NULL; + } + + q->ring_vmalloc_size = size; + return q; +} + +void xskq_destroy(struct xsk_queue *q) +{ + if (!q) + return; + + vfree(q->ring); + kfree(q); +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h new file mode 100644 index 0000000000..13354a1e42 --- /dev/null +++ b/net/xdp/xsk_queue.h @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _LINUX_XSK_QUEUE_H +#define _LINUX_XSK_QUEUE_H + +#include <linux/types.h> +#include <linux/if_xdp.h> +#include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#include "xsk.h" + +struct xdp_ring { + u32 producer ____cacheline_aligned_in_smp; + /* Hinder the adjacent cache prefetcher to prefetch the consumer + * pointer if the producer pointer is touched and vice versa. + */ + u32 pad1 ____cacheline_aligned_in_smp; + u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; + u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; +}; + +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[] ____cacheline_aligned_in_smp; +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + u64 desc[] ____cacheline_aligned_in_smp; +}; + +struct xsk_queue { + u32 ring_mask; + u32 nentries; + u32 cached_prod; + u32 cached_cons; + struct xdp_ring *ring; + u64 invalid_descs; + u64 queue_empty_descs; + size_t ring_vmalloc_size; +}; + +struct parsed_desc { + u32 mb; + u32 valid; +}; + +/* The structure of the shared state of the rings are a simple + * circular buffer, as outlined in + * Documentation/core-api/circular-buffers.rst. For the Rx and + * completion ring, the kernel is the producer and user space is the + * consumer. For the Tx and fill rings, the kernel is the consumer and + * user space is the producer. + * + * producer consumer + * + * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) + * STORE $data LOAD $data + * STORE.rel ->producer (B) STORE.rel ->consumer (D) + * } + * + * (A) pairs with (D), and (B) pairs with (C). + * + * Starting with (B), it protects the data from being written after + * the producer pointer. If this barrier was missing, the consumer + * could observe the producer pointer being set and thus load the data + * before the producer has written the new data. 
The consumer would in + * this case load the old data. + * + * (C) protects the consumer from speculatively loading the data before + * the producer pointer actually has been read. If we do not have this + * barrier, some architectures could load old data as speculative loads + * are not discarded as the CPU does not know there is a dependency + * between ->producer and data. + * + * (A) is a control dependency that separates the load of ->consumer + * from the stores of $data. In case ->consumer indicates there is no + * room in the buffer to store $data we do not. The dependency will + * order both of the stores after the loads. So no barrier is needed. + * + * (D) protects the load of the data to be observed to happen after the + * store of the consumer pointer. If we did not have this memory + * barrier, the producer could observe the consumer pointer being set + * and overwrite the data with a new value before the consumer got the + * chance to read the old value. The consumer would thus miss reading + * the old entry and very likely read the new entry twice, once right + * now and again after circling through the ring. + */ + +/* The operations on the rings are the following: + * + * producer consumer + * + * RESERVE entries PEEK in the ring for entries + * WRITE data into the ring READ data from the ring + * SUBMIT entries RELEASE entries + * + * The producer reserves one or more entries in the ring. It can then + * fill in these entries and finally submit them so that they can be + * seen and read by the consumer. + * + * The consumer peeks into the ring to see if the producer has written + * any new entries. If so, the consumer can then read these entries + * and when it is done reading them release them back to the producer + * so that the producer can use these slots to fill in new entries. + * + * The function names below reflect these operations. + */ + +/* Functions that read and validate content from consumer rings. */ + +static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; +} + +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_cons != q->cached_prod) { + __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); + return true; + } + + return false; +} + +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 offset = desc->addr & (pool->chunk_size - 1); + + if (!desc->len) + return false; + + if (offset + desc->len > pool->chunk_size) + return false; + + if (desc->addr >= pool->addrs_cnt) + return false; + + if (xp_unused_options_set(desc->options)) + return false; + return true; +} + +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (!desc->len) + return false; + + if (desc->len > pool->chunk_size) + return false; + + if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (xp_unused_options_set(desc->options)) + return false; + return true; +} + +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? 
xp_unaligned_validate_desc(pool, desc) :
+		xp_aligned_validate_desc(pool, desc);
+}
+
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+	return q->cached_cons != q->cached_prod;
+}
+
+static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
+					   struct xdp_desc *d,
+					   struct xsk_buff_pool *pool)
+{
+	if (!xp_validate_desc(pool, d)) {
+		q->invalid_descs++;
+		return false;
+	}
+	return true;
+}
+
+static inline bool xskq_cons_read_desc(struct xsk_queue *q,
+				       struct xdp_desc *desc,
+				       struct xsk_buff_pool *pool)
+{
+	if (q->cached_cons != q->cached_prod) {
+		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+		u32 idx = q->cached_cons & q->ring_mask;
+
+		*desc = ring->desc[idx];
+		return xskq_cons_is_valid_desc(q, desc, pool);
+	}
+
+	q->queue_empty_descs++;
+	return false;
+}
+
+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_cons += cnt;
+}
+
+static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      struct xdp_desc *desc, struct parsed_desc *parsed)
+{
+	parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
+	parsed->mb = xp_mb_desc(desc);
+}
+
+static inline
+u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      u32 max)
+{
+	u32 cached_cons = q->cached_cons, nb_entries = 0;
+	struct xdp_desc *descs = pool->tx_descs;
+	u32 total_descs = 0, nr_frags = 0;
+
+	/* track first entry, if stumble upon *any* invalid descriptor, rewind
+	 * current packet that consists of frags and stop the processing
+	 */
+	while (cached_cons != q->cached_prod && nb_entries < max) {
+		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+		u32 idx = cached_cons & q->ring_mask;
+		struct parsed_desc parsed;
+
+		descs[nb_entries] = ring->desc[idx];
+		cached_cons++;
+		parse_desc(q, pool, &descs[nb_entries], &parsed);
+		if (unlikely(!parsed.valid))
+			break;
+
+		if (likely(!parsed.mb)) {
+			total_descs += (nr_frags + 1);
+			nr_frags = 0;
+		} else {
+			nr_frags++;
+			if (nr_frags == pool->netdev->xdp_zc_max_segs) {
+				nr_frags = 0;
+				break;
+			}
+		}
+		nb_entries++;
+	}
+
+	cached_cons -= nr_frags;
+	/* Release valid plus any invalid entries */
+	xskq_cons_release_n(q, cached_cons - q->cached_cons);
+	return total_descs;
+}
+
+/* Functions for consumers */
+
+static inline void __xskq_cons_release(struct xsk_queue *q)
+{
+	smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matches A */
+}
+
+static inline void __xskq_cons_peek(struct xsk_queue *q)
+{
+	/* Refresh the local pointer */
+	q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */
+}
+
+static inline void xskq_cons_get_entries(struct xsk_queue *q)
+{
+	__xskq_cons_release(q);
+	__xskq_cons_peek(q);
+}
+
+static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
+{
+	u32 entries = q->cached_prod - q->cached_cons;
+
+	if (entries >= max)
+		return max;
+
+	__xskq_cons_peek(q);
+	entries = q->cached_prod - q->cached_cons;
+
+	return entries >= max ?
max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt; +} + +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_unchecked(q, addr); +} + +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xsk_buff_pool *pool) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_desc(q, desc, pool); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ +static inline void xskq_cons_release(struct xsk_queue *q) +{ + q->cached_cons++; +} + +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} + +static inline u32 xskq_cons_present_entries(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); +} + +/* Functions for producers */ + +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + if (free_entries >= max) + return max; + + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; +} + +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_prod -= cnt; +} + +static inline int xskq_prod_reserve(struct xsk_queue *q) +{ + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + q->cached_prod++; + return 0; +} + +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; +} + +static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 nb_entries) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 i, cached_prod; + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; +} + +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len, u32 flags) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx; + + if (xskq_prod_is_full(q)) + return -ENOBUFS; + + /* A, matches D */ + idx = q->cached_prod++ & q->ring_mask; + ring->desc[idx].addr = addr; + ring->desc[idx].len = len; + ring->desc[idx].options = flags; + + return 0; +} + +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) +{ + smp_store_release(&q->ring->producer, idx); /* B, matches C */ +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) +{ + __xskq_prod_submit(q, q->ring->producer + nb_entries); +} + +static inline bool 
xskq_prod_is_empty(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); +} + +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; +} + +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? q->queue_empty_descs : 0; +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); +void xskq_destroy(struct xsk_queue *q_ops); + +#endif /* _LINUX_XSK_QUEUE_H */ diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c new file mode 100644 index 0000000000..e1c526f97c --- /dev/null +++ b/net/xdp/xskmap.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/xdp_sock.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/btf_ids.h> + +#include "xsk.h" + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock __rcu **map_entry) +{ + struct xsk_map_node *node; + + node = bpf_map_kzalloc(&map->map, sizeof(*node), + GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + bpf_map_inc(&map->map); + atomic_inc(&map->count); + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + struct xsk_map *map = node->map; + + bpf_map_put(&node->map->map); + kfree(node); + atomic_dec(&map->count); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock __rcu **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + struct xsk_map *m; + int numa_node; + u64 size; + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + spin_lock_init(&m->lock); + + return &m->map; +} + +static u64 xsk_map_mem_usage(const struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + return struct_size(m, xsk_map, map->max_entries) + + (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + synchronize_net(); + bpf_map_area_free(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock __rcu **map_entry; + struct xdp_sock *xs, *old_xs; + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xsk_map_node *node; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } + + spin_lock_bh(&m->lock); + old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + rcu_assign_pointer(*map_entry, xs); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + sockfd_put(sock); + return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; +} + +static long xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock __rcu **map_entry; + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = unrcu_pointer(xchg(map_entry, NULL)); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + + return 0; +} + +static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) +{ + return __bpf_xdp_redirect_map(map, 
index, flags, 0, + __xsk_map_lookup_elem); +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock __rcu **map_entry) +{ + spin_lock_bh(&map->lock); + if (rcu_access_pointer(*map_entry) == xs) { + rcu_assign_pointer(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + +static bool xsk_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + return meta0->max_entries == meta1->max_entries && + bpf_map_meta_equal(meta0, meta1); +} + +BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map) +const struct bpf_map_ops xsk_map_ops = { + .map_meta_equal = xsk_map_meta_equal, + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, + .map_mem_usage = xsk_map_mem_usage, + .map_btf_id = &xsk_map_btf_ids[0], + .map_redirect = xsk_map_redirect, +}; |
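
For context on the copy/zero-copy negotiation in xp_assign_dev() above (net/xdp/xsk_buff_pool.c): which branch is taken is decided by the flags user space passes at bind() time. A minimal user-space sketch, assuming only the UAPI in <linux/if_xdp.h>; the interface name and queue number are placeholders, and the UMEM/ring setup that must precede bind() is omitted.

#include <linux/if_xdp.h>
#include <net/if.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_xdp sxdp = {
		.sxdp_family = AF_XDP,
		.sxdp_ifindex = if_nametoindex("eth0"),	/* placeholder */
		.sxdp_queue_id = 0,			/* placeholder */
		/* Leaving XDP_ZEROCOPY and XDP_COPY unset lets the kernel try
		 * zero-copy and silently fall back to copy mode (the err = 0
		 * path in xp_assign_dev()); setting XDP_ZEROCOPY instead makes
		 * the -EOPNOTSUPP cases above fatal.
		 */
		.sxdp_flags = XDP_USE_NEED_WAKEUP,
	};
	int fd = socket(AF_XDP, SOCK_RAW, 0);

	if (fd < 0)
		return 1;
	/* A UMEM plus at least one RX or TX ring must be configured with
	 * setsockopt(SOL_XDP, ...) before this bind() can succeed.
	 */
	if (bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)))
		perror("bind");
	close(fd);
	return 0;
}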
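
On the driver side, xp_alloc()/xp_alloc_batch() and the xp_dma_map() setup above are normally reached through the wrappers in include/net/xdp_sock_drv.h rather than called directly. A minimal sketch of a zero-copy RX refill loop under that assumption; my_rx_refill and its slots/hw_addrs parameters are hypothetical names for the driver-specific parts.

#include <linux/types.h>
#include <net/xdp_sock_drv.h>

/* Pull up to @budget buffers out of the fill ring. Each xdp_buff returned
 * by xsk_buff_alloc() comes from __xp_alloc() above, and its DMA address
 * was established once by xp_dma_map() when the pool was bound.
 */
static u32 my_rx_refill(struct xsk_buff_pool *pool, struct xdp_buff **slots,
			dma_addr_t *hw_addrs, u32 budget)
{
	u32 i;

	for (i = 0; i < budget; i++) {
		struct xdp_buff *xdp = xsk_buff_alloc(pool);

		if (!xdp)
			break;		/* fill ring was empty */

		slots[i] = xdp;		/* kept so it can be freed later */
		hw_addrs[i] = xsk_buff_xdp_get_dma(xdp);
	}

	return i;
}

Buffers that are dropped or never posted to hardware go back onto the pool's free list through xsk_buff_free(), which ends up in xp_free() above.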
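
The monitoring code in net/xdp/xsk_diag.c above answers SOCK_DIAG dump requests over netlink. A minimal sketch of issuing such a request, assuming the UAPI in <linux/xdp_diag.h>; parsing of the reply attributes written by xsk_diag_fill() is left out.

#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/xdp_diag.h>
#include <sys/socket.h>
#include <unistd.h>

/* Returns a netlink socket with a pending AF_XDP dump, or -1 on error.
 * The caller recv()s a stream of xdp_diag_msg + XDP_DIAG_* attributes.
 */
static int xsk_diag_dump_request(void)
{
	struct {
		struct nlmsghdr nlh;
		struct xdp_diag_req req;
	} msg = {
		.nlh = {
			.nlmsg_len = sizeof(msg),
			.nlmsg_type = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family = AF_XDP,
			.xdiag_show = XDP_SHOW_INFO | XDP_SHOW_RING_CFG |
				      XDP_SHOW_UMEM | XDP_SHOW_STATS,
		},
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);

	if (fd < 0)
		return -1;
	if (send(fd, &msg, sizeof(msg), 0) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}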
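
xskq_create() above allocates each ring with vmalloc_user() precisely so user space can mmap() it through the AF_XDP socket. A minimal sketch of the user-space half for the RX ring, assuming the XDP_MMAP_OFFSETS and XDP_PGOFF_* UAPI in <linux/if_xdp.h>; the fill and completion rings follow the same pattern with their own setsockopt names and XDP_UMEM_PGOFF_* offsets.

#include <linux/if_xdp.h>
#include <sys/mman.h>
#include <sys/socket.h>

/* Size the RX ring and map it; returns the ring base or MAP_FAILED.
 * ndescs must be a power of two, matching the is_power_of_2() check done
 * before xskq_create() runs.
 */
static void *map_rx_ring(int xsk_fd, unsigned int ndescs,
			 struct xdp_mmap_offsets *off)
{
	socklen_t optlen = sizeof(*off);

	/* This count becomes q->nentries in xskq_create(). */
	if (setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &ndescs, sizeof(ndescs)))
		return MAP_FAILED;

	/* Where producer, consumer and the descriptor array live. */
	if (getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen))
		return MAP_FAILED;

	return mmap(NULL, off->rx.desc + ndescs * sizeof(struct xdp_desc),
		    PROT_READ | PROT_WRITE, MAP_SHARED, xsk_fd,
		    XDP_PGOFF_RX_RING);
}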
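
The memory-ordering comment in xsk_queue.h above (pairings A/B on the producer side, C/D on the consumer side) applies equally to user space, which produces into the fill and TX rings and consumes from the RX and completion rings. A minimal user-space sketch using the GCC/Clang __atomic builtins; struct ring and its layout are this sketch's assumption assembled from xdp_mmap_offsets, not a UAPI type, and the acquire load in the producer stands in for the control dependency the kernel relies on at (A).

#include <linux/if_xdp.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical view of one mmap'ed ring, built from xdp_mmap_offsets. */
struct ring {
	uint32_t *producer;	/* shared with the kernel */
	uint32_t *consumer;	/* shared with the kernel */
	void *desc;		/* struct xdp_desc[] or uint64_t[] */
	uint32_t mask;		/* nentries - 1 */
};

/* RX ring consumer: LOAD.acq ->producer (C), read the descriptor, then
 * STORE.rel ->consumer (D) so the kernel may reuse the slot.
 */
static bool rx_ring_consume(struct ring *rx, struct xdp_desc *out)
{
	uint32_t cons = *rx->consumer;	/* only user space writes this */
	uint32_t prod = __atomic_load_n(rx->producer, __ATOMIC_ACQUIRE);

	if (cons == prod)
		return false;

	*out = ((struct xdp_desc *)rx->desc)[cons & rx->mask];
	__atomic_store_n(rx->consumer, cons + 1, __ATOMIC_RELEASE);
	return true;
}

/* Fill ring producer: check ->consumer for room (A), write the address,
 * then STORE.rel ->producer (B) to publish it.
 */
static bool fill_ring_produce(struct ring *fq, uint64_t addr, uint32_t nentries)
{
	uint32_t prod = *fq->producer;	/* only user space writes this */
	uint32_t cons = __atomic_load_n(fq->consumer, __ATOMIC_ACQUIRE);

	if (prod - cons == nentries)	/* ring full */
		return false;

	((uint64_t *)fq->desc)[prod & fq->mask] = addr;
	__atomic_store_n(fq->producer, prod + 1, __ATOMIC_RELEASE);
	return true;
}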
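
Finally, the XSKMAP above is what connects an XDP program's verdict to a particular socket: the program calls bpf_redirect_map() and __xsk_map_lookup_elem() resolves the entry, while xsk_map_update_elem() runs when user space stores a socket file descriptor in the map. A minimal BPF-side sketch; the map name, its size and the convention of indexing by RX queue are choices of this example, not something the code above mandates.

/* Compiled with clang -target bpf, using libbpf's BTF-defined map syntax. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");

SEC("xdp")
int xsk_redirect(struct xdp_md *ctx)
{
	/* One socket per RX queue; XDP_PASS is returned for queues with no
	 * socket installed, mirroring the NULL lookup in xsk_map_redirect().
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}

char _license[] SEC("license") = "GPL";

From user space the socket is installed with bpf_map_update_elem(map_fd, &queue_id, &xsk_fd, 0), which is the path that lands in xsk_map_update_elem() above.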