summaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox/mlx5/core/en/xsk
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en/xsk')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c230
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h27
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c311
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h23
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c191
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h21
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c123
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h15
8 files changed, 941 insertions, 0 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
new file mode 100644
index 000000000..ebada0c5a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. */
+
+#include <net/xdp_sock_drv.h>
+#include "pool.h"
+#include "setup.h"
+#include "en/params.h"
+
+static int mlx5e_xsk_map_pool(struct mlx5e_priv *priv,
+ struct xsk_buff_pool *pool)
+{
+ struct device *dev = mlx5_core_dma_dev(priv->mdev);
+
+ return xsk_pool_dma_map(pool, dev, DMA_ATTR_SKIP_CPU_SYNC);
+}
+
+static void mlx5e_xsk_unmap_pool(struct mlx5e_priv *priv,
+ struct xsk_buff_pool *pool)
+{
+ return xsk_pool_dma_unmap(pool, DMA_ATTR_SKIP_CPU_SYNC);
+}
+
+static int mlx5e_xsk_get_pools(struct mlx5e_xsk *xsk)
+{
+ if (!xsk->pools) {
+ xsk->pools = kcalloc(MLX5E_MAX_NUM_CHANNELS,
+ sizeof(*xsk->pools), GFP_KERNEL);
+ if (unlikely(!xsk->pools))
+ return -ENOMEM;
+ }
+
+ xsk->refcnt++;
+ xsk->ever_used = true;
+
+ return 0;
+}
+
+static void mlx5e_xsk_put_pools(struct mlx5e_xsk *xsk)
+{
+ if (!--xsk->refcnt) {
+ kfree(xsk->pools);
+ xsk->pools = NULL;
+ }
+}
+
+static int mlx5e_xsk_add_pool(struct mlx5e_xsk *xsk, struct xsk_buff_pool *pool, u16 ix)
+{
+ int err;
+
+ err = mlx5e_xsk_get_pools(xsk);
+ if (unlikely(err))
+ return err;
+
+ xsk->pools[ix] = pool;
+ return 0;
+}
+
+static void mlx5e_xsk_remove_pool(struct mlx5e_xsk *xsk, u16 ix)
+{
+ xsk->pools[ix] = NULL;
+
+ mlx5e_xsk_put_pools(xsk);
+}
+
+static bool mlx5e_xsk_is_pool_sane(struct xsk_buff_pool *pool)
+{
+ return xsk_pool_get_headroom(pool) <= 0xffff &&
+ xsk_pool_get_chunk_size(pool) <= 0xffff;
+}
+
+void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk)
+{
+ xsk->headroom = xsk_pool_get_headroom(pool);
+ xsk->chunk_size = xsk_pool_get_chunk_size(pool);
+ xsk->unaligned = pool->unaligned;
+}
+
+static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
+ struct xsk_buff_pool *pool, u16 ix)
+{
+ struct mlx5e_params *params = &priv->channels.params;
+ struct mlx5e_xsk_param xsk;
+ struct mlx5e_channel *c;
+ int err;
+
+ if (unlikely(mlx5e_xsk_get_pool(&priv->channels.params, &priv->xsk, ix)))
+ return -EBUSY;
+
+ if (unlikely(!mlx5e_xsk_is_pool_sane(pool)))
+ return -EINVAL;
+
+ err = mlx5e_xsk_map_pool(priv, pool);
+ if (unlikely(err))
+ return err;
+
+ err = mlx5e_xsk_add_pool(&priv->xsk, pool, ix);
+ if (unlikely(err))
+ goto err_unmap_pool;
+
+ mlx5e_build_xsk_param(pool, &xsk);
+
+ if (priv->channels.params.rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
+ mlx5e_mpwrq_umr_mode(priv->mdev, &xsk) == MLX5E_MPWRQ_UMR_MODE_OVERSIZED) {
+ const char *recommendation = is_power_of_2(xsk.chunk_size) ?
+ "Upgrade firmware" : "Disable striding RQ";
+
+ mlx5_core_warn(priv->mdev, "Expected slowdown with XSK frame size %u. %s for better performance.\n",
+ xsk.chunk_size, recommendation);
+ }
+
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+ /* XSK objects will be created on open. */
+ goto validate_closed;
+ }
+
+ if (!params->xdp_prog) {
+ /* XSK objects will be created when an XDP program is set,
+ * and the channels are reopened.
+ */
+ goto validate_closed;
+ }
+
+ c = priv->channels.c[ix];
+
+ err = mlx5e_open_xsk(priv, params, &xsk, pool, c);
+ if (unlikely(err))
+ goto err_remove_pool;
+
+ mlx5e_activate_xsk(c);
+ mlx5e_trigger_napi_icosq(c);
+
+ /* Don't wait for WQEs, because the newer xdpsock sample doesn't provide
+ * any Fill Ring entries at the setup stage.
+ */
+
+ mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, true);
+
+ mlx5e_deactivate_rq(&c->rq);
+ mlx5e_flush_rq(&c->rq, MLX5_RQC_STATE_RDY);
+
+ return 0;
+
+err_remove_pool:
+ mlx5e_xsk_remove_pool(&priv->xsk, ix);
+
+err_unmap_pool:
+ mlx5e_xsk_unmap_pool(priv, pool);
+
+ return err;
+
+validate_closed:
+ /* Check the configuration in advance, rather than fail at a later stage
+ * (in mlx5e_xdp_set or on open) and end up with no channels.
+ */
+ if (!mlx5e_validate_xsk_param(params, &xsk, priv->mdev)) {
+ err = -EINVAL;
+ goto err_remove_pool;
+ }
+
+ return 0;
+}
+
+static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix)
+{
+ struct xsk_buff_pool *pool = mlx5e_xsk_get_pool(&priv->channels.params,
+ &priv->xsk, ix);
+ struct mlx5e_channel *c;
+
+ if (unlikely(!pool))
+ return -EINVAL;
+
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+ goto remove_pool;
+
+ /* XSK RQ and SQ are only created if XDP program is set. */
+ if (!priv->channels.params.xdp_prog)
+ goto remove_pool;
+
+ c = priv->channels.c[ix];
+
+ mlx5e_activate_rq(&c->rq);
+ mlx5e_trigger_napi_icosq(c);
+ mlx5e_wait_for_min_rx_wqes(&c->rq, MLX5E_RQ_WQES_TIMEOUT);
+
+ mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, false);
+
+ mlx5e_deactivate_xsk(c);
+ mlx5e_close_xsk(c);
+
+remove_pool:
+ mlx5e_xsk_remove_pool(&priv->xsk, ix);
+ mlx5e_xsk_unmap_pool(priv, pool);
+
+ return 0;
+}
+
+static int mlx5e_xsk_enable_pool(struct mlx5e_priv *priv, struct xsk_buff_pool *pool,
+ u16 ix)
+{
+ int err;
+
+ mutex_lock(&priv->state_lock);
+ err = mlx5e_xsk_enable_locked(priv, pool, ix);
+ mutex_unlock(&priv->state_lock);
+
+ return err;
+}
+
+static int mlx5e_xsk_disable_pool(struct mlx5e_priv *priv, u16 ix)
+{
+ int err;
+
+ mutex_lock(&priv->state_lock);
+ err = mlx5e_xsk_disable_locked(priv, ix);
+ mutex_unlock(&priv->state_lock);
+
+ return err;
+}
+
+int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_params *params = &priv->channels.params;
+
+ if (unlikely(qid >= params->num_channels))
+ return -EINVAL;
+
+ return pool ? mlx5e_xsk_enable_pool(priv, pool, qid) :
+ mlx5e_xsk_disable_pool(priv, qid);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h
new file mode 100644
index 000000000..dca0010a0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. */
+
+#ifndef __MLX5_EN_XSK_POOL_H__
+#define __MLX5_EN_XSK_POOL_H__
+
+#include "en.h"
+
+static inline struct xsk_buff_pool *mlx5e_xsk_get_pool(struct mlx5e_params *params,
+ struct mlx5e_xsk *xsk, u16 ix)
+{
+ if (!xsk || !xsk->pools)
+ return NULL;
+
+ if (unlikely(ix >= params->num_channels))
+ return NULL;
+
+ return xsk->pools[ix];
+}
+
+struct mlx5e_xsk_param;
+void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk);
+
+/* .ndo_bpf callback. */
+int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid);
+
+#endif /* __MLX5_EN_XSK_POOL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
new file mode 100644
index 000000000..c91b54d9f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "rx.h"
+#include "en/xdp.h"
+#include <net/xdp_sock_drv.h>
+#include <linux/filter.h>
+
+/* RX data path */
+
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
+{
+ struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix);
+ struct mlx5e_icosq *icosq = rq->icosq;
+ struct mlx5_wq_cyc *wq = &icosq->wq;
+ struct mlx5e_umr_wqe *umr_wqe;
+ int batch, i;
+ u32 offset; /* 17-bit value with MTT. */
+ u16 pi;
+
+ if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, rq->mpwqe.pages_per_wqe)))
+ goto err;
+
+ BUILD_BUG_ON(sizeof(wi->alloc_units[0]) != sizeof(wi->alloc_units[0].xsk));
+ batch = xsk_buff_alloc_batch(rq->xsk_pool, (struct xdp_buff **)wi->alloc_units,
+ rq->mpwqe.pages_per_wqe);
+
+ /* If batch < pages_per_wqe, either:
+ * 1. Some (or all) descriptors were invalid.
+ * 2. dma_need_sync is true, and it fell back to allocating one frame.
+ * In either case, try to continue allocating frames one by one, until
+ * the first error, which will mean there are no more valid descriptors.
+ */
+ for (; batch < rq->mpwqe.pages_per_wqe; batch++) {
+ wi->alloc_units[batch].xsk = xsk_buff_alloc(rq->xsk_pool);
+ if (unlikely(!wi->alloc_units[batch].xsk))
+ goto err_reuse_batch;
+ }
+
+ pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs);
+ umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+ memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
+
+ if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) {
+ for (i = 0; i < batch; i++) {
+ dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+ umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+ .ptag = cpu_to_be64(addr | MLX5_EN_WR),
+ };
+ }
+ } else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
+ for (i = 0; i < batch; i++) {
+ dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+ umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(addr),
+ };
+ }
+ } else if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)) {
+ u32 mapping_size = 1 << (rq->mpwqe.page_shift - 2);
+
+ for (i = 0; i < batch; i++) {
+ dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+ umr_wqe->inline_ksms[i << 2] = (struct mlx5_ksm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(addr),
+ };
+ umr_wqe->inline_ksms[(i << 2) + 1] = (struct mlx5_ksm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(addr + mapping_size),
+ };
+ umr_wqe->inline_ksms[(i << 2) + 2] = (struct mlx5_ksm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(addr + mapping_size * 2),
+ };
+ umr_wqe->inline_ksms[(i << 2) + 3] = (struct mlx5_ksm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(rq->wqe_overflow.addr),
+ };
+ }
+ } else {
+ __be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
+ rq->xsk_pool->chunk_size);
+ __be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size);
+
+ for (i = 0; i < batch; i++) {
+ dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+ umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(addr),
+ .bcount = frame_size,
+ };
+ umr_wqe->inline_klms[(i << 1) + 1] = (struct mlx5_klm) {
+ .key = rq->mkey_be,
+ .va = cpu_to_be64(rq->wqe_overflow.addr),
+ .bcount = pad_size,
+ };
+ }
+ }
+
+ bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
+ wi->consumed_strides = 0;
+
+ umr_wqe->ctrl.opmod_idx_opcode =
+ cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR);
+
+ /* Optimized for speed: keep in sync with mlx5e_mpwrq_umr_entry_size. */
+ offset = ix * rq->mpwqe.mtts_per_wqe;
+ if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
+ offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
+ else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_OVERSIZED))
+ offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD;
+ else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE))
+ offset = offset * sizeof(struct mlx5_ksm) * 4 / MLX5_OCTWORD;
+ umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
+
+ icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
+ .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX,
+ .num_wqebbs = rq->mpwqe.umr_wqebbs,
+ .umr.rq = rq,
+ };
+
+ icosq->pc += rq->mpwqe.umr_wqebbs;
+
+ icosq->doorbell_cseg = &umr_wqe->ctrl;
+
+ return 0;
+
+err_reuse_batch:
+ while (--batch >= 0)
+ xsk_buff_free(wi->alloc_units[batch].xsk);
+
+err:
+ rq->stats->buff_alloc_err++;
+ return -ENOMEM;
+}
+
+int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+ struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+ struct xdp_buff **buffs;
+ u32 contig, alloc;
+ int i;
+
+ /* mlx5e_init_frags_partition creates a 1:1 mapping between
+ * rq->wqe.frags and rq->wqe.alloc_units, which allows us to
+ * allocate XDP buffers straight into alloc_units.
+ */
+ BUILD_BUG_ON(sizeof(rq->wqe.alloc_units[0]) !=
+ sizeof(rq->wqe.alloc_units[0].xsk));
+ buffs = (struct xdp_buff **)rq->wqe.alloc_units;
+ contig = mlx5_wq_cyc_get_size(wq) - ix;
+ if (wqe_bulk <= contig) {
+ alloc = xsk_buff_alloc_batch(rq->xsk_pool, buffs + ix, wqe_bulk);
+ } else {
+ alloc = xsk_buff_alloc_batch(rq->xsk_pool, buffs + ix, contig);
+ if (likely(alloc == contig))
+ alloc += xsk_buff_alloc_batch(rq->xsk_pool, buffs, wqe_bulk - contig);
+ }
+
+ for (i = 0; i < alloc; i++) {
+ int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
+ struct mlx5e_wqe_frag_info *frag;
+ struct mlx5e_rx_wqe_cyc *wqe;
+ dma_addr_t addr;
+
+ wqe = mlx5_wq_cyc_get_wqe(wq, j);
+ /* Assumes log_num_frags == 0. */
+ frag = &rq->wqe.frags[j];
+
+ addr = xsk_buff_xdp_get_frame_dma(frag->au->xsk);
+ wqe->data[0].addr = cpu_to_be64(addr + rq->buff.headroom);
+ }
+
+ return alloc;
+}
+
+int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+ struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+ int i;
+
+ for (i = 0; i < wqe_bulk; i++) {
+ int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
+ struct mlx5e_wqe_frag_info *frag;
+ struct mlx5e_rx_wqe_cyc *wqe;
+ dma_addr_t addr;
+
+ wqe = mlx5_wq_cyc_get_wqe(wq, j);
+ /* Assumes log_num_frags == 0. */
+ frag = &rq->wqe.frags[j];
+
+ frag->au->xsk = xsk_buff_alloc(rq->xsk_pool);
+ if (unlikely(!frag->au->xsk))
+ return i;
+
+ addr = xsk_buff_xdp_get_frame_dma(frag->au->xsk);
+ wqe->data[0].addr = cpu_to_be64(addr + rq->buff.headroom);
+ }
+
+ return wqe_bulk;
+}
+
+static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_buff *xdp)
+{
+ u32 totallen = xdp->data_end - xdp->data_meta;
+ u32 metalen = xdp->data - xdp->data_meta;
+ struct sk_buff *skb;
+
+ skb = napi_alloc_skb(rq->cq.napi, totallen);
+ if (unlikely(!skb)) {
+ rq->stats->buff_alloc_err++;
+ return NULL;
+ }
+
+ skb_put_data(skb, xdp->data_meta, totallen);
+
+ if (metalen) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ return skb;
+}
+
+struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
+ struct mlx5e_mpw_info *wi,
+ u16 cqe_bcnt,
+ u32 head_offset,
+ u32 page_idx)
+{
+ struct xdp_buff *xdp = wi->alloc_units[page_idx].xsk;
+ struct bpf_prog *prog;
+
+ /* Check packet size. Note LRO doesn't use linear SKB */
+ if (unlikely(cqe_bcnt > rq->hw_mtu)) {
+ rq->stats->oversize_pkts_sw_drop++;
+ return NULL;
+ }
+
+ /* head_offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, in
+ * the current implementation, UMR pages are mapped to XSK frames, so
+ * head_offset should always be 0.
+ */
+ WARN_ON_ONCE(head_offset);
+
+ xsk_buff_set_size(xdp, cqe_bcnt);
+ xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
+ net_prefetch(xdp->data);
+
+ /* Possible flows:
+ * - XDP_REDIRECT to XSKMAP:
+ * The page is owned by the userspace from now.
+ * - XDP_TX and other XDP_REDIRECTs:
+ * The page was returned by ZCA and recycled.
+ * - XDP_DROP:
+ * Recycle the page.
+ * - XDP_PASS:
+ * Allocate an SKB, copy the data and recycle the page.
+ *
+ * Pages to be recycled go to the Reuse Ring on MPWQE deallocation. Its
+ * size is the same as the Driver RX Ring's size, and pages for WQEs are
+ * allocated first from the Reuse Ring, so it has enough space.
+ */
+
+ prog = rcu_dereference(rq->xdp_prog);
+ if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp))) {
+ if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)))
+ __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
+ return NULL; /* page/packet was consumed by XDP */
+ }
+
+ /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
+ * frame. On SKB allocation failure, NULL is returned.
+ */
+ return mlx5e_xsk_construct_skb(rq, xdp);
+}
+
+struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
+ struct mlx5e_wqe_frag_info *wi,
+ u32 cqe_bcnt)
+{
+ struct xdp_buff *xdp = wi->au->xsk;
+ struct bpf_prog *prog;
+
+ /* wi->offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, the
+ * XSK allocator allocates frames per packet, instead of pages, so
+ * wi->offset should always be 0.
+ */
+ WARN_ON_ONCE(wi->offset);
+
+ xsk_buff_set_size(xdp, cqe_bcnt);
+ xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
+ net_prefetch(xdp->data);
+
+ prog = rcu_dereference(rq->xdp_prog);
+ if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp)))
+ return NULL; /* page/packet was consumed by XDP */
+
+ /* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
+ * will be handled by mlx5e_free_rx_wqe.
+ * On SKB allocation failure, NULL is returned.
+ */
+ return mlx5e_xsk_construct_skb(rq, xdp);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
new file mode 100644
index 000000000..087c943bd
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_RX_H__
+#define __MLX5_EN_XSK_RX_H__
+
+#include "en.h"
+
+/* RX data path */
+
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
+int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
+int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
+struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
+ struct mlx5e_mpw_info *wi,
+ u16 cqe_bcnt,
+ u32 head_offset,
+ u32 page_idx);
+struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
+ struct mlx5e_wqe_frag_info *wi,
+ u32 cqe_bcnt);
+
+#endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
new file mode 100644
index 000000000..ff03c4383
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "setup.h"
+#include "en/params.h"
+#include "en/txrx.h"
+#include "en/health.h"
+#include <net/xdp_sock_drv.h>
+
+/* The limitation of 2048 can be altered, but shouldn't go beyond the minimal
+ * stride size of striding RQ.
+ */
+#define MLX5E_MIN_XSK_CHUNK_SIZE max(2048, XDP_UMEM_MIN_CHUNK_SIZE)
+
+bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5_core_dev *mdev)
+{
+ /* AF_XDP doesn't support frames larger than PAGE_SIZE. */
+ if (xsk->chunk_size > PAGE_SIZE || xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE)
+ return false;
+
+ /* frag_sz is different for regular and XSK RQs, so ensure that linear
+ * SKB mode is possible.
+ */
+ switch (params->rq_wq_type) {
+ case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+ return !mlx5e_mpwrq_validate_xsk(mdev, params, xsk);
+ default: /* MLX5_WQ_TYPE_CYCLIC */
+ return mlx5e_rx_is_linear_skb(mdev, params, xsk);
+ }
+}
+
+static void mlx5e_build_xsk_cparam(struct mlx5_core_dev *mdev,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ u16 q_counter,
+ struct mlx5e_channel_param *cparam)
+{
+ mlx5e_build_rq_param(mdev, params, xsk, q_counter, &cparam->rq);
+ mlx5e_build_xdpsq_param(mdev, params, xsk, &cparam->xdp_sq);
+}
+
+static int mlx5e_init_xsk_rq(struct mlx5e_channel *c,
+ struct mlx5e_params *params,
+ struct xsk_buff_pool *pool,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_rq *rq)
+{
+ struct mlx5_core_dev *mdev = c->mdev;
+ int rq_xdp_ix;
+ int err;
+
+ rq->wq_type = params->rq_wq_type;
+ rq->pdev = c->pdev;
+ rq->netdev = c->netdev;
+ rq->priv = c->priv;
+ rq->tstamp = c->tstamp;
+ rq->clock = &mdev->clock;
+ rq->icosq = &c->icosq;
+ rq->ix = c->ix;
+ rq->channel = c;
+ rq->mdev = mdev;
+ rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+ rq->xdpsq = &c->rq_xdpsq;
+ rq->xsk_pool = pool;
+ rq->stats = &c->priv->channel_stats[c->ix]->xskrq;
+ rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev);
+ rq_xdp_ix = c->ix;
+ err = mlx5e_rq_set_handlers(rq, params, xsk);
+ if (err)
+ return err;
+
+ return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0);
+}
+
+static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_rq_param *rq_params, struct xsk_buff_pool *pool,
+ struct mlx5e_xsk_param *xsk)
+{
+ int err;
+
+ err = mlx5e_init_xsk_rq(c, params, pool, xsk, &c->xskrq);
+ if (err)
+ return err;
+
+ return mlx5e_open_rq(params, rq_params, xsk, cpu_to_node(c->cpu), &c->xskrq);
+}
+
+int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool,
+ struct mlx5e_channel *c)
+{
+ struct mlx5e_channel_param *cparam;
+ struct mlx5e_create_cq_param ccp;
+ int err;
+
+ mlx5e_build_create_cq_param(&ccp, c);
+
+ if (!mlx5e_validate_xsk_param(params, xsk, priv->mdev))
+ return -EINVAL;
+
+ cparam = kvzalloc(sizeof(*cparam), GFP_KERNEL);
+ if (!cparam)
+ return -ENOMEM;
+
+ mlx5e_build_xsk_cparam(priv->mdev, params, xsk, priv->q_counter, cparam);
+
+ err = mlx5e_open_cq(c->priv, params->rx_cq_moderation, &cparam->rq.cqp, &ccp,
+ &c->xskrq.cq);
+ if (unlikely(err))
+ goto err_free_cparam;
+
+ err = mlx5e_open_xsk_rq(c, params, &cparam->rq, pool, xsk);
+ if (unlikely(err))
+ goto err_close_rx_cq;
+
+ err = mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->xdp_sq.cqp, &ccp,
+ &c->xsksq.cq);
+ if (unlikely(err))
+ goto err_close_rq;
+
+ /* Create a separate SQ, so that when the buff pool is disabled, we could
+ * close this SQ safely and stop receiving CQEs. In other case, e.g., if
+ * the XDPSQ was used instead, we might run into trouble when the buff pool
+ * is disabled and then re-enabled, but the SQ continues receiving CQEs
+ * from the old buff pool.
+ */
+ err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, pool, &c->xsksq, true);
+ if (unlikely(err))
+ goto err_close_tx_cq;
+
+ kvfree(cparam);
+
+ set_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
+
+ return 0;
+
+err_close_tx_cq:
+ mlx5e_close_cq(&c->xsksq.cq);
+
+err_close_rq:
+ mlx5e_close_rq(&c->xskrq);
+
+err_close_rx_cq:
+ mlx5e_close_cq(&c->xskrq.cq);
+
+err_free_cparam:
+ kvfree(cparam);
+
+ return err;
+}
+
+void mlx5e_close_xsk(struct mlx5e_channel *c)
+{
+ clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
+ synchronize_net(); /* Sync with NAPI. */
+
+ mlx5e_close_rq(&c->xskrq);
+ mlx5e_close_cq(&c->xskrq.cq);
+ mlx5e_close_xdpsq(&c->xsksq);
+ mlx5e_close_cq(&c->xsksq.cq);
+
+ memset(&c->xskrq, 0, sizeof(c->xskrq));
+ memset(&c->xsksq, 0, sizeof(c->xsksq));
+}
+
+void mlx5e_activate_xsk(struct mlx5e_channel *c)
+{
+ /* ICOSQ recovery deactivates RQs. Suspend the recovery to avoid
+ * activating XSKRQ in the middle of recovery.
+ */
+ mlx5e_reporter_icosq_suspend_recovery(c);
+ set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
+ mlx5e_reporter_icosq_resume_recovery(c);
+
+ /* TX queue is created active. */
+}
+
+void mlx5e_deactivate_xsk(struct mlx5e_channel *c)
+{
+ /* ICOSQ recovery may reactivate XSKRQ if clear_bit is called in the
+ * middle of recovery. Suspend the recovery to avoid it.
+ */
+ mlx5e_reporter_icosq_suspend_recovery(c);
+ clear_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
+ mlx5e_reporter_icosq_resume_recovery(c);
+ synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */
+
+ /* TX queue is disabled on close. */
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h
new file mode 100644
index 000000000..50e111b85
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_SETUP_H__
+#define __MLX5_EN_XSK_SETUP_H__
+
+#include "en.h"
+
+struct mlx5e_xsk_param;
+
+bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5_core_dev *mdev);
+int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool,
+ struct mlx5e_channel *c);
+void mlx5e_close_xsk(struct mlx5e_channel *c);
+void mlx5e_activate_xsk(struct mlx5e_channel *c);
+void mlx5e_deactivate_xsk(struct mlx5e_channel *c);
+
+#endif /* __MLX5_EN_XSK_SETUP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
new file mode 100644
index 000000000..367a9505c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "tx.h"
+#include "pool.h"
+#include "en/xdp.h"
+#include "en/params.h"
+#include <net/xdp_sock_drv.h>
+
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_params *params = &priv->channels.params;
+ struct mlx5e_channel *c;
+
+ if (unlikely(!mlx5e_xdp_is_active(priv)))
+ return -ENETDOWN;
+
+ if (unlikely(qid >= params->num_channels))
+ return -EINVAL;
+
+ c = priv->channels.c[qid];
+
+ if (!napi_if_scheduled_mark_missed(&c->napi)) {
+ /* To avoid WQE overrun, don't post a NOP if async_icosq is not
+ * active and not polled by NAPI. Return 0, because the upcoming
+ * activate will trigger the IRQ for us.
+ */
+ if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state)))
+ return 0;
+
+ if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state))
+ return 0;
+
+ mlx5e_trigger_napi_icosq(c);
+ }
+
+ return 0;
+}
+
+/* When TX fails (because of the size of the packet), we need to get completions
+ * in order, so post a NOP to get a CQE. Since AF_XDP doesn't distinguish
+ * between successful TX and errors, handling in mlx5e_poll_xdpsq_cq is the
+ * same.
+ */
+static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq,
+ struct mlx5e_xdp_info *xdpi)
+{
+ u16 pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
+ struct mlx5e_xdp_wqe_info *wi = &sq->db.wqe_info[pi];
+ struct mlx5e_tx_wqe *nopwqe;
+
+ wi->num_wqebbs = 1;
+ wi->num_pkts = 1;
+
+ nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
+ sq->doorbell_cseg = &nopwqe->ctrl;
+}
+
+bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
+{
+ struct xsk_buff_pool *pool = sq->xsk_pool;
+ struct mlx5e_xmit_data xdptxd;
+ struct mlx5e_xdp_info xdpi;
+ bool work_done = true;
+ bool flush = false;
+
+ xdpi.mode = MLX5E_XDP_XMIT_MODE_XSK;
+
+ for (; budget; budget--) {
+ int check_result = INDIRECT_CALL_2(sq->xmit_xdp_frame_check,
+ mlx5e_xmit_xdp_frame_check_mpwqe,
+ mlx5e_xmit_xdp_frame_check,
+ sq);
+ struct xdp_desc desc;
+ bool ret;
+
+ if (unlikely(check_result < 0)) {
+ work_done = false;
+ break;
+ }
+
+ if (!xsk_tx_peek_desc(pool, &desc)) {
+ /* TX will get stuck until something wakes it up by
+ * triggering NAPI. Currently it's expected that the
+ * application calls sendto() if there are consumed, but
+ * not completed frames.
+ */
+ break;
+ }
+
+ xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr);
+ xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr);
+ xdptxd.len = desc.len;
+
+ xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len);
+
+ ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
+ mlx5e_xmit_xdp_frame, sq, &xdptxd, NULL,
+ check_result);
+ if (unlikely(!ret)) {
+ if (sq->mpwqe.wqe)
+ mlx5e_xdp_mpwqe_complete(sq);
+
+ mlx5e_xsk_tx_post_err(sq, &xdpi);
+ } else {
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, &xdpi);
+ }
+
+ flush = true;
+ }
+
+ if (flush) {
+ if (sq->mpwqe.wqe)
+ mlx5e_xdp_mpwqe_complete(sq);
+ mlx5e_xmit_xdp_doorbell(sq);
+
+ xsk_tx_release(pool);
+ }
+
+ return !(budget && work_done);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
new file mode 100644
index 000000000..9c505158b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_TX_H__
+#define __MLX5_EN_XSK_TX_H__
+
+#include "en.h"
+
+/* TX data path */
+
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
+
+bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
+
+#endif /* __MLX5_EN_XSK_TX_H__ */