Diffstat (limited to 'src/spdk/dpdk/drivers/net/mlx5')
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/Makefile                  |   77
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/meson.build               |   54
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5.c                    | 3814
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5.h                    |  848
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h               |  188
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c             | 2071
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c               | 6204
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h               | 1034
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c            | 9666
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c         | 1292
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c         | 1987
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c                |  255
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c                 |  211
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c                 |  551
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h                 |   39
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c                |  229
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c             |  174
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c                | 2976
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c               | 5691
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h               |  683
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c           |  170
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h           |  125
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h   | 1114
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h      |  780
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h       |  731
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c             |  230
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c              |  589
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c            |  579
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c                | 1470
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c              |  484
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h              |  423
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c               |  327
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h            |   35
-rw-r--r--  src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map  |   10
34 files changed, 45111 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/net/mlx5/Makefile b/src/spdk/dpdk/drivers/net/mlx5/Makefile
new file mode 100644
index 000000000..2577ee5e5
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/Makefile
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2015 6WIND S.A.
+# Copyright 2015 Mellanox Technologies, Ltd
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# Library name.
+LIB = librte_pmd_mlx5.a
+
+# Sources.
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c
+ifneq ($(filter y,$(CONFIG_RTE_ARCH_X86_64) \
+ $(CONFIG_RTE_ARCH_PPC_64) \
+ $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec.c
+endif
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxmode.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_meter.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mp.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_utils.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
+
+# Basic CFLAGS.
+CFLAGS += -O3
+CFLAGS += -std=c11 -Wall -Wextra
+CFLAGS += -g
+CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5
+CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5
+CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5
+CFLAGS += -D_BSD_SOURCE
+CFLAGS += -D_DEFAULT_SOURCE
+CFLAGS += -D_XOPEN_SOURCE=600
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -Wno-strict-prototypes
+LDLIBS += -lrte_common_mlx5
+LDLIBS += -lm
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_pci
+
+# A few warnings cannot be avoided in external headers.
+CFLAGS += -Wno-error=cast-qual
+
+EXPORT_MAP := rte_pmd_mlx5_version.map
+
+# DEBUG which is usually provided on the command-line may enable
+# CONFIG_RTE_LIBRTE_MLX5_DEBUG.
+ifeq ($(DEBUG),1)
+CONFIG_RTE_LIBRTE_MLX5_DEBUG := y
+endif
+
+# User-defined CFLAGS.
+ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DEBUG),y)
+CFLAGS += -pedantic
+ifneq ($(CONFIG_RTE_TOOLCHAIN_ICC),y)
+CFLAGS += -DPEDANTIC
+endif
+AUTO_CONFIG_CFLAGS += -Wno-pedantic
+else
+CFLAGS += -UPEDANTIC
+endif
+
+include $(RTE_SDK)/mk/rte.lib.mk
+
diff --git a/src/spdk/dpdk/drivers/net/mlx5/meson.build b/src/spdk/dpdk/drivers/net/mlx5/meson.build
new file mode 100644
index 000000000..928663af7
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/meson.build
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2018 6WIND S.A.
+# Copyright 2018 Mellanox Technologies, Ltd
+
+if not is_linux
+ build = false
+ reason = 'only supported on Linux'
+ subdir_done()
+endif
+
+deps += ['hash', 'common_mlx5']
+sources = files(
+ 'mlx5.c',
+ 'mlx5_ethdev.c',
+ 'mlx5_flow.c',
+ 'mlx5_flow_meter.c',
+ 'mlx5_flow_dv.c',
+ 'mlx5_flow_verbs.c',
+ 'mlx5_mac.c',
+ 'mlx5_mr.c',
+ 'mlx5_rss.c',
+ 'mlx5_rxmode.c',
+ 'mlx5_rxq.c',
+ 'mlx5_rxtx.c',
+ 'mlx5_mp.c',
+ 'mlx5_stats.c',
+ 'mlx5_trigger.c',
+ 'mlx5_txq.c',
+ 'mlx5_vlan.c',
+ 'mlx5_utils.c',
+ 'mlx5_socket.c',
+)
+if (dpdk_conf.has('RTE_ARCH_X86_64')
+ or dpdk_conf.has('RTE_ARCH_ARM64')
+ or dpdk_conf.has('RTE_ARCH_PPC_64'))
+ sources += files('mlx5_rxtx_vec.c')
+endif
+cflags_options = [
+ '-std=c11',
+ '-Wno-strict-prototypes',
+ '-D_BSD_SOURCE',
+ '-D_DEFAULT_SOURCE',
+ '-D_XOPEN_SOURCE=600'
+]
+foreach option:cflags_options
+ if cc.has_argument(option)
+ cflags += option
+ endif
+endforeach
+if get_option('buildtype').contains('debug')
+ cflags += [ '-pedantic', '-DPEDANTIC' ]
+else
+ cflags += [ '-UPEDANTIC' ]
+endif
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c
new file mode 100644
index 000000000..5589772eb
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c
@@ -0,0 +1,3814 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <linux/rtnetlink.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_malloc.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_pci.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_common.h>
+#include <rte_kvargs.h>
+#include <rte_rwlock.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+#include <rte_alarm.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mp.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_mr.h"
+#include "mlx5_flow.h"
+#include "rte_pmd_mlx5.h"
+
+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
+/* Device parameter to enable RX completion entry padding to 128B. */
+#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
+
+/* Device parameter to enable padding Rx packet to cacheline size. */
+#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
+
+/* Device parameter to enable Multi-Packet Rx queue. */
+#define MLX5_RX_MPRQ_EN "mprq_en"
+
+/* Device parameter to configure log 2 of the number of strides for MPRQ. */
+#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
+
+/* Device parameter to configure log 2 of the stride size for MPRQ. */
+#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
+
+/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
+#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
+
+/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
+#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
+
+/* Device parameter to configure inline send. Deprecated, ignored.*/
+#define MLX5_TXQ_INLINE "txq_inline"
+
+/* Device parameter to limit packet size to inline with ordinary SEND. */
+#define MLX5_TXQ_INLINE_MAX "txq_inline_max"
+
+/* Device parameter to configure minimal data size to inline. */
+#define MLX5_TXQ_INLINE_MIN "txq_inline_min"
+
+/* Device parameter to limit packet size to inline with Enhanced MPW. */
+#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
+
+/*
+ * Device parameter to configure the number of TX queues threshold for
+ * enabling inline send.
+ */
+#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
+
+/*
+ * Device parameter to configure the number of TX queues threshold for
+ * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
+ */
+#define MLX5_TXQS_MAX_VEC "txqs_max_vec"
+
+/* Device parameter to enable multi-packet send WQEs. */
+#define MLX5_TXQ_MPW_EN "txq_mpw_en"
+
+/*
+ * Device parameter to force doorbell register mapping
+ * to non-cached region, eliminating the extra write memory barrier.
+ */
+#define MLX5_TX_DB_NC "tx_db_nc"
+
+/*
+ * Device parameter to include 2 dsegs in the title WQEBB.
+ * Deprecated, ignored.
+ */
+#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
+
+/*
+ * Device parameter to limit the size of packet to be inlined.
+ * Deprecated, ignored.
+ */
+#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
+
+/*
+ * Device parameter to enable hardware Tx vector.
+ * Deprecated, ignored (no vectorized Tx routines anymore).
+ */
+#define MLX5_TX_VEC_EN "tx_vec_en"
+
+/* Device parameter to enable hardware Rx vector. */
+#define MLX5_RX_VEC_EN "rx_vec_en"
+
+/* Allow L3 VXLAN flow creation. */
+#define MLX5_L3_VXLAN_EN "l3_vxlan_en"
+
+/* Activate DV E-Switch flow steering. */
+#define MLX5_DV_ESW_EN "dv_esw_en"
+
+/* Activate DV flow steering. */
+#define MLX5_DV_FLOW_EN "dv_flow_en"
+
+/* Enable extensive flow metadata support. */
+#define MLX5_DV_XMETA_EN "dv_xmeta_en"
+
+/* Activate Netlink support in VF mode. */
+#define MLX5_VF_NL_EN "vf_nl_en"
+
+/* Enable extending memsegs when creating a MR. */
+#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
+
+/* Select port representors to instantiate. */
+#define MLX5_REPRESENTOR "representor"
+
+/* Device parameter to configure the maximum number of dump files per queue. */
+#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
+
+/* Configure timeout of LRO session (in microseconds). */
+#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
+
+/*
+ * Device parameter to configure the total data buffer size for a single
+ * hairpin queue (logarithm value).
+ */
+#define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
+
+#ifndef HAVE_IBV_MLX5_MOD_MPW
+#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
+#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
+#endif
+
+#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
+#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
+#endif
+
+static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
+
+/* Shared memory between primary and secondary processes. */
+struct mlx5_shared_data *mlx5_shared_data;
+
+/* Spinlock for mlx5_shared_data allocation. */
+static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx5_local_data mlx5_local_data;
+
+/** Driver-specific log messages type. */
+int mlx5_logtype;
+
+/** Data associated with devices to spawn. */
+struct mlx5_dev_spawn_data {
+ uint32_t ifindex; /**< Network interface index. */
+ uint32_t max_port; /**< IB device maximal port index. */
+ uint32_t ibv_port; /**< IB device physical port index. */
+ int pf_bond; /**< bonding device PF index. < 0 - no bonding */
+ struct mlx5_switch_info info; /**< Switch information. */
+ struct ibv_device *ibv_dev; /**< Associated IB device. */
+ struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+ struct rte_pci_device *pci_dev; /**< Backend PCI device. */
+};
+
+static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
+static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ {
+ .size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_encap_decap_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_push_vlan_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_dv_tag_resource),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_tag_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_port_id_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_tbl_data_entry),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_jump_ipool",
+ },
+#endif
+ {
+ .size = sizeof(struct mlx5_flow_meter),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_meter_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_mreg_copy_resource),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_mcp_ipool",
+ },
+ {
+ .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_hrxq_ipool",
+ },
+ {
+ .size = sizeof(struct mlx5_flow_handle),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 0,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "mlx5_flow_handle_ipool",
+ },
+ {
+ .size = sizeof(struct rte_flow),
+ .trunk_size = 4096,
+ .need_lock = 1,
+ .release_mem_en = 1,
+ .malloc = rte_malloc_socket,
+ .free = rte_free,
+ .type = "rte_flow_ipool",
+ },
+};
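The configurations above pre-size one indexed pool per flow resource type: trunks of 64 objects that grow geometrically (grow_trunk/grow_shift) and return their memory when drained (release_mem_en). As a reference for how these pools are consumed, here is a minimal, hedged sketch of the indexed-pool API declared in mlx5_utils.h; the rte_flow object type is just an example and error checks are elided:

	struct mlx5_indexed_pool_config cfg = {
		.size = sizeof(struct rte_flow),	/* per-object size kept by the pool */
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 1,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "example_ipool",
	};
	struct mlx5_indexed_pool *pool = mlx5_ipool_create(&cfg);
	uint32_t idx;
	/* Allocate one zeroed object; callers keep the compact index instead of a pointer. */
	struct rte_flow *flow = mlx5_ipool_zmalloc(pool, &idx);
	/* Translate the index back to a pointer when the object is needed. */
	flow = mlx5_ipool_get(pool, idx);
	mlx5_ipool_free(pool, idx);
	mlx5_ipool_destroy(pool);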
+
+
+#define MLX5_FLOW_MIN_ID_POOL_SIZE 512
+#define MLX5_ID_GENERATION_ARRAY_FACTOR 16
+
+#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
+#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192
+
+/**
+ * Allocate ID pool structure.
+ *
+ * @param[in] max_id
+ *   The maximum ID that can be allocated from the pool.
+ *
+ * @return
+ * Pointer to pool object, NULL value otherwise.
+ */
+struct mlx5_flow_id_pool *
+mlx5_flow_id_pool_alloc(uint32_t max_id)
+{
+ struct mlx5_flow_id_pool *pool;
+ void *mem;
+
+ pool = rte_zmalloc("id pool allocation", sizeof(*pool),
+ RTE_CACHE_LINE_SIZE);
+ if (!pool) {
+ DRV_LOG(ERR, "can't allocate id pool");
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
+ RTE_CACHE_LINE_SIZE);
+ if (!mem) {
+ DRV_LOG(ERR, "can't allocate mem for id pool");
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ pool->free_arr = mem;
+ pool->curr = pool->free_arr;
+ pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
+ pool->base_index = 0;
+ pool->max_id = max_id;
+ return pool;
+error:
+ rte_free(pool);
+ return NULL;
+}
+
+/**
+ * Release ID pool structure.
+ *
+ * @param[in] pool
+ * Pointer to flow id pool object to free.
+ */
+void
+mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
+{
+ rte_free(pool->free_arr);
+ rte_free(pool);
+}
+
+/**
+ * Generate ID.
+ *
+ * @param[in] pool
+ * Pointer to flow id pool.
+ * @param[out] id
+ * The generated ID.
+ *
+ * @return
+ * 0 on success, error value otherwise.
+ */
+uint32_t
+mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
+{
+ if (pool->curr == pool->free_arr) {
+ if (pool->base_index == pool->max_id) {
+ rte_errno = ENOMEM;
+ DRV_LOG(ERR, "no free id");
+ return -rte_errno;
+ }
+ *id = ++pool->base_index;
+ return 0;
+ }
+ *id = *(--pool->curr);
+ return 0;
+}
+
+/**
+ * Release ID.
+ *
+ * @param[in] pool
+ * Pointer to flow id pool.
+ * @param[in] id
+ *   The ID to release.
+ *
+ * @return
+ * 0 on success, error value otherwise.
+ */
+uint32_t
+mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
+{
+ uint32_t size;
+ uint32_t size2;
+ void *mem;
+
+ if (pool->curr == pool->last) {
+ size = pool->curr - pool->free_arr;
+ size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
+ MLX5_ASSERT(size2 > size);
+ mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
+ if (!mem) {
+ DRV_LOG(ERR, "can't allocate mem for id pool");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
+ rte_free(pool->free_arr);
+ pool->free_arr = mem;
+ pool->curr = pool->free_arr + size;
+ pool->last = pool->free_arr + size2;
+ }
+ *pool->curr = id;
+ pool->curr++;
+ return 0;
+}
+
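The three helpers above implement a small free-list ID allocator: fresh IDs come from base_index until the first release, released IDs are recycled from free_arr, and free_arr is grown by MLX5_ID_GENERATION_ARRAY_FACTOR when it fills up. A minimal sketch of the intended call sequence (error handling elided; the range matches the hairpin pool created later in this file):

	struct mlx5_flow_id_pool *pool;
	uint32_t id;

	pool = mlx5_flow_id_pool_alloc((1 << HAIRPIN_FLOW_ID_BITS) - 1);
	mlx5_flow_id_get(pool, &id);		/* first ID from base_index -> 1 */
	mlx5_flow_id_release(pool, id);		/* pushed onto free_arr */
	mlx5_flow_id_get(pool, &id);		/* recycled from free_arr -> 1 again */
	mlx5_flow_id_pool_release(pool);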
+/**
+ * Initialize the shared aging list information per port.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ */
+static void
+mlx5_flow_aging_init(struct mlx5_ibv_shared *sh)
+{
+ uint32_t i;
+ struct mlx5_age_info *age_info;
+
+ for (i = 0; i < sh->max_port; i++) {
+ age_info = &sh->port[i].age_info;
+ age_info->flags = 0;
+ TAILQ_INIT(&age_info->aged_counters);
+ rte_spinlock_init(&age_info->aged_sl);
+ MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
+ }
+}
+
+/**
+ * Initialize the counters management structure.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_ibv_shared object.
+ */
+static void
+mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
+{
+ int i;
+
+ memset(&sh->cmng, 0, sizeof(sh->cmng));
+ TAILQ_INIT(&sh->cmng.flow_counters);
+ for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
+ TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
+ rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
+ }
+}
+
+/**
+ * Destroy all the resources allocated for a counter memory management.
+ *
+ * @param[in] mng
+ * Pointer to the memory management structure.
+ */
+static void
+mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
+{
+ uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
+
+ LIST_REMOVE(mng, next);
+ claim_zero(mlx5_devx_cmd_destroy(mng->dm));
+ claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
+ rte_free(mem);
+}
+
+/**
+ * Close and release all the resources of the counters management.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object to free.
+ */
+static void
+mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
+{
+ struct mlx5_counter_stats_mem_mng *mng;
+ int i;
+ int j;
+ int retries = 1024;
+
+ rte_errno = 0;
+ while (--retries) {
+ rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
+ if (rte_errno != EINPROGRESS)
+ break;
+ rte_pause();
+ }
+ for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
+ struct mlx5_flow_counter_pool *pool;
+ uint32_t batch = !!(i > 1);
+
+ if (!sh->cmng.ccont[i].pools)
+ continue;
+ pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+ while (pool) {
+ if (batch && pool->min_dcs)
+ claim_zero(mlx5_devx_cmd_destroy
+ (pool->min_dcs));
+ for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
+ if (MLX5_POOL_GET_CNT(pool, j)->action)
+ claim_zero
+ (mlx5_glue->destroy_flow_action
+ (MLX5_POOL_GET_CNT
+ (pool, j)->action));
+ if (!batch && MLX5_GET_POOL_CNT_EXT
+ (pool, j)->dcs)
+ claim_zero(mlx5_devx_cmd_destroy
+ (MLX5_GET_POOL_CNT_EXT
+ (pool, j)->dcs));
+ }
+ TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
+ rte_free(pool);
+ pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+ }
+ rte_free(sh->cmng.ccont[i].pools);
+ }
+ mng = LIST_FIRST(&sh->cmng.mem_mngs);
+ while (mng) {
+ mlx5_flow_destroy_counter_stat_mem_mng(mng);
+ mng = LIST_FIRST(&sh->cmng.mem_mngs);
+ }
+ memset(&sh->cmng, 0, sizeof(sh->cmng));
+}
+
+/**
+ * Initialize the flow resources' indexed mempool.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ * @param[in] config
+ *   Pointer to user dev config.
+ */
+static void
+mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh,
+ const struct mlx5_dev_config *config __rte_unused)
+{
+ uint8_t i;
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /*
+	 * When DV is supported but the user chooses the verbs mode,
+	 * the mlx5 flow handle size differs from
+	 * MLX5_FLOW_HANDLE_VERBS_SIZE, so adjust it here.
+ */
+ if (!config->dv_flow_en)
+ mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size =
+ MLX5_FLOW_HANDLE_VERBS_SIZE;
+#endif
+ for (i = 0; i < MLX5_IPOOL_MAX; ++i)
+ sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]);
+}
+
+/**
+ * Release the flow resources' indexed mempool.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ */
+static void
+mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh)
+{
+ uint8_t i;
+
+ for (i = 0; i < MLX5_IPOOL_MAX; ++i)
+ mlx5_ipool_destroy(sh->ipool[i]);
+}
+
+/**
+ * Extract pdn of PD object using DV API.
+ *
+ * @param[in] pd
+ * Pointer to the verbs PD object.
+ * @param[out] pdn
+ * Pointer to the PD object number variable.
+ *
+ * @return
+ * 0 on success, error value otherwise.
+ */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+static int
+mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
+{
+ struct mlx5dv_obj obj;
+ struct mlx5dv_pd pd_info;
+ int ret = 0;
+
+ obj.pd.in = pd;
+ obj.pd.out = &pd_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+ if (ret) {
+ DRV_LOG(DEBUG, "Fail to get PD object info");
+ return ret;
+ }
+ *pdn = pd_info.pdn;
+ return 0;
+}
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+
+static int
+mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
+{
+ char *env;
+ int value;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /* Get environment variable to store. */
+ env = getenv(MLX5_SHUT_UP_BF);
+ value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
+ if (config->dbnc == MLX5_ARG_UNSET)
+ setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
+ else
+ setenv(MLX5_SHUT_UP_BF,
+ config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
+ return value;
+}
+
+static void
+mlx5_restore_doorbell_mapping_env(int value)
+{
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /* Restore the original environment variable state. */
+ if (value == MLX5_ARG_UNSET)
+ unsetenv(MLX5_SHUT_UP_BF);
+ else
+ setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
+}
+
+/**
+ * Allocate shared IB device context. If there is a multiport device, the
+ * master and representors will share this context; if there is a single-port
+ * dedicated IB device, the context will be used only by the given port due
+ * to unification.
+ *
+ * The routine first searches the list of contexts for the specified IB device
+ * name; if found, the shared context is assumed and its reference counter is
+ * incremented. If no context is found, a new one is created and initialized
+ * with the specified IB device context and parameters.
+ *
+ * @param[in] spawn
+ * Pointer to the IB device attributes (name, port, etc).
+ * @param[in] config
+ * Pointer to device configuration structure.
+ *
+ * @return
+ * Pointer to mlx5_ibv_shared object on success,
+ * otherwise NULL and rte_errno is set.
+ */
+static struct mlx5_ibv_shared *
+mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
+ const struct mlx5_dev_config *config)
+{
+ struct mlx5_ibv_shared *sh;
+ int dbmap_env;
+ int err = 0;
+ uint32_t i;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ struct mlx5_devx_tis_attr tis_attr = { 0 };
+#endif
+
+ MLX5_ASSERT(spawn);
+ /* Secondary process should not create the shared context. */
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ pthread_mutex_lock(&mlx5_ibv_list_mutex);
+ /* Search for IB context by device name. */
+ LIST_FOREACH(sh, &mlx5_ibv_list, next) {
+ if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
+ sh->refcnt++;
+ goto exit;
+ }
+ }
+ /* No device found, we have to create new shared context. */
+ MLX5_ASSERT(spawn->max_port);
+ sh = rte_zmalloc("ethdev shared ib context",
+ sizeof(struct mlx5_ibv_shared) +
+ spawn->max_port *
+ sizeof(struct mlx5_ibv_shared_port),
+ RTE_CACHE_LINE_SIZE);
+ if (!sh) {
+ DRV_LOG(ERR, "shared context allocation failure");
+ rte_errno = ENOMEM;
+ goto exit;
+ }
+ /*
+	 * Configure environment variable "MLX5_SHUT_UP_BF"
+ * before the device creation. The rdma_core library
+ * checks the variable at device creation and
+ * stores the result internally.
+ */
+ dbmap_env = mlx5_config_doorbell_mapping_env(config);
+ /* Try to open IB device with DV first, then usual Verbs. */
+ errno = 0;
+ sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
+ if (sh->ctx) {
+ sh->devx = 1;
+ DRV_LOG(DEBUG, "DevX is supported");
+ /* The device is created, no need for environment. */
+ mlx5_restore_doorbell_mapping_env(dbmap_env);
+ } else {
+ /* The environment variable is still configured. */
+ sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
+ err = errno ? errno : ENODEV;
+ /*
+ * The environment variable is not needed anymore,
+ * all device creation attempts are completed.
+ */
+ mlx5_restore_doorbell_mapping_env(dbmap_env);
+ if (!sh->ctx)
+ goto error;
+ DRV_LOG(DEBUG, "DevX is NOT supported");
+ }
+ err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
+ if (err) {
+ DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
+ goto error;
+ }
+ sh->refcnt = 1;
+ sh->max_port = spawn->max_port;
+ strncpy(sh->ibdev_name, sh->ctx->device->name,
+ sizeof(sh->ibdev_name));
+ strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
+ sizeof(sh->ibdev_path));
+ pthread_mutex_init(&sh->intr_mutex, NULL);
+ /*
+	 * Setting port_id to RTE_MAX_ETHPORTS (an invalid value) means
+	 * there is no interrupt subhandler installed for
+	 * the given port index i.
+ */
+ for (i = 0; i < sh->max_port; i++) {
+ sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
+ sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
+ }
+ sh->pd = mlx5_glue->alloc_pd(sh->ctx);
+ if (sh->pd == NULL) {
+ DRV_LOG(ERR, "PD allocation failure");
+ err = ENOMEM;
+ goto error;
+ }
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (sh->devx) {
+ err = mlx5_get_pdn(sh->pd, &sh->pdn);
+ if (err) {
+ DRV_LOG(ERR, "Fail to extract pdn from PD");
+ goto error;
+ }
+ sh->td = mlx5_devx_cmd_create_td(sh->ctx);
+ if (!sh->td) {
+ DRV_LOG(ERR, "TD allocation failure");
+ err = ENOMEM;
+ goto error;
+ }
+ tis_attr.transport_domain = sh->td->id;
+ sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
+ if (!sh->tis) {
+ DRV_LOG(ERR, "TIS allocation failure");
+ err = ENOMEM;
+ goto error;
+ }
+ }
+ sh->flow_id_pool = mlx5_flow_id_pool_alloc
+ ((1 << HAIRPIN_FLOW_ID_BITS) - 1);
+ if (!sh->flow_id_pool) {
+ DRV_LOG(ERR, "can't create flow id pool");
+ err = ENOMEM;
+ goto error;
+ }
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+ /*
+	 * Once the device is added to the memory event callback
+	 * list, its global MR cache table cannot be expanded
+ * on the fly because of deadlock. If it overflows, lookup
+ * should be done by searching MR list linearly, which is slow.
+ *
+ * At this point the device is not added to the memory
+ * event list yet, context is just being created.
+ */
+ err = mlx5_mr_btree_init(&sh->share_cache.cache,
+ MLX5_MR_BTREE_CACHE_N * 2,
+ spawn->pci_dev->device.numa_node);
+ if (err) {
+ err = rte_errno;
+ goto error;
+ }
+ mlx5_flow_aging_init(sh);
+ mlx5_flow_counters_mng_init(sh);
+ mlx5_flow_ipool_create(sh, config);
+ /* Add device to memory callback list. */
+ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+ LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+ sh, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+ /* Add context to the global device list. */
+ LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
+exit:
+ pthread_mutex_unlock(&mlx5_ibv_list_mutex);
+ return sh;
+error:
+ pthread_mutex_unlock(&mlx5_ibv_list_mutex);
+ MLX5_ASSERT(sh);
+ if (sh->tis)
+ claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+ if (sh->td)
+ claim_zero(mlx5_devx_cmd_destroy(sh->td));
+ if (sh->pd)
+ claim_zero(mlx5_glue->dealloc_pd(sh->pd));
+ if (sh->ctx)
+ claim_zero(mlx5_glue->close_device(sh->ctx));
+ if (sh->flow_id_pool)
+ mlx5_flow_id_pool_release(sh->flow_id_pool);
+ rte_free(sh);
+ MLX5_ASSERT(err > 0);
+ rte_errno = err;
+ return NULL;
+}
+
+/**
+ * Free shared IB device context. Decrement counter and if zero free
+ * all allocated resources and close handles.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object to free
+ */
+static void
+mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
+{
+ pthread_mutex_lock(&mlx5_ibv_list_mutex);
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ /* Check the object presence in the list. */
+ struct mlx5_ibv_shared *lctx;
+
+ LIST_FOREACH(lctx, &mlx5_ibv_list, next)
+ if (lctx == sh)
+ break;
+ MLX5_ASSERT(lctx);
+ if (lctx != sh) {
+ DRV_LOG(ERR, "Freeing non-existing shared IB context");
+ goto exit;
+ }
+#endif
+ MLX5_ASSERT(sh);
+ MLX5_ASSERT(sh->refcnt);
+ /* Secondary process should not free the shared context. */
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (--sh->refcnt)
+ goto exit;
+ /* Remove from memory callback device list. */
+ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+ LIST_REMOVE(sh, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+ /* Release created Memory Regions. */
+ mlx5_mr_release_cache(&sh->share_cache);
+ /* Remove context from the global device list. */
+ LIST_REMOVE(sh, next);
+ /*
+ * Ensure there is no async event handler installed.
+ * Only primary process handles async device events.
+	 */
+ mlx5_flow_counters_mng_close(sh);
+ mlx5_flow_ipool_destroy(sh);
+ MLX5_ASSERT(!sh->intr_cnt);
+ if (sh->intr_cnt)
+ mlx5_intr_callback_unregister
+ (&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
+#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
+ if (sh->devx_intr_cnt) {
+ if (sh->intr_handle_devx.fd)
+ rte_intr_callback_unregister(&sh->intr_handle_devx,
+ mlx5_dev_interrupt_handler_devx, sh);
+ if (sh->devx_comp)
+ mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
+ }
+#endif
+ pthread_mutex_destroy(&sh->intr_mutex);
+ if (sh->pd)
+ claim_zero(mlx5_glue->dealloc_pd(sh->pd));
+ if (sh->tis)
+ claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+ if (sh->td)
+ claim_zero(mlx5_devx_cmd_destroy(sh->td));
+ if (sh->ctx)
+ claim_zero(mlx5_glue->close_device(sh->ctx));
+ if (sh->flow_id_pool)
+ mlx5_flow_id_pool_release(sh->flow_id_pool);
+ rte_free(sh);
+exit:
+ pthread_mutex_unlock(&mlx5_ibv_list_mutex);
+}
+
+/**
+ * Destroy table hash list and all the root entries per domain.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ */
+static void
+mlx5_free_table_hash_list(struct mlx5_priv *priv)
+{
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_tbl_data_entry *tbl_data;
+ union mlx5_flow_tbl_key table_key = {
+ {
+ .table_id = 0,
+ .reserved = 0,
+ .domain = 0,
+ .direction = 0,
+ }
+ };
+ struct mlx5_hlist_entry *pos;
+
+ if (!sh->flow_tbls)
+ return;
+ pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
+ if (pos) {
+ tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
+ entry);
+ MLX5_ASSERT(tbl_data);
+ mlx5_hlist_remove(sh->flow_tbls, pos);
+ rte_free(tbl_data);
+ }
+ table_key.direction = 1;
+ pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
+ if (pos) {
+ tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
+ entry);
+ MLX5_ASSERT(tbl_data);
+ mlx5_hlist_remove(sh->flow_tbls, pos);
+ rte_free(tbl_data);
+ }
+ table_key.direction = 0;
+ table_key.domain = 1;
+ pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
+ if (pos) {
+ tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
+ entry);
+ MLX5_ASSERT(tbl_data);
+ mlx5_hlist_remove(sh->flow_tbls, pos);
+ rte_free(tbl_data);
+ }
+ mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
+}
+
+/**
+ * Initialize flow table hash list and create the root tables entry
+ * for each domain.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ *
+ * @return
+ * Zero on success, positive error code otherwise.
+ */
+static int
+mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
+{
+ struct mlx5_ibv_shared *sh = priv->sh;
+ char s[MLX5_HLIST_NAMESIZE];
+ int err = 0;
+
+ MLX5_ASSERT(sh);
+ snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
+ sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
+ if (!sh->flow_tbls) {
+		DRV_LOG(ERR, "flow table hash list creation failed.");
+ err = ENOMEM;
+ return err;
+ }
+#ifndef HAVE_MLX5DV_DR
+ /*
+	 * In case we do not have DR support, the zero (root) tables should
+	 * be created because DV expects to see them even if they cannot be
+	 * created by RDMA-CORE.
+ */
+ union mlx5_flow_tbl_key table_key = {
+ {
+ .table_id = 0,
+ .reserved = 0,
+ .domain = 0,
+ .direction = 0,
+ }
+ };
+ struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
+ sizeof(*tbl_data), 0);
+
+ if (!tbl_data) {
+ err = ENOMEM;
+ goto error;
+ }
+ tbl_data->entry.key = table_key.v64;
+ err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
+ if (err)
+ goto error;
+ rte_atomic32_init(&tbl_data->tbl.refcnt);
+ rte_atomic32_inc(&tbl_data->tbl.refcnt);
+ table_key.direction = 1;
+ tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
+ if (!tbl_data) {
+ err = ENOMEM;
+ goto error;
+ }
+ tbl_data->entry.key = table_key.v64;
+ err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
+ if (err)
+ goto error;
+ rte_atomic32_init(&tbl_data->tbl.refcnt);
+ rte_atomic32_inc(&tbl_data->tbl.refcnt);
+ table_key.direction = 0;
+ table_key.domain = 1;
+ tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
+ if (!tbl_data) {
+ err = ENOMEM;
+ goto error;
+ }
+ tbl_data->entry.key = table_key.v64;
+ err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
+ if (err)
+ goto error;
+ rte_atomic32_init(&tbl_data->tbl.refcnt);
+ rte_atomic32_inc(&tbl_data->tbl.refcnt);
+ return err;
+error:
+ mlx5_free_table_hash_list(priv);
+#endif /* HAVE_MLX5DV_DR */
+ return err;
+}
+
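mlx5_alloc_table_hash_list() builds on the generic mlx5_hlist helpers from mlx5_utils: an object embeds a struct mlx5_hlist_entry whose 64-bit key is the packed union mlx5_flow_tbl_key, lookup returns the embedded entry, and container_of() recovers the outer object (exactly as mlx5_free_table_hash_list() does above). A hedged sketch of that pattern with a hypothetical entry type:

	struct example_entry {
		struct mlx5_hlist_entry entry;	/* embedded hash list entry */
		uint32_t payload;		/* hypothetical user data */
	};

	struct mlx5_hlist *h = mlx5_hlist_create("example_hlist", 64);
	struct example_entry *e = rte_zmalloc(NULL, sizeof(*e), 0);

	e->entry.key = 0x1234;			/* 64-bit key, e.g. mlx5_flow_tbl_key.v64 */
	mlx5_hlist_insert(h, &e->entry);
	struct mlx5_hlist_entry *pos = mlx5_hlist_lookup(h, 0x1234);
	struct example_entry *found = container_of(pos, struct example_entry, entry);
	mlx5_hlist_remove(h, pos);
	rte_free(found);
	mlx5_hlist_destroy(h, NULL, NULL);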
+/**
+ * Initialize DR related data within private structure.
+ * Routine checks the reference counter and does actual
+ * resources creation/initialization only if counter is zero.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ *
+ * @return
+ * Zero on success, positive error code otherwise.
+ */
+static int
+mlx5_alloc_shared_dr(struct mlx5_priv *priv)
+{
+ struct mlx5_ibv_shared *sh = priv->sh;
+ char s[MLX5_HLIST_NAMESIZE];
+ int err = 0;
+
+ if (!sh->flow_tbls)
+ err = mlx5_alloc_table_hash_list(priv);
+ else
+		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
+ (void *)sh->flow_tbls);
+ if (err)
+ return err;
+ /* Create tags hash list table. */
+ snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
+ sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
+ if (!sh->tag_table) {
+		DRV_LOG(ERR, "tag hash list creation failed.");
+ err = ENOMEM;
+ goto error;
+ }
+#ifdef HAVE_MLX5DV_DR
+ void *domain;
+
+ if (sh->dv_refcnt) {
+		/* Shared DV/DR structures are already initialized. */
+ sh->dv_refcnt++;
+ priv->dr_shared = 1;
+ return 0;
+ }
+ /* Reference counter is zero, we should initialize structures. */
+ domain = mlx5_glue->dr_create_domain(sh->ctx,
+ MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+ if (!domain) {
+ DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ sh->rx_domain = domain;
+ domain = mlx5_glue->dr_create_domain(sh->ctx,
+ MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
+ if (!domain) {
+ DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ pthread_mutex_init(&sh->dv_mutex, NULL);
+ sh->tx_domain = domain;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (priv->config.dv_esw_en) {
+ domain = mlx5_glue->dr_create_domain
+ (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
+ if (!domain) {
+ DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ sh->fdb_domain = domain;
+ sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
+ }
+#endif
+ sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
+#endif /* HAVE_MLX5DV_DR */
+ sh->dv_refcnt++;
+ priv->dr_shared = 1;
+ return 0;
+error:
+ /* Rollback the created objects. */
+ if (sh->rx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->rx_domain);
+ sh->rx_domain = NULL;
+ }
+ if (sh->tx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->tx_domain);
+ sh->tx_domain = NULL;
+ }
+ if (sh->fdb_domain) {
+ mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+ sh->fdb_domain = NULL;
+ }
+ if (sh->esw_drop_action) {
+ mlx5_glue->destroy_flow_action(sh->esw_drop_action);
+ sh->esw_drop_action = NULL;
+ }
+ if (sh->pop_vlan_action) {
+ mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+ sh->pop_vlan_action = NULL;
+ }
+ if (sh->tag_table) {
+		/* Tags must have been destroyed with the flows beforehand. */
+ mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
+ sh->tag_table = NULL;
+ }
+ mlx5_free_table_hash_list(priv);
+ return err;
+}
+
+/**
+ * Destroy DR related data within private structure.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ */
+static void
+mlx5_free_shared_dr(struct mlx5_priv *priv)
+{
+ struct mlx5_ibv_shared *sh;
+
+ if (!priv->dr_shared)
+ return;
+ priv->dr_shared = 0;
+ sh = priv->sh;
+ MLX5_ASSERT(sh);
+#ifdef HAVE_MLX5DV_DR
+ MLX5_ASSERT(sh->dv_refcnt);
+ if (sh->dv_refcnt && --sh->dv_refcnt)
+ return;
+ if (sh->rx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->rx_domain);
+ sh->rx_domain = NULL;
+ }
+ if (sh->tx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->tx_domain);
+ sh->tx_domain = NULL;
+ }
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (sh->fdb_domain) {
+ mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+ sh->fdb_domain = NULL;
+ }
+ if (sh->esw_drop_action) {
+ mlx5_glue->destroy_flow_action(sh->esw_drop_action);
+ sh->esw_drop_action = NULL;
+ }
+#endif
+ if (sh->pop_vlan_action) {
+ mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+ sh->pop_vlan_action = NULL;
+ }
+ pthread_mutex_destroy(&sh->dv_mutex);
+#endif /* HAVE_MLX5DV_DR */
+ if (sh->tag_table) {
+		/* Tags must have been destroyed with the flows beforehand. */
+ mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
+ sh->tag_table = NULL;
+ }
+ mlx5_free_table_hash_list(priv);
+}
+
+/**
+ * Initialize shared data between primary and secondary process.
+ *
+ * A memzone is reserved by primary process and secondary processes attach to
+ * the memzone.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_shared_data(void)
+{
+ const struct rte_memzone *mz;
+ int ret = 0;
+
+ rte_spinlock_lock(&mlx5_shared_data_lock);
+ if (mlx5_shared_data == NULL) {
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* Allocate shared memory. */
+ mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
+ sizeof(*mlx5_shared_data),
+ SOCKET_ID_ANY, 0);
+ if (mz == NULL) {
+ DRV_LOG(ERR,
+ "Cannot allocate mlx5 shared data");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx5_shared_data = mz->addr;
+ memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
+ rte_spinlock_init(&mlx5_shared_data->lock);
+ } else {
+ /* Lookup allocated shared memory. */
+ mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
+ if (mz == NULL) {
+ DRV_LOG(ERR,
+ "Cannot attach mlx5 shared data");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx5_shared_data = mz->addr;
+ memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
+ }
+ }
+error:
+ rte_spinlock_unlock(&mlx5_shared_data_lock);
+ return ret;
+}
+
+/**
+ * Retrieve integer value from environment variable.
+ *
+ * @param[in] name
+ * Environment variable name.
+ *
+ * @return
+ * Integer value, 0 if the variable is not set.
+ */
+int
+mlx5_getenv_int(const char *name)
+{
+ const char *val = getenv(name);
+
+ if (val == NULL)
+ return 0;
+ return atoi(val);
+}
+
+/**
+ * Verbs callback to allocate memory. This function should allocate the space
+ * according to the size provided residing inside a huge page.
+ * Please note that all allocation must respect the alignment from libmlx5
+ * (i.e. currently sysconf(_SC_PAGESIZE)).
+ *
+ * @param[in] size
+ * The size in bytes of the memory to allocate.
+ * @param[in] data
+ * A pointer to the callback data.
+ *
+ * @return
+ * Allocated buffer, NULL otherwise and rte_errno is set.
+ */
+static void *
+mlx5_alloc_verbs_buf(size_t size, void *data)
+{
+ struct mlx5_priv *priv = data;
+ void *ret;
+ size_t alignment = sysconf(_SC_PAGESIZE);
+ unsigned int socket = SOCKET_ID_ANY;
+
+ if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
+ const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
+
+ socket = ctrl->socket;
+ } else if (priv->verbs_alloc_ctx.type ==
+ MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
+ const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
+
+ socket = ctrl->socket;
+ }
+ MLX5_ASSERT(data != NULL);
+ ret = rte_malloc_socket(__func__, size, alignment, socket);
+ if (!ret && size)
+ rte_errno = ENOMEM;
+ return ret;
+}
+
+/**
+ * Verbs callback to free memory.
+ *
+ * @param[in] ptr
+ * A pointer to the memory to free.
+ * @param[in] data
+ * A pointer to the callback data.
+ */
+static void
+mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+ MLX5_ASSERT(data != NULL);
+ rte_free(ptr);
+}
+
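These two callbacks are the buffer allocators the PMD hands to rdma-core so that Verbs queue buffers are taken from DPDK hugepage memory on the correct NUMA socket. For reference, a hedged sketch of how they are typically registered through the mlx5dv allocator attribute; in this driver the registration happens during device spawn (outside this hunk) via the glue layer:

	struct mlx5dv_ctx_allocators alctr = {
		.alloc = &mlx5_alloc_verbs_buf,
		.free = &mlx5_free_verbs_buf,
		.data = priv,			/* returned as the 'data' argument above */
	};

	mlx5_glue->dv_set_context_attr(sh->ctx,
				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
				       (void *)((uintptr_t)&alctr));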
+/**
+ * DPDK callback to add a UDP tunnel port.
+ *
+ * @param[in] dev
+ *   A pointer to the Ethernet device structure.
+ * @param[in] udp_tunnel
+ *   A pointer to the UDP tunnel structure.
+ *
+ * @return
+ * 0 on valid udp ports and tunnels, -ENOTSUP otherwise.
+ */
+int
+mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
+ struct rte_eth_udp_tunnel *udp_tunnel)
+{
+ MLX5_ASSERT(udp_tunnel != NULL);
+ if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
+ udp_tunnel->udp_port == 4789)
+ return 0;
+ if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
+ udp_tunnel->udp_port == 4790)
+ return 0;
+ return -ENOTSUP;
+}
+
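The callback only acknowledges the two UDP ports the hardware already parses by default (4789 for VXLAN, 4790 for VXLAN-GPE) and rejects any other port or tunnel type. A small application-side sketch of reaching it through the generic ethdev API (port_id is a placeholder):

	struct rte_eth_udp_tunnel tunnel = {
		.udp_port = 4789,
		.prot_type = RTE_TUNNEL_TYPE_VXLAN,
	};
	int rc = rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel);
	if (rc != 0)
		printf("udp tunnel port add failed: %d\n", rc);	/* -ENOTSUP otherwise */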
+/**
+ * Initialize process private data structure.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_proc_priv_init(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_proc_priv *ppriv;
+ size_t ppriv_size;
+
+ /*
+ * UAR register table follows the process private structure. BlueFlame
+ * registers for Tx queues are stored in the table.
+ */
+ ppriv_size =
+ sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
+ ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+ if (!ppriv) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ ppriv->uar_table_sz = ppriv_size;
+ dev->process_private = ppriv;
+ return 0;
+}
+
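The process-private area is sized so that a per-Tx-queue table of UAR/BlueFlame doorbell pointers follows the fixed part of the structure, letting every process keep its own doorbell mappings. A sketch of the layout this implies, assuming the struct mlx5_proc_priv definition with a flexible uar_table[] member used elsewhere in this driver:

	/* Assumed layout behind ppriv_size above:
	 *
	 *	struct mlx5_proc_priv {
	 *		size_t uar_table_sz;	// set to ppriv_size
	 *		void *uar_table[];	// one BlueFlame register per Tx queue
	 *	};
	 *
	 * so a queue's doorbell register would be reached as:
	 */
	struct mlx5_proc_priv *ppriv = dev->process_private;
	void *bf_reg = ppriv->uar_table[txq_idx];	/* txq_idx < priv->txqs_n */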
+/**
+ * Un-initialize process private data structure.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+static void
+mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
+{
+ if (!dev->process_private)
+ return;
+ rte_free(dev->process_private);
+ dev->process_private = NULL;
+}
+
+/**
+ * DPDK callback to close the device.
+ *
+ * Destroy all queues and objects, free memory.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+static void
+mlx5_dev_close(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ int ret;
+
+ DRV_LOG(DEBUG, "port %u closing device \"%s\"",
+ dev->data->port_id,
+ ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
+ /* In case mlx5_dev_stop() has not been called. */
+ mlx5_dev_interrupt_handler_uninstall(dev);
+ mlx5_dev_interrupt_handler_devx_uninstall(dev);
+ /*
+	 * If the default mreg copy action was already removed at the stop
+	 * stage, the lookup returns nothing and no further action is taken.
+ */
+ mlx5_flow_stop_default(dev);
+ mlx5_traffic_disable(dev);
+ /*
+ * If all the flows are already flushed in the device stop stage,
+ * then this will return directly without any action.
+ */
+ mlx5_flow_list_flush(dev, &priv->flows, true);
+ mlx5_flow_meter_flush(dev, NULL);
+ /* Free the intermediate buffers for flow creation. */
+ mlx5_flow_free_intermediate(dev);
+ /* Prevent crashes when queues are still in use. */
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ rte_wmb();
+ /* Disable datapath on secondary process. */
+ mlx5_mp_req_stop_rxtx(dev);
+ if (priv->rxqs != NULL) {
+ /* XXX race condition if mlx5_rx_burst() is still running. */
+ usleep(1000);
+ for (i = 0; (i != priv->rxqs_n); ++i)
+ mlx5_rxq_release(dev, i);
+ priv->rxqs_n = 0;
+ priv->rxqs = NULL;
+ }
+ if (priv->txqs != NULL) {
+ /* XXX race condition if mlx5_tx_burst() is still running. */
+ usleep(1000);
+ for (i = 0; (i != priv->txqs_n); ++i)
+ mlx5_txq_release(dev, i);
+ priv->txqs_n = 0;
+ priv->txqs = NULL;
+ }
+ mlx5_proc_priv_uninit(dev);
+ if (priv->mreg_cp_tbl)
+ mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
+ mlx5_mprq_free_mp(dev);
+ mlx5_free_shared_dr(priv);
+ if (priv->rss_conf.rss_key != NULL)
+ rte_free(priv->rss_conf.rss_key);
+ if (priv->reta_idx != NULL)
+ rte_free(priv->reta_idx);
+ if (priv->config.vf)
+ mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
+ dev->data->mac_addrs,
+ MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
+ if (priv->nl_socket_route >= 0)
+ close(priv->nl_socket_route);
+ if (priv->nl_socket_rdma >= 0)
+ close(priv->nl_socket_rdma);
+ if (priv->vmwa_context)
+ mlx5_vlan_vmwa_exit(priv->vmwa_context);
+ ret = mlx5_hrxq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
+ dev->data->port_id);
+ ret = mlx5_ind_table_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some indirection table still remain",
+ dev->data->port_id);
+ ret = mlx5_rxq_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
+ dev->data->port_id);
+ ret = mlx5_rxq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Rx queues still remain",
+ dev->data->port_id);
+ ret = mlx5_txq_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
+ dev->data->port_id);
+ ret = mlx5_txq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Tx queues still remain",
+ dev->data->port_id);
+ ret = mlx5_flow_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some flows still remain",
+ dev->data->port_id);
+ if (priv->sh) {
+ /*
+ * Free the shared context in last turn, because the cleanup
+ * routines above may use some shared fields, like
+		 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
+ * ifindex if Netlink fails.
+ */
+ mlx5_free_shared_ibctx(priv->sh);
+ priv->sh = NULL;
+ }
+ if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+ unsigned int c = 0;
+ uint16_t port_id;
+
+ MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (!opriv ||
+ opriv->domain_id != priv->domain_id ||
+ &rte_eth_devices[port_id] == dev)
+ continue;
+ ++c;
+ break;
+ }
+ if (!c)
+ claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+ }
+ memset(priv, 0, sizeof(*priv));
+ priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+ /*
+ * Reset mac_addrs to NULL such that it is not freed as part of
+ * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
+ * it is freed when dev_private is freed.
+ */
+ dev->data->mac_addrs = NULL;
+}
+
+const struct eth_dev_ops mlx5_dev_ops = {
+ .dev_configure = mlx5_dev_configure,
+ .dev_start = mlx5_dev_start,
+ .dev_stop = mlx5_dev_stop,
+ .dev_set_link_down = mlx5_set_link_down,
+ .dev_set_link_up = mlx5_set_link_up,
+ .dev_close = mlx5_dev_close,
+ .promiscuous_enable = mlx5_promiscuous_enable,
+ .promiscuous_disable = mlx5_promiscuous_disable,
+ .allmulticast_enable = mlx5_allmulticast_enable,
+ .allmulticast_disable = mlx5_allmulticast_disable,
+ .link_update = mlx5_link_update,
+ .stats_get = mlx5_stats_get,
+ .stats_reset = mlx5_stats_reset,
+ .xstats_get = mlx5_xstats_get,
+ .xstats_reset = mlx5_xstats_reset,
+ .xstats_get_names = mlx5_xstats_get_names,
+ .fw_version_get = mlx5_fw_version_get,
+ .dev_infos_get = mlx5_dev_infos_get,
+ .read_clock = mlx5_read_clock,
+ .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
+ .vlan_filter_set = mlx5_vlan_filter_set,
+ .rx_queue_setup = mlx5_rx_queue_setup,
+ .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
+ .tx_queue_setup = mlx5_tx_queue_setup,
+ .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
+ .rx_queue_release = mlx5_rx_queue_release,
+ .tx_queue_release = mlx5_tx_queue_release,
+ .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
+ .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
+ .mac_addr_remove = mlx5_mac_addr_remove,
+ .mac_addr_add = mlx5_mac_addr_add,
+ .mac_addr_set = mlx5_mac_addr_set,
+ .set_mc_addr_list = mlx5_set_mc_addr_list,
+ .mtu_set = mlx5_dev_set_mtu,
+ .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
+ .vlan_offload_set = mlx5_vlan_offload_set,
+ .reta_update = mlx5_dev_rss_reta_update,
+ .reta_query = mlx5_dev_rss_reta_query,
+ .rss_hash_update = mlx5_rss_hash_update,
+ .rss_hash_conf_get = mlx5_rss_hash_conf_get,
+ .filter_ctrl = mlx5_dev_filter_ctrl,
+ .rx_descriptor_status = mlx5_rx_descriptor_status,
+ .tx_descriptor_status = mlx5_tx_descriptor_status,
+ .rxq_info_get = mlx5_rxq_info_get,
+ .txq_info_get = mlx5_txq_info_get,
+ .rx_burst_mode_get = mlx5_rx_burst_mode_get,
+ .tx_burst_mode_get = mlx5_tx_burst_mode_get,
+ .rx_queue_count = mlx5_rx_queue_count,
+ .rx_queue_intr_enable = mlx5_rx_intr_enable,
+ .rx_queue_intr_disable = mlx5_rx_intr_disable,
+ .is_removed = mlx5_is_removed,
+ .udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
+ .get_module_info = mlx5_get_module_info,
+ .get_module_eeprom = mlx5_get_module_eeprom,
+ .hairpin_cap_get = mlx5_hairpin_cap_get,
+ .mtr_ops_get = mlx5_flow_meter_ops_get,
+};
+
+/* Available operations from secondary process. */
+static const struct eth_dev_ops mlx5_dev_sec_ops = {
+ .stats_get = mlx5_stats_get,
+ .stats_reset = mlx5_stats_reset,
+ .xstats_get = mlx5_xstats_get,
+ .xstats_reset = mlx5_xstats_reset,
+ .xstats_get_names = mlx5_xstats_get_names,
+ .fw_version_get = mlx5_fw_version_get,
+ .dev_infos_get = mlx5_dev_infos_get,
+ .rx_descriptor_status = mlx5_rx_descriptor_status,
+ .tx_descriptor_status = mlx5_tx_descriptor_status,
+ .rxq_info_get = mlx5_rxq_info_get,
+ .txq_info_get = mlx5_txq_info_get,
+ .rx_burst_mode_get = mlx5_rx_burst_mode_get,
+ .tx_burst_mode_get = mlx5_tx_burst_mode_get,
+ .get_module_info = mlx5_get_module_info,
+ .get_module_eeprom = mlx5_get_module_eeprom,
+};
+
+/* Available operations in flow isolated mode. */
+const struct eth_dev_ops mlx5_dev_ops_isolate = {
+ .dev_configure = mlx5_dev_configure,
+ .dev_start = mlx5_dev_start,
+ .dev_stop = mlx5_dev_stop,
+ .dev_set_link_down = mlx5_set_link_down,
+ .dev_set_link_up = mlx5_set_link_up,
+ .dev_close = mlx5_dev_close,
+ .promiscuous_enable = mlx5_promiscuous_enable,
+ .promiscuous_disable = mlx5_promiscuous_disable,
+ .allmulticast_enable = mlx5_allmulticast_enable,
+ .allmulticast_disable = mlx5_allmulticast_disable,
+ .link_update = mlx5_link_update,
+ .stats_get = mlx5_stats_get,
+ .stats_reset = mlx5_stats_reset,
+ .xstats_get = mlx5_xstats_get,
+ .xstats_reset = mlx5_xstats_reset,
+ .xstats_get_names = mlx5_xstats_get_names,
+ .fw_version_get = mlx5_fw_version_get,
+ .dev_infos_get = mlx5_dev_infos_get,
+ .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
+ .vlan_filter_set = mlx5_vlan_filter_set,
+ .rx_queue_setup = mlx5_rx_queue_setup,
+ .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
+ .tx_queue_setup = mlx5_tx_queue_setup,
+ .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
+ .rx_queue_release = mlx5_rx_queue_release,
+ .tx_queue_release = mlx5_tx_queue_release,
+ .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
+ .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
+ .mac_addr_remove = mlx5_mac_addr_remove,
+ .mac_addr_add = mlx5_mac_addr_add,
+ .mac_addr_set = mlx5_mac_addr_set,
+ .set_mc_addr_list = mlx5_set_mc_addr_list,
+ .mtu_set = mlx5_dev_set_mtu,
+ .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
+ .vlan_offload_set = mlx5_vlan_offload_set,
+ .filter_ctrl = mlx5_dev_filter_ctrl,
+ .rx_descriptor_status = mlx5_rx_descriptor_status,
+ .tx_descriptor_status = mlx5_tx_descriptor_status,
+ .rxq_info_get = mlx5_rxq_info_get,
+ .txq_info_get = mlx5_txq_info_get,
+ .rx_burst_mode_get = mlx5_rx_burst_mode_get,
+ .tx_burst_mode_get = mlx5_tx_burst_mode_get,
+ .rx_queue_intr_enable = mlx5_rx_intr_enable,
+ .rx_queue_intr_disable = mlx5_rx_intr_disable,
+ .is_removed = mlx5_is_removed,
+ .get_module_info = mlx5_get_module_info,
+ .get_module_eeprom = mlx5_get_module_eeprom,
+ .hairpin_cap_get = mlx5_hairpin_cap_get,
+ .mtr_ops_get = mlx5_flow_meter_ops_get,
+};
+
+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ * Key argument to verify.
+ * @param[in] val
+ * Value associated with key.
+ * @param opaque
+ * User data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct mlx5_dev_config *config = opaque;
+ unsigned long tmp;
+
+ /* No-op, port representors are processed in mlx5_dev_spawn(). */
+ if (!strcmp(MLX5_REPRESENTOR, key))
+ return 0;
+ errno = 0;
+ tmp = strtoul(val, NULL, 0);
+ if (errno) {
+ rte_errno = errno;
+ DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
+ return -rte_errno;
+ }
+ if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+ config->cqe_comp = !!tmp;
+ } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
+ config->cqe_pad = !!tmp;
+ } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
+ config->hw_padding = !!tmp;
+ } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
+ config->mprq.enabled = !!tmp;
+ } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
+ config->mprq.stride_num_n = tmp;
+ } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
+ config->mprq.stride_size_n = tmp;
+ } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
+ config->mprq.max_memcpy_len = tmp;
+ } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
+ config->mprq.min_rxqs_num = tmp;
+ } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
+ DRV_LOG(WARNING, "%s: deprecated parameter,"
+ " converted to txq_inline_max", key);
+ config->txq_inline_max = tmp;
+ } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
+ config->txq_inline_max = tmp;
+ } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
+ config->txq_inline_min = tmp;
+ } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
+ config->txq_inline_mpw = tmp;
+ } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
+ config->txqs_inline = tmp;
+ } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
+ DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+ } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
+ config->mps = !!tmp;
+ } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
+ if (tmp != MLX5_TXDB_CACHED &&
+ tmp != MLX5_TXDB_NCACHED &&
+ tmp != MLX5_TXDB_HEURISTIC) {
+ DRV_LOG(ERR, "invalid Tx doorbell "
+ "mapping parameter");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ config->dbnc = tmp;
+ } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
+ DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+ } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
+ DRV_LOG(WARNING, "%s: deprecated parameter,"
+ " converted to txq_inline_mpw", key);
+ config->txq_inline_mpw = tmp;
+ } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
+ DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+ } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
+ config->rx_vec_en = !!tmp;
+ } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
+ config->l3_vxlan_en = !!tmp;
+ } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
+ config->vf_nl_en = !!tmp;
+ } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
+ config->dv_esw_en = !!tmp;
+ } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
+ config->dv_flow_en = !!tmp;
+ } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
+ if (tmp != MLX5_XMETA_MODE_LEGACY &&
+ tmp != MLX5_XMETA_MODE_META16 &&
+ tmp != MLX5_XMETA_MODE_META32) {
+ DRV_LOG(ERR, "invalid extensive "
+ "metadata parameter");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ config->dv_xmeta_en = tmp;
+ } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
+ config->mr_ext_memseg_en = !!tmp;
+ } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
+ config->max_dump_files_num = tmp;
+ } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
+ config->lro.timeout = tmp;
+ } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
+ DRV_LOG(DEBUG, "class argument is %s.", val);
+ } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
+ config->log_hp_size = tmp;
+ } else {
+ DRV_LOG(WARNING, "%s: unknown parameter", key);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param config
+ * Pointer to device configuration structure.
+ * @param devargs
+ * Device arguments structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
+{
+ const char **params = (const char *[]){
+ MLX5_RXQ_CQE_COMP_EN,
+ MLX5_RXQ_CQE_PAD_EN,
+ MLX5_RXQ_PKT_PAD_EN,
+ MLX5_RX_MPRQ_EN,
+ MLX5_RX_MPRQ_LOG_STRIDE_NUM,
+ MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
+ MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
+ MLX5_RXQS_MIN_MPRQ,
+ MLX5_TXQ_INLINE,
+ MLX5_TXQ_INLINE_MIN,
+ MLX5_TXQ_INLINE_MAX,
+ MLX5_TXQ_INLINE_MPW,
+ MLX5_TXQS_MIN_INLINE,
+ MLX5_TXQS_MAX_VEC,
+ MLX5_TXQ_MPW_EN,
+ MLX5_TXQ_MPW_HDR_DSEG_EN,
+ MLX5_TXQ_MAX_INLINE_LEN,
+ MLX5_TX_DB_NC,
+ MLX5_TX_VEC_EN,
+ MLX5_RX_VEC_EN,
+ MLX5_L3_VXLAN_EN,
+ MLX5_VF_NL_EN,
+ MLX5_DV_ESW_EN,
+ MLX5_DV_FLOW_EN,
+ MLX5_DV_XMETA_EN,
+ MLX5_MR_EXT_MEMSEG_EN,
+ MLX5_REPRESENTOR,
+ MLX5_MAX_DUMP_FILES_NUM,
+ MLX5_LRO_TIMEOUT_USEC,
+ MLX5_CLASS_ARG_NAME,
+ MLX5_HP_BUF_SIZE,
+ NULL,
+ };
+ struct rte_kvargs *kvlist;
+ int ret = 0;
+ int i;
+
+ if (devargs == NULL)
+ return 0;
+ /* Following UGLY cast is done to pass checkpatch. */
+ kvlist = rte_kvargs_parse(devargs->args, params);
+ if (kvlist == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ /* Process parameters. */
+ for (i = 0; (params[i] != NULL); ++i) {
+ if (rte_kvargs_count(kvlist, params[i])) {
+ ret = rte_kvargs_process(kvlist, params[i],
+ mlx5_args_check, config);
+ if (ret) {
+ rte_errno = EINVAL;
+ rte_kvargs_free(kvlist);
+ return -rte_errno;
+ }
+ }
+ }
+ rte_kvargs_free(kvlist);
+ return 0;
+}
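+
+/*
+ * Illustrative (hypothetical) devargs example handled by the parser above,
+ * e.g. as given on the EAL command line:
+ *   -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=256
+ * Only keys from the params[] list are accepted; values are converted with
+ * strtoul() and unrecognized keys are rejected with EINVAL.
+ */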
+
+static struct rte_pci_driver mlx5_driver;
+
+/**
+ * PMD global initialization.
+ *
+ * Independently of any individual device, this function initializes global
+ * per-PMD data structures, distinguishing primary and secondary processes.
+ * Hence, it is called once per process.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_once(void)
+{
+ struct mlx5_shared_data *sd;
+ struct mlx5_local_data *ld = &mlx5_local_data;
+ int ret = 0;
+
+ if (mlx5_init_shared_data())
+ return -rte_errno;
+ sd = mlx5_shared_data;
+ MLX5_ASSERT(sd);
+ rte_spinlock_lock(&sd->lock);
+ switch (rte_eal_process_type()) {
+ case RTE_PROC_PRIMARY:
+ if (sd->init_done)
+ break;
+ LIST_INIT(&sd->mem_event_cb_list);
+ rte_rwlock_init(&sd->mem_event_rwlock);
+ rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
+ mlx5_mr_mem_event_cb, NULL);
+ ret = mlx5_mp_init_primary(MLX5_MP_NAME,
+ mlx5_mp_primary_handle);
+ if (ret)
+ goto out;
+ sd->init_done = true;
+ break;
+ case RTE_PROC_SECONDARY:
+ if (ld->init_done)
+ break;
+ ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
+ mlx5_mp_secondary_handle);
+ if (ret)
+ goto out;
+ ++sd->secondary_cnt;
+ ld->init_done = true;
+ break;
+ default:
+ break;
+ }
+out:
+ rte_spinlock_unlock(&sd->lock);
+ return ret;
+}
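+
+/*
+ * Note: the initialization above is serialized on the shared-data spinlock,
+ * so concurrent probes within one process observe init_done consistently;
+ * a secondary process only registers its MP handler, bumps secondary_cnt
+ * and sets the per-process init_done flag.
+ */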
+
+/**
+ * Configures the minimal amount of data to inline into WQE
+ * while sending packets.
+ *
+ * - txq_inline_min has the highest priority if this
+ *   key is specified in devargs,
+ * - if DevX is enabled, the inline mode is queried from the
+ *   device (HCA attributes and NIC vport context if needed),
+ * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
+ *   and none (0 bytes) for other NICs.
+ *
+ * @param spawn
+ * Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ * Device configuration parameters.
+ */
+static void
+mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
+ struct mlx5_dev_config *config)
+{
+ if (config->txq_inline_min != MLX5_ARG_UNSET) {
+ /* Application defines size of inlined data explicitly. */
+ switch (spawn->pci_dev->id.device_id) {
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+ if (config->txq_inline_min <
+ (int)MLX5_INLINE_HSIZE_L2) {
+ DRV_LOG(DEBUG,
+ "txq_inline_mix aligned to minimal"
+ " ConnectX-4 required value %d",
+ (int)MLX5_INLINE_HSIZE_L2);
+ config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+ }
+ break;
+ }
+ goto exit;
+ }
+ if (config->hca_attr.eth_net_offloads) {
+ /* We have DevX enabled, inline mode queried successfully. */
+ switch (config->hca_attr.wqe_inline_mode) {
+ case MLX5_CAP_INLINE_MODE_L2:
+ /* outer L2 header must be inlined. */
+ config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+ goto exit;
+ case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+ /* No inline data are required by NIC. */
+ config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+ config->hw_vlan_insert =
+ config->hca_attr.wqe_vlan_insert;
+ DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
+ goto exit;
+ case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+ /* inline mode is defined by NIC vport context. */
+ if (!config->hca_attr.eth_virt)
+ break;
+ switch (config->hca_attr.vport_inline_mode) {
+ case MLX5_INLINE_MODE_NONE:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_NONE;
+ goto exit;
+ case MLX5_INLINE_MODE_L2:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_L2;
+ goto exit;
+ case MLX5_INLINE_MODE_IP:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_L3;
+ goto exit;
+ case MLX5_INLINE_MODE_TCP_UDP:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_L4;
+ goto exit;
+ case MLX5_INLINE_MODE_INNER_L2:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_INNER_L2;
+ goto exit;
+ case MLX5_INLINE_MODE_INNER_IP:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_INNER_L3;
+ goto exit;
+ case MLX5_INLINE_MODE_INNER_TCP_UDP:
+ config->txq_inline_min =
+ MLX5_INLINE_HSIZE_INNER_L4;
+ goto exit;
+ }
+ }
+ }
+ /*
+ * We get here if we are unable to deduce
+ * inline data size with DevX. Try PCI ID
+ * to determine old NICs.
+ */
+ switch (spawn->pci_dev->id.device_id) {
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+ config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
+ config->hw_vlan_insert = 0;
+ break;
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+ /*
+ * These NICs support VLAN insertion from WQE and
+ * report the wqe_vlan_insert flag. However, there is a bug
+ * that may break PFC control, so the feature is disabled.
+ */
+ config->hw_vlan_insert = 0;
+ config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+ break;
+ default:
+ config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
+ break;
+ }
+exit:
+ DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
+}
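+
+/*
+ * Summary of the vport inline-mode to header-size mapping applied above:
+ * NONE -> HSIZE_NONE, L2 -> HSIZE_L2, IP -> HSIZE_L3, TCP_UDP -> HSIZE_L4,
+ * and the INNER_* modes map to the corresponding HSIZE_INNER_* values.
+ */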
+
+/**
+ * Configures the metadata mask fields in the shared context.
+ *
+ * @param [in] dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_set_metadata_mask(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ uint32_t meta, mark, reg_c0;
+
+ reg_c0 = ~priv->vport_meta_mask;
+ switch (priv->config.dv_xmeta_en) {
+ case MLX5_XMETA_MODE_LEGACY:
+ meta = UINT32_MAX;
+ mark = MLX5_FLOW_MARK_MASK;
+ break;
+ case MLX5_XMETA_MODE_META16:
+ meta = reg_c0 >> rte_bsf32(reg_c0);
+ mark = MLX5_FLOW_MARK_MASK;
+ break;
+ case MLX5_XMETA_MODE_META32:
+ meta = UINT32_MAX;
+ mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
+ break;
+ default:
+ meta = 0;
+ mark = 0;
+ MLX5_ASSERT(false);
+ break;
+ }
+ if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
+ DRV_LOG(WARNING, "metadata MARK mask mismatche %08X:%08X",
+ sh->dv_mark_mask, mark);
+ else
+ sh->dv_mark_mask = mark;
+ if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
+ DRV_LOG(WARNING, "metadata META mask mismatche %08X:%08X",
+ sh->dv_meta_mask, meta);
+ else
+ sh->dv_meta_mask = meta;
+ if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
+ DRV_LOG(WARNING, "metadata reg_c0 mask mismatche %08X:%08X",
+ sh->dv_meta_mask, reg_c0);
+ else
+ sh->dv_regc0_mask = reg_c0;
+ DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
+ DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
+ DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
+ DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
+}
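+
+/*
+ * Worked example with a hypothetical vport_meta_mask of 0xffff0000:
+ * reg_c0 becomes 0x0000ffff; in META16 mode the META mask is reg_c0
+ * shifted down to its least significant set bit, i.e. 0x0000ffff, while
+ * MARK keeps the full MLX5_FLOW_MARK_MASK.
+ */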
+
+/**
+ * Allocate page of door-bells and register it using DevX API.
+ *
+ * @param [in] dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Pointer to new page on success, NULL otherwise.
+ */
+static struct mlx5_devx_dbr_page *
+mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_devx_dbr_page *page;
+
+ /* Allocate space for door-bell page and management data. */
+ page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+ if (!page) {
+ DRV_LOG(ERR, "port %u cannot allocate dbr page",
+ dev->data->port_id);
+ return NULL;
+ }
+ /* Register allocated memory. */
+ page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
+ MLX5_DBR_PAGE_SIZE, 0);
+ if (!page->umem) {
+ DRV_LOG(ERR, "port %u cannot umem reg dbr page",
+ dev->data->port_id);
+ rte_free(page);
+ return NULL;
+ }
+ return page;
+}
+
+/**
+ * Find the next available door-bell, allocate new page if needed.
+ *
+ * @param [in] dev
+ * Pointer to Ethernet device.
+ * @param [out] dbr_page
+ * Door-bell page containing the page data.
+ *
+ * @return
+ * Door-bell address offset on success, a negative error value otherwise.
+ */
+int64_t
+mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_devx_dbr_page *page = NULL;
+ uint32_t i, j;
+
+ LIST_FOREACH(page, &priv->dbrpgs, next)
+ if (page->dbr_count < MLX5_DBR_PER_PAGE)
+ break;
+ if (!page) { /* No page with free door-bell exists. */
+ page = mlx5_alloc_dbr_page(dev);
+ if (!page) /* Failed to allocate new page. */
+ return (-1);
+ LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
+ }
+ /* Loop to find bitmap part with clear bit. */
+ for (i = 0;
+ i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
+ i++)
+ ; /* Empty. */
+ /* Find the first clear bit. */
+ MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE);
+ j = rte_bsf64(~page->dbr_bitmap[i]);
+ page->dbr_bitmap[i] |= (UINT64_C(1) << j);
+ page->dbr_count++;
+ *dbr_page = page;
+ return (((i * 64) + j) * sizeof(uint64_t));
+}
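+
+/*
+ * Example: the first door-bell taken from a fresh page has i == 0 and
+ * j == 0, hence offset 0; the next one gets offset sizeof(uint64_t), and
+ * so on. mlx5_release_dbr() below reverses this by dividing the offset by
+ * MLX5_DBR_SIZE and clearing the corresponding bitmap bit.
+ */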
+
+/**
+ * Release a door-bell record.
+ *
+ * @param [in] dev
+ * Pointer to Ethernet device.
+ * @param [in] umem_id
+ * UMEM ID of page containing the door-bell record to release.
+ * @param [in] offset
+ * Offset of door-bell record in page.
+ *
+ * @return
+ * 0 on success, a negative error value otherwise.
+ */
+int32_t
+mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_devx_dbr_page *page = NULL;
+ int ret = 0;
+
+ LIST_FOREACH(page, &priv->dbrpgs, next)
+ /* Find the page this address belongs to. */
+ if (page->umem->umem_id == umem_id)
+ break;
+ if (!page)
+ return -EINVAL;
+ page->dbr_count--;
+ if (!page->dbr_count) {
+ /* Page not used, free it and remove from list. */
+ LIST_REMOVE(page, next);
+ if (page->umem)
+ ret = -mlx5_glue->devx_umem_dereg(page->umem);
+ rte_free(page);
+ } else {
+ /* Mark in bitmap that this door-bell is not in use. */
+ offset /= MLX5_DBR_SIZE;
+ int i = offset / 64;
+ int j = offset % 64;
+
+ page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
+ }
+ return ret;
+}
+
+int
+rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
+{
+ static const char *const dynf_names[] = {
+ RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
+ RTE_MBUF_DYNFLAG_METADATA_NAME
+ };
+ unsigned int i;
+
+ if (n < RTE_DIM(dynf_names))
+ return -ENOMEM;
+ for (i = 0; i < RTE_DIM(dynf_names); i++) {
+ if (names[i] == NULL)
+ return -EINVAL;
+ strcpy(names[i], dynf_names[i]);
+ }
+ return RTE_DIM(dynf_names);
+}
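+
+/*
+ * Hypothetical caller-side sketch (buffer size is an assumption):
+ *   char buf[2][RTE_MBUF_DYN_NAMESIZE];
+ *   char *names[2] = { buf[0], buf[1] };
+ *   int n = rte_pmd_mlx5_get_dyn_flag_names(names, 2);
+ * A non-negative n is the number of names filled in; -ENOMEM means the
+ * array is too small and -EINVAL means a NULL entry was passed.
+ */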
+
+/**
+ * Check sibling device configurations.
+ *
+ * Sibling devices sharing the Infiniband device context
+ * should have compatible configurations. This regards
+ * representors and bonding slaves.
+ *
+ * @param priv
+ * Private device descriptor.
+ * @param config
+ * Configuration of the device to be created.
+ *
+ * @return
+ * 0 on success, EINVAL otherwise
+ */
+static int
+mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
+ struct mlx5_dev_config *config)
+{
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_dev_config *sh_conf = NULL;
+ uint16_t port_id;
+
+ MLX5_ASSERT(sh);
+ /* Nothing to compare for the single/first device. */
+ if (sh->refcnt == 1)
+ return 0;
+ /* Find the device with shared context. */
+ MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (opriv && opriv != priv && opriv->sh == sh) {
+ sh_conf = &opriv->config;
+ break;
+ }
+ }
+ if (!sh_conf)
+ return 0;
+ if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
+ DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
+ " for shared %s context", sh->ibdev_name);
+ rte_errno = EINVAL;
+ return rte_errno;
+ }
+ if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
+ DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
+ " for shared %s context", sh->ibdev_name);
+ rte_errno = EINVAL;
+ return rte_errno;
+ }
+ return 0;
+}
+/**
+ * Spawn an Ethernet device from Verbs information.
+ *
+ * @param dpdk_dev
+ * Backing DPDK device.
+ * @param spawn
+ * Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ * Device configuration parameters.
+ *
+ * @return
+ * A valid Ethernet device object on success, NULL otherwise and rte_errno
+ * is set. The following errors are defined:
+ *
+ * EBUSY: device is not supposed to be spawned.
+ * EEXIST: device is already spawned.
+ */
+static struct rte_eth_dev *
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+ struct mlx5_dev_spawn_data *spawn,
+ struct mlx5_dev_config config)
+{
+ const struct mlx5_switch_info *switch_info = &spawn->info;
+ struct mlx5_ibv_shared *sh = NULL;
+ struct ibv_port_attr port_attr;
+ struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+ struct rte_eth_dev *eth_dev = NULL;
+ struct mlx5_priv *priv = NULL;
+ int err = 0;
+ unsigned int hw_padding = 0;
+ unsigned int mps;
+ unsigned int cqe_comp;
+ unsigned int cqe_pad = 0;
+ unsigned int tunnel_en = 0;
+ unsigned int mpls_en = 0;
+ unsigned int swp = 0;
+ unsigned int mprq = 0;
+ unsigned int mprq_min_stride_size_n = 0;
+ unsigned int mprq_max_stride_size_n = 0;
+ unsigned int mprq_min_stride_num_n = 0;
+ unsigned int mprq_max_stride_num_n = 0;
+ struct rte_ether_addr mac;
+ char name[RTE_ETH_NAME_MAX_LEN];
+ int own_domain_id = 0;
+ uint16_t port_id;
+ unsigned int i;
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+ struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
+#endif
+
+ /* Determine if this port representor is supposed to be spawned. */
+ if (switch_info->representor && dpdk_dev->devargs) {
+ struct rte_eth_devargs eth_da;
+
+ err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
+ if (err) {
+ rte_errno = -err;
+ DRV_LOG(ERR, "failed to process device arguments: %s",
+ strerror(rte_errno));
+ return NULL;
+ }
+ for (i = 0; i < eth_da.nb_representor_ports; ++i)
+ if (eth_da.representor_ports[i] ==
+ (uint16_t)switch_info->port_name)
+ break;
+ if (i == eth_da.nb_representor_ports) {
+ rte_errno = EBUSY;
+ return NULL;
+ }
+ }
+ /* Build device name. */
+ if (spawn->pf_bond < 0) {
+ /* Single device. */
+ if (!switch_info->representor)
+ strlcpy(name, dpdk_dev->name, sizeof(name));
+ else
+ snprintf(name, sizeof(name), "%s_representor_%u",
+ dpdk_dev->name, switch_info->port_name);
+ } else {
+ /* Bonding device. */
+ if (!switch_info->representor)
+ snprintf(name, sizeof(name), "%s_%s",
+ dpdk_dev->name, spawn->ibv_dev->name);
+ else
+ snprintf(name, sizeof(name), "%s_%s_representor_%u",
+ dpdk_dev->name, spawn->ibv_dev->name,
+ switch_info->port_name);
+ }
+ /* check if the device is already spawned */
+ if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
+ rte_errno = EEXIST;
+ return NULL;
+ }
+ DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ struct mlx5_mp_id mp_id;
+
+ eth_dev = rte_eth_dev_attach_secondary(name);
+ if (eth_dev == NULL) {
+ DRV_LOG(ERR, "can not attach rte ethdev");
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ eth_dev->device = dpdk_dev;
+ eth_dev->dev_ops = &mlx5_dev_sec_ops;
+ err = mlx5_proc_priv_init(eth_dev);
+ if (err)
+ return NULL;
+ mp_id.port_id = eth_dev->data->port_id;
+ strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+ /* Receive command fd from primary process */
+ err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
+ if (err < 0)
+ return NULL;
+ /* Remap UAR for Tx queues. */
+ err = mlx5_tx_uar_init_secondary(eth_dev, err);
+ if (err)
+ return NULL;
+ /*
+ * Ethdev pointer is still required as input since
+ * the primary device is not accessible from the
+ * secondary process.
+ */
+ eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+ eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+ return eth_dev;
+ }
+ /*
+ * Some parameters ("tx_db_nc" in particularly) are needed in
+ * advance to create dv/verbs device context. We proceed the
+ * devargs here to get ones, and later proceed devargs again
+ * to override some hardware settings.
+ */
+ err = mlx5_args(&config, dpdk_dev->devargs);
+ if (err) {
+ err = rte_errno;
+ DRV_LOG(ERR, "failed to process device arguments: %s",
+ strerror(rte_errno));
+ goto error;
+ }
+ sh = mlx5_alloc_shared_ibctx(spawn, &config);
+ if (!sh)
+ return NULL;
+ config.devx = sh->devx;
+#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
+ config.dest_tir = 1;
+#endif
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+#endif
+ /*
+ * Multi-packet send is supported by ConnectX-4 Lx PF as well
+ * as all ConnectX-5 devices.
+ */
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+#endif
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+#endif
+ mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
+ if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+ if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+ DRV_LOG(DEBUG, "enhanced MPW is supported");
+ mps = MLX5_MPW_ENHANCED;
+ } else {
+ DRV_LOG(DEBUG, "MPW is supported");
+ mps = MLX5_MPW;
+ }
+ } else {
+ DRV_LOG(DEBUG, "MPW isn't supported");
+ mps = MLX5_MPW_DISABLED;
+ }
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+ swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
+ DRV_LOG(DEBUG, "SWP support: %u", swp);
+#endif
+ config.swp = !!swp;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+ struct mlx5dv_striding_rq_caps mprq_caps =
+ dv_attr.striding_rq_caps;
+
+ DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
+ mprq_caps.min_single_stride_log_num_of_bytes);
+ DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
+ mprq_caps.max_single_stride_log_num_of_bytes);
+ DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
+ mprq_caps.min_single_wqe_log_num_of_strides);
+ DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
+ mprq_caps.max_single_wqe_log_num_of_strides);
+ DRV_LOG(DEBUG, "\tsupported_qpts: %d",
+ mprq_caps.supported_qpts);
+ DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
+ mprq = 1;
+ mprq_min_stride_size_n =
+ mprq_caps.min_single_stride_log_num_of_bytes;
+ mprq_max_stride_size_n =
+ mprq_caps.max_single_stride_log_num_of_bytes;
+ mprq_min_stride_num_n =
+ mprq_caps.min_single_wqe_log_num_of_strides;
+ mprq_max_stride_num_n =
+ mprq_caps.max_single_wqe_log_num_of_strides;
+ }
+#endif
+ if (RTE_CACHE_LINE_SIZE == 128 &&
+ !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+ cqe_comp = 0;
+ else
+ cqe_comp = 1;
+ config.cqe_comp = cqe_comp;
+#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
+ /* Whether device supports 128B Rx CQE padding. */
+ cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
+ (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
+#endif
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+ tunnel_en = ((dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
+ }
+ DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
+ tunnel_en ? "" : "not ");
+#else
+ DRV_LOG(WARNING,
+ "tunnel offloading disabled due to old OFED/rdma-core version");
+#endif
+ config.tunnel_en = tunnel_en;
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ mpls_en = ((dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
+ DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
+ mpls_en ? "" : "not ");
+#else
+ DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
+ " old OFED/rdma-core version or firmware configuration");
+#endif
+ config.mpls_en = mpls_en;
+ /* Check port status. */
+ err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
+ if (err) {
+ DRV_LOG(ERR, "port query failed: %s", strerror(err));
+ goto error;
+ }
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ DRV_LOG(ERR, "port is not configured in Ethernet mode");
+ err = EINVAL;
+ goto error;
+ }
+ if (port_attr.state != IBV_PORT_ACTIVE)
+ DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+ mlx5_glue->port_state_str(port_attr.state),
+ port_attr.state);
+ /* Allocate private eth device data. */
+ priv = rte_zmalloc("ethdev private structure",
+ sizeof(*priv),
+ RTE_CACHE_LINE_SIZE);
+ if (priv == NULL) {
+ DRV_LOG(ERR, "priv allocation failure");
+ err = ENOMEM;
+ goto error;
+ }
+ priv->sh = sh;
+ priv->ibv_port = spawn->ibv_port;
+ priv->pci_dev = spawn->pci_dev;
+ priv->mtu = RTE_ETHER_MTU;
+ priv->mp_id.port_id = port_id;
+ strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+#ifndef RTE_ARCH_64
+ /* Initialize UAR access locks for 32bit implementations. */
+ rte_spinlock_init(&priv->uar_lock_cq);
+ for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
+ rte_spinlock_init(&priv->uar_lock[i]);
+#endif
+ /* Some internal functions rely on Netlink sockets, open them now. */
+ priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+ priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
+ priv->representor = !!switch_info->representor;
+ priv->master = !!switch_info->master;
+ priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+ priv->vport_meta_tag = 0;
+ priv->vport_meta_mask = 0;
+ priv->pf_bond = spawn->pf_bond;
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+ /*
+ * The DevX port query API is implemented. E-Switch may use
+ * either vport or reg_c[0] metadata register to match on
+ * vport index. The engaged part of metadata register is
+ * defined by mask.
+ */
+ if (switch_info->representor || switch_info->master) {
+ devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
+ MLX5DV_DEVX_PORT_MATCH_REG_C_0;
+ err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
+ &devx_port);
+ if (err) {
+ DRV_LOG(WARNING,
+ "can't query devx port %d on device %s",
+ spawn->ibv_port, spawn->ibv_dev->name);
+ devx_port.comp_mask = 0;
+ }
+ }
+ if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
+ priv->vport_meta_tag = devx_port.reg_c_0.value;
+ priv->vport_meta_mask = devx_port.reg_c_0.mask;
+ if (!priv->vport_meta_mask) {
+ DRV_LOG(ERR, "vport zero mask for port %d"
+ " on bonding device %s",
+ spawn->ibv_port, spawn->ibv_dev->name);
+ err = ENOTSUP;
+ goto error;
+ }
+ if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
+ DRV_LOG(ERR, "invalid vport tag for port %d"
+ " on bonding device %s",
+ spawn->ibv_port, spawn->ibv_dev->name);
+ err = ENOTSUP;
+ goto error;
+ }
+ }
+ if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
+ priv->vport_id = devx_port.vport_num;
+ } else if (spawn->pf_bond >= 0) {
+ DRV_LOG(ERR, "can't deduce vport index for port %d"
+ " on bonding device %s",
+ spawn->ibv_port, spawn->ibv_dev->name);
+ err = ENOTSUP;
+ goto error;
+ } else {
+ /* Deduce the vport index in a compatible (legacy) way. */
+ priv->vport_id = switch_info->representor ?
+ switch_info->port_name + 1 : -1;
+ }
+#else
+ /*
+ * Kernel/rdma_core supports single E-Switch per PF configurations
+ * only and the vport_id field contains the vport index for the
+ * associated VF, which is deduced from the representor port name.
+ * For example, let's say the IB device port 10 has the attached
+ * network device eth0, whose port name attribute is pf0vf2; we can
+ * deduce the VF number as 2 and set the vport index to 3 (2+1).
+ * This assignment scheme should be changed if multiple E-Switch
+ * instances per PF and/or PCI subfunctions are added.
+ */
+ priv->vport_id = switch_info->representor ?
+ switch_info->port_name + 1 : -1;
+#endif
+ /* representor_id field keeps the unmodified VF index. */
+ priv->representor_id = switch_info->representor ?
+ switch_info->port_name : -1;
+ /*
+ * Look for sibling devices in order to reuse their switch domain
+ * if any, otherwise allocate one.
+ */
+ MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+ const struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (!opriv ||
+ opriv->sh != priv->sh ||
+ opriv->domain_id ==
+ RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+ continue;
+ priv->domain_id = opriv->domain_id;
+ break;
+ }
+ if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+ err = rte_eth_switch_domain_alloc(&priv->domain_id);
+ if (err) {
+ err = rte_errno;
+ DRV_LOG(ERR, "unable to allocate switch domain: %s",
+ strerror(rte_errno));
+ goto error;
+ }
+ own_domain_id = 1;
+ }
+ /* Override some values set by hardware configuration. */
+ mlx5_args(&config, dpdk_dev->devargs);
+ err = mlx5_dev_check_sibling_config(priv, &config);
+ if (err)
+ goto error;
+ config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
+ IBV_DEVICE_RAW_IP_CSUM);
+ DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+ (config.hw_csum ? "" : "not "));
+#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
+ !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ DRV_LOG(DEBUG, "counters are not supported");
+#endif
+#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
+ if (config.dv_flow_en) {
+ DRV_LOG(WARNING, "DV flow is not supported");
+ config.dv_flow_en = 0;
+ }
+#endif
+ config.ind_table_max_size =
+ sh->device_attr.rss_caps.max_rwq_indirection_table_size;
+ /*
+ * Remove this check once DPDK supports larger/variable
+ * indirection tables.
+ */
+ if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+ config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+ DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+ config.ind_table_max_size);
+ config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+ DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+ (config.hw_vlan_strip ? "" : "not "));
+ config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_SCATTER_FCS);
+ DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+ (config.hw_fcs_strip ? "" : "not "));
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+ hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+ hw_padding = !!(sh->device_attr.device_cap_flags_ex &
+ IBV_DEVICE_PCI_WRITE_END_PADDING);
+#endif
+ if (config.hw_padding && !hw_padding) {
+ DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
+ config.hw_padding = 0;
+ } else if (config.hw_padding) {
+ DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
+ }
+ config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
+ (sh->device_attr.tso_caps.supported_qpts &
+ (1 << IBV_QPT_RAW_PACKET)));
+ if (config.tso)
+ config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
+ /*
+ * MPW is disabled by default, while the Enhanced MPW is enabled
+ * by default.
+ */
+ if (config.mps == MLX5_ARG_UNSET)
+ config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
+ MLX5_MPW_DISABLED;
+ else
+ config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
+ DRV_LOG(INFO, "%sMPS is %s",
+ config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
+ config.mps == MLX5_MPW ? "legacy " : "",
+ config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+ if (config.cqe_comp && !cqe_comp) {
+ DRV_LOG(WARNING, "Rx CQE compression isn't supported");
+ config.cqe_comp = 0;
+ }
+ if (config.cqe_pad && !cqe_pad) {
+ DRV_LOG(WARNING, "Rx CQE padding isn't supported");
+ config.cqe_pad = 0;
+ } else if (config.cqe_pad) {
+ DRV_LOG(INFO, "Rx CQE padding is enabled");
+ }
+ if (config.devx) {
+ priv->counter_fallback = 0;
+ err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
+ if (err) {
+ err = -err;
+ goto error;
+ }
+ if (!config.hca_attr.flow_counters_dump)
+ priv->counter_fallback = 1;
+#ifndef HAVE_IBV_DEVX_ASYNC
+ priv->counter_fallback = 1;
+#endif
+ if (priv->counter_fallback)
+ DRV_LOG(INFO, "Use fall-back DV counter management");
+ /* Check for LRO support. */
+ if (config.dest_tir && config.hca_attr.lro_cap &&
+ config.dv_flow_en) {
+ /* TBD check tunnel lro caps. */
+ config.lro.supported = config.hca_attr.lro_cap;
+ DRV_LOG(DEBUG, "Device supports LRO");
+ /*
+ * If LRO timeout is not configured by application,
+ * use the minimal supported value.
+ */
+ if (!config.lro.timeout)
+ config.lro.timeout =
+ config.hca_attr.lro_timer_supported_periods[0];
+ DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
+ config.lro.timeout);
+ }
+#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
+ if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
+ config.dv_flow_en) {
+ uint8_t reg_c_mask =
+ config.hca_attr.qos.flow_meter_reg_c_ids;
+ /*
+ * The meter needs two REG_Cs: one for color match and one for
+ * pre-suffix flow match. Here get the REG_C for color match.
+ * REG_C_0 and REG_C_1 are reserved for the metadata feature.
+ */
+ reg_c_mask &= 0xfc;
+ if (__builtin_popcount(reg_c_mask) < 1) {
+ priv->mtr_en = 0;
+ DRV_LOG(WARNING, "No available register for"
+ " meter.");
+ } else {
+ priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
+ REG_C_0;
+ priv->mtr_en = 1;
+ priv->mtr_reg_share =
+ config.hca_attr.qos.flow_meter_reg_share;
+ DRV_LOG(DEBUG, "The REG_C meter uses is %d",
+ priv->mtr_color_reg);
+ }
+ }
+#endif
+ }
+ if (config.mprq.enabled && mprq) {
+ if (config.mprq.stride_num_n &&
+ (config.mprq.stride_num_n > mprq_max_stride_num_n ||
+ config.mprq.stride_num_n < mprq_min_stride_num_n)) {
+ config.mprq.stride_num_n =
+ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+ mprq_min_stride_num_n),
+ mprq_max_stride_num_n);
+ DRV_LOG(WARNING,
+ "the number of strides"
+ " for Multi-Packet RQ is out of range,"
+ " setting default value (%u)",
+ 1 << config.mprq.stride_num_n);
+ }
+ if (config.mprq.stride_size_n &&
+ (config.mprq.stride_size_n > mprq_max_stride_size_n ||
+ config.mprq.stride_size_n < mprq_min_stride_size_n)) {
+ config.mprq.stride_size_n =
+ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
+ mprq_min_stride_size_n),
+ mprq_max_stride_size_n);
+ DRV_LOG(WARNING,
+ "the size of a stride"
+ " for Multi-Packet RQ is out of range,"
+ " setting default value (%u)",
+ 1 << config.mprq.stride_size_n);
+ }
+ config.mprq.min_stride_size_n = mprq_min_stride_size_n;
+ config.mprq.max_stride_size_n = mprq_max_stride_size_n;
+ } else if (config.mprq.enabled && !mprq) {
+ DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
+ config.mprq.enabled = 0;
+ }
+ if (config.max_dump_files_num == 0)
+ config.max_dump_files_num = 128;
+ eth_dev = rte_eth_dev_allocate(name);
+ if (eth_dev == NULL) {
+ DRV_LOG(ERR, "can not allocate rte ethdev");
+ err = ENOMEM;
+ goto error;
+ }
+ /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
+ if (priv->representor) {
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
+ eth_dev->data->representor_id = priv->representor_id;
+ }
+ /*
+ * Store the associated network device interface index. This index
+ * is permanent throughout the lifetime of the device, so we may
+ * store the ifindex here and use the cached value later.
+ */
+ MLX5_ASSERT(spawn->ifindex);
+ priv->if_index = spawn->ifindex;
+ eth_dev->data->dev_private = priv;
+ priv->dev_data = eth_dev->data;
+ eth_dev->data->mac_addrs = priv->mac;
+ eth_dev->device = dpdk_dev;
+ /* Configure the first MAC address by default. */
+ if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+ DRV_LOG(ERR,
+ "port %u cannot get MAC address, is mlx5_en"
+ " loaded? (errno: %s)",
+ eth_dev->data->port_id, strerror(rte_errno));
+ err = ENODEV;
+ goto error;
+ }
+ DRV_LOG(INFO,
+ "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+ eth_dev->data->port_id,
+ mac.addr_bytes[0], mac.addr_bytes[1],
+ mac.addr_bytes[2], mac.addr_bytes[3],
+ mac.addr_bytes[4], mac.addr_bytes[5]);
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ {
+ char ifname[IF_NAMESIZE];
+
+ if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+ DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+ eth_dev->data->port_id, ifname);
+ else
+ DRV_LOG(DEBUG, "port %u ifname is unknown",
+ eth_dev->data->port_id);
+ }
+#endif
+ /* Get actual MTU if possible. */
+ err = mlx5_get_mtu(eth_dev, &priv->mtu);
+ if (err) {
+ err = rte_errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+ priv->mtu);
+ /* Initialize burst functions to prevent crashes before link-up. */
+ eth_dev->rx_pkt_burst = removed_rx_burst;
+ eth_dev->tx_pkt_burst = removed_tx_burst;
+ eth_dev->dev_ops = &mlx5_dev_ops;
+ /* Register MAC address. */
+ claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+ if (config.vf && config.vf_nl_en)
+ mlx5_nl_mac_addr_sync(priv->nl_socket_route,
+ mlx5_ifindex(eth_dev),
+ eth_dev->data->mac_addrs,
+ MLX5_MAX_MAC_ADDRESSES);
+ priv->flows = 0;
+ priv->ctrl_flows = 0;
+ TAILQ_INIT(&priv->flow_meters);
+ TAILQ_INIT(&priv->flow_meter_profiles);
+ /* Hint libmlx5 to use PMD allocator for data plane resources */
+ struct mlx5dv_ctx_allocators alctr = {
+ .alloc = &mlx5_alloc_verbs_buf,
+ .free = &mlx5_free_verbs_buf,
+ .data = priv,
+ };
+ mlx5_glue->dv_set_context_attr(sh->ctx,
+ MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+ (void *)((uintptr_t)&alctr));
+ /* Bring Ethernet device up. */
+ DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+ eth_dev->data->port_id);
+ mlx5_set_link_up(eth_dev);
+ /*
+ * Even though the interrupt handler is not installed yet,
+ * interrupts will still trigger on the async_fd from
+ * Verbs context returned by ibv_open_device().
+ */
+ mlx5_link_update(eth_dev, 0);
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
+ (switch_info->representor || switch_info->master)))
+ config.dv_esw_en = 0;
+#else
+ config.dv_esw_en = 0;
+#endif
+ /* Detect minimal data bytes to inline. */
+ mlx5_set_min_inline(spawn, &config);
+ /* Store device configuration on private structure. */
+ priv->config = config;
+ /* Create context for virtual machine VLAN workaround. */
+ priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
+ if (config.dv_flow_en) {
+ err = mlx5_alloc_shared_dr(priv);
+ if (err)
+ goto error;
+ /*
+ * RSS id is shared with meter flow id. Meter flow id can only
+ * use the 24 MSB of the register.
+ */
+ priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
+ MLX5_MTR_COLOR_BITS);
+ if (!priv->qrss_id_pool) {
+ DRV_LOG(ERR, "can't create flow id pool");
+ err = ENOMEM;
+ goto error;
+ }
+ }
+ /* Supported Verbs flow priority number detection. */
+ err = mlx5_flow_discover_priorities(eth_dev);
+ if (err < 0) {
+ err = -err;
+ goto error;
+ }
+ priv->config.flow_prio = err;
+ if (!priv->config.dv_esw_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ DRV_LOG(WARNING, "metadata mode %u is not supported "
+ "(no E-Switch)", priv->config.dv_xmeta_en);
+ priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
+ }
+ mlx5_set_metadata_mask(eth_dev);
+ if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ !priv->sh->dv_regc0_mask) {
+ DRV_LOG(ERR, "metadata mode %u is not supported "
+ "(no metadata reg_c[0] is available)",
+ priv->config.dv_xmeta_en);
+ err = ENOTSUP;
+ goto error;
+ }
+ /*
+ * Allocate the buffer for flow creation, just once.
+ * The allocation must be done before creating any flow.
+ */
+ mlx5_flow_alloc_intermediate(eth_dev);
+ /* Query availability of metadata reg_c's. */
+ err = mlx5_flow_discover_mreg_c(eth_dev);
+ if (err < 0) {
+ err = -err;
+ goto error;
+ }
+ if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
+ DRV_LOG(DEBUG,
+ "port %u extensive metadata register is not supported",
+ eth_dev->data->port_id);
+ if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ DRV_LOG(ERR, "metadata mode %u is not supported "
+ "(no metadata registers available)",
+ priv->config.dv_xmeta_en);
+ err = ENOTSUP;
+ goto error;
+ }
+ }
+ if (priv->config.dv_flow_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ mlx5_flow_ext_mreg_supported(eth_dev) &&
+ priv->sh->dv_regc0_mask) {
+ priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
+ MLX5_FLOW_MREG_HTABLE_SZ);
+ if (!priv->mreg_cp_tbl) {
+ err = ENOMEM;
+ goto error;
+ }
+ }
+ return eth_dev;
+error:
+ if (priv) {
+ if (priv->mreg_cp_tbl)
+ mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
+ if (priv->sh)
+ mlx5_free_shared_dr(priv);
+ if (priv->nl_socket_route >= 0)
+ close(priv->nl_socket_route);
+ if (priv->nl_socket_rdma >= 0)
+ close(priv->nl_socket_rdma);
+ if (priv->vmwa_context)
+ mlx5_vlan_vmwa_exit(priv->vmwa_context);
+ if (priv->qrss_id_pool)
+ mlx5_flow_id_pool_release(priv->qrss_id_pool);
+ if (own_domain_id)
+ claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+ rte_free(priv);
+ if (eth_dev != NULL)
+ eth_dev->data->dev_private = NULL;
+ }
+ if (eth_dev != NULL) {
+ /* mac_addrs must not be freed alone because it is part of dev_private. */
+ eth_dev->data->mac_addrs = NULL;
+ rte_eth_dev_release_port(eth_dev);
+ }
+ if (sh)
+ mlx5_free_shared_ibctx(sh);
+ MLX5_ASSERT(err > 0);
+ rte_errno = err;
+ return NULL;
+}
+
+/**
+ * Comparison callback to sort device data.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ * Pointer to pointer to first data object.
+ * @param b[in]
+ * Pointer to pointer to second data object.
+ *
+ * @return
+ * 0 if both objects are equal, less than 0 if the first argument is less
+ * than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_dev_spawn_data_cmp(const void *a, const void *b)
+{
+ const struct mlx5_switch_info *si_a =
+ &((const struct mlx5_dev_spawn_data *)a)->info;
+ const struct mlx5_switch_info *si_b =
+ &((const struct mlx5_dev_spawn_data *)b)->info;
+ int ret;
+
+ /* Master device first. */
+ ret = si_b->master - si_a->master;
+ if (ret)
+ return ret;
+ /* Then representor devices. */
+ ret = si_b->representor - si_a->representor;
+ if (ret)
+ return ret;
+ /* Unidentified devices come last in no specific order. */
+ if (!si_a->representor)
+ return 0;
+ /* Order representors by name. */
+ return si_a->port_name - si_b->port_name;
+}
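+
+/*
+ * Resulting order after qsort() with this comparator: the master device
+ * first, then representors sorted by ascending port_name, then any
+ * unidentified entries in no specific order.
+ */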
+
+/**
+ * Match PCI information for possible slaves of bonding device.
+ *
+ * @param[in] ibv_dev
+ * Pointer to Infiniband device structure.
+ * @param[in] pci_dev
+ * Pointer to PCI device structure to match PCI address.
+ * @param[in] nl_rdma
+ * Netlink RDMA group socket handle.
+ *
+ * @return
+ * Negative value if no bonding device is found, otherwise the
+ * non-negative index of the slave PF in the bonding.
+ */
+static int
+mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
+ const struct rte_pci_device *pci_dev,
+ int nl_rdma)
+{
+ char ifname[IF_NAMESIZE + 1];
+ unsigned int ifindex;
+ unsigned int np, i;
+ FILE *file = NULL;
+ int pf = -1;
+
+ /*
+ * Try to get the master device name. If something goes
+ * wrong, assume there is no kernel support and no
+ * bonding devices.
+ */
+ if (nl_rdma < 0)
+ return -1;
+ if (!strstr(ibv_dev->name, "bond"))
+ return -1;
+ np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
+ if (!np)
+ return -1;
+ /*
+ * The master device might not be on the predefined
+ * port (port index 1 is not guaranteed); we have to
+ * scan all Infiniband device ports and find the
+ * master.
+ */
+ for (i = 1; i <= np; ++i) {
+ /* Check whether Infiniband port is populated. */
+ ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
+ if (!ifindex)
+ continue;
+ if (!if_indextoname(ifindex, ifname))
+ continue;
+ /* Try to read bonding slave names from sysfs. */
+ MKSTR(slaves,
+ "/sys/class/net/%s/master/bonding/slaves", ifname);
+ file = fopen(slaves, "r");
+ if (file)
+ break;
+ }
+ if (!file)
+ return -1;
+ /* Use safe format to check maximal buffer length. */
+ MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
+ while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
+ char tmp_str[IF_NAMESIZE + 32];
+ struct rte_pci_addr pci_addr;
+ struct mlx5_switch_info info;
+
+ /* Process slave interface names in the loop. */
+ snprintf(tmp_str, sizeof(tmp_str),
+ "/sys/class/net/%s", ifname);
+ if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
+ DRV_LOG(WARNING, "can not get PCI address"
+ " for netdev \"%s\"", ifname);
+ continue;
+ }
+ if (pci_dev->addr.domain != pci_addr.domain ||
+ pci_dev->addr.bus != pci_addr.bus ||
+ pci_dev->addr.devid != pci_addr.devid ||
+ pci_dev->addr.function != pci_addr.function)
+ continue;
+ /* Slave interface PCI address match found. */
+ fclose(file);
+ snprintf(tmp_str, sizeof(tmp_str),
+ "/sys/class/net/%s/phys_port_name", ifname);
+ file = fopen(tmp_str, "rb");
+ if (!file)
+ break;
+ info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
+ if (fscanf(file, "%32s", tmp_str) == 1)
+ mlx5_translate_port_name(tmp_str, &info);
+ if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
+ info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
+ pf = info.port_name;
+ break;
+ }
+ if (file)
+ fclose(file);
+ return pf;
+}
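+
+/*
+ * Illustrative sysfs layout the scan above relies on (paths are taken from
+ * the code, the example names are hypothetical):
+ *   /sys/class/net/<ifname>/master/bonding/slaves - slave netdev names
+ *   /sys/class/net/<slave>/phys_port_name         - e.g. "p0" or "pf0"
+ * The PF index is taken from the phys_port_name of the slave whose PCI
+ * address matches the probed device.
+ */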
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function spawns Ethernet devices out of a given PCI device.
+ *
+ * @param[in] pci_drv
+ * PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ * PCI device information.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ struct ibv_device **ibv_list;
+ /*
+ * Number of found IB devices matching the requested PCI BDF.
+ * nd != 1 means there are multiple IB devices over the same
+ * PCI device and we have representors and master.
+ */
+ unsigned int nd = 0;
+ /*
+ * Number of found IB device ports. nd = 1 and np = 1..n means
+ * we have a single multiport IB device, and there may be
+ * representors attached to some of the found ports.
+ */
+ unsigned int np = 0;
+ /*
+ * Number of DPDK ethernet devices to spawn - either over
+ * multiple IB devices or multiple ports of a single IB device.
+ * Actually this is the number of iterations to spawn.
+ */
+ unsigned int ns = 0;
+ /*
+ * Bonding device
+ * < 0 - no bonding device (single one)
+ * >= 0 - bonding device (value is slave PF index)
+ */
+ int bd = -1;
+ struct mlx5_dev_spawn_data *list = NULL;
+ struct mlx5_dev_config dev_config;
+ int ret;
+
+ if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
+ DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
+ " driver.");
+ return 1;
+ }
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ mlx5_pmd_socket_init();
+ ret = mlx5_init_once();
+ if (ret) {
+ DRV_LOG(ERR, "unable to init PMD global data: %s",
+ strerror(rte_errno));
+ return -rte_errno;
+ }
+ MLX5_ASSERT(pci_drv == &mlx5_driver);
+ errno = 0;
+ ibv_list = mlx5_glue->get_device_list(&ret);
+ if (!ibv_list) {
+ rte_errno = errno ? errno : ENOSYS;
+ DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
+ return -rte_errno;
+ }
+ /*
+ * First scan the list of all Infiniband devices to find
+ * matching ones, gathering into the list.
+ */
+ struct ibv_device *ibv_match[ret + 1];
+ int nl_route = mlx5_nl_init(NETLINK_ROUTE);
+ int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
+ unsigned int i;
+
+ while (ret-- > 0) {
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+ bd = mlx5_device_bond_pci_match
+ (ibv_list[ret], pci_dev, nl_rdma);
+ if (bd >= 0) {
+ /*
+ * Bonding device detected. Only one match is allowed;
+ * bonding is supported over a multi-port IB device and
+ * there should be no matches on representor PCI
+ * functions or non-VF-LAG bonding devices with the
+ * specified address.
+ */
+ if (nd) {
+ DRV_LOG(ERR,
+ "multiple PCI match on bonding device"
+ "\"%s\" found", ibv_list[ret]->name);
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ DRV_LOG(INFO, "PCI information matches for"
+ " slave %d bonding device \"%s\"",
+ bd, ibv_list[ret]->name);
+ ibv_match[nd++] = ibv_list[ret];
+ break;
+ }
+ if (mlx5_dev_to_pci_addr
+ (ibv_list[ret]->ibdev_path, &pci_addr))
+ continue;
+ if (pci_dev->addr.domain != pci_addr.domain ||
+ pci_dev->addr.bus != pci_addr.bus ||
+ pci_dev->addr.devid != pci_addr.devid ||
+ pci_dev->addr.function != pci_addr.function)
+ continue;
+ DRV_LOG(INFO, "PCI information matches for device \"%s\"",
+ ibv_list[ret]->name);
+ ibv_match[nd++] = ibv_list[ret];
+ }
+ ibv_match[nd] = NULL;
+ if (!nd) {
+ /* No device matches, just complain and bail out. */
+ DRV_LOG(WARNING,
+ "no Verbs device matches PCI device " PCI_PRI_FMT ","
+ " are kernel drivers loaded?",
+ pci_dev->addr.domain, pci_dev->addr.bus,
+ pci_dev->addr.devid, pci_dev->addr.function);
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ if (nd == 1) {
+ /*
+ * The single matching device found may have multiple ports.
+ * Each port may be a representor, so we have to check the
+ * port number and the existence of representors.
+ */
+ if (nl_rdma >= 0)
+ np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
+ if (!np)
+ DRV_LOG(WARNING, "can not get IB device \"%s\""
+ " ports number", ibv_match[0]->name);
+ if (bd >= 0 && !np) {
+ DRV_LOG(ERR, "can not get ports"
+ " for bonding device");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ }
+#ifndef HAVE_MLX5DV_DR_DEVX_PORT
+ if (bd >= 0) {
+ /*
+ * This may happen if there is VF LAG kernel support and
+ * the application is compiled with an older rdma-core
+ * library.
+ DRV_LOG(ERR,
+ "No kernel/verbs support for VF LAG bonding found.");
+ rte_errno = ENOTSUP;
+ ret = -rte_errno;
+ goto exit;
+ }
+#endif
+ /*
+ * Now we can determine the maximal
+ * amount of devices to be spawned.
+ */
+ list = rte_zmalloc("device spawn data",
+ sizeof(struct mlx5_dev_spawn_data) *
+ (np ? np : nd),
+ RTE_CACHE_LINE_SIZE);
+ if (!list) {
+ DRV_LOG(ERR, "spawn data array allocation failure");
+ rte_errno = ENOMEM;
+ ret = -rte_errno;
+ goto exit;
+ }
+ if (bd >= 0 || np > 1) {
+ /*
+ * A single IB device with multiple ports was found;
+ * it may be an E-Switch master device with representors.
+ * We have to perform identification through the ports.
+ */
+ MLX5_ASSERT(nl_rdma >= 0);
+ MLX5_ASSERT(ns == 0);
+ MLX5_ASSERT(nd == 1);
+ MLX5_ASSERT(np);
+ for (i = 1; i <= np; ++i) {
+ list[ns].max_port = np;
+ list[ns].ibv_port = i;
+ list[ns].ibv_dev = ibv_match[0];
+ list[ns].eth_dev = NULL;
+ list[ns].pci_dev = pci_dev;
+ list[ns].pf_bond = bd;
+ list[ns].ifindex = mlx5_nl_ifindex
+ (nl_rdma, list[ns].ibv_dev->name, i);
+ if (!list[ns].ifindex) {
+ /*
+ * No network interface index was found for the
+ * specified port; it means there is no
+ * representor on this port. That is OK, as
+ * there can be disabled ports, for example
+ * if sriov_numvfs < sriov_totalvfs.
+ */
+ continue;
+ }
+ ret = -1;
+ if (nl_route >= 0)
+ ret = mlx5_nl_switch_info
+ (nl_route,
+ list[ns].ifindex,
+ &list[ns].info);
+ if (ret || (!list[ns].info.representor &&
+ !list[ns].info.master)) {
+ /*
+ * We failed to recognize representors with
+ * Netlink, let's try to perform the task
+ * with sysfs.
+ */
+ ret = mlx5_sysfs_switch_info
+ (list[ns].ifindex,
+ &list[ns].info);
+ }
+ if (!ret && bd >= 0) {
+ switch (list[ns].info.name_type) {
+ case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+ if (list[ns].info.port_name == bd)
+ ns++;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+ if (list[ns].info.pf_num == bd)
+ ns++;
+ break;
+ default:
+ break;
+ }
+ continue;
+ }
+ if (!ret && (list[ns].info.representor ^
+ list[ns].info.master))
+ ns++;
+ }
+ if (!ns) {
+ DRV_LOG(ERR,
+ "unable to recognize master/representors"
+ " on the IB device with multiple ports");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ } else {
+ /*
+ * The existence of several matching entries (nd > 1) means
+ * port representors have been instantiated. No existing Verbs
+ * call nor sysfs entries can tell them apart, this can only
+ * be done through Netlink calls assuming kernel drivers are
+ * recent enough to support them.
+ *
+ * In the event of identification failure through Netlink,
+ * try again through sysfs, then:
+ *
+ * 1. A single IB device matches (nd == 1) with single
+ * port (np=0/1) and is not a representor, assume
+ * no switch support.
+ *
+ * 2. Otherwise no safe assumptions can be made;
+ * complain louder and bail out.
+ */
+ np = 1;
+ for (i = 0; i != nd; ++i) {
+ memset(&list[ns].info, 0, sizeof(list[ns].info));
+ list[ns].max_port = 1;
+ list[ns].ibv_port = 1;
+ list[ns].ibv_dev = ibv_match[i];
+ list[ns].eth_dev = NULL;
+ list[ns].pci_dev = pci_dev;
+ list[ns].pf_bond = -1;
+ list[ns].ifindex = 0;
+ if (nl_rdma >= 0)
+ list[ns].ifindex = mlx5_nl_ifindex
+ (nl_rdma, list[ns].ibv_dev->name, 1);
+ if (!list[ns].ifindex) {
+ char ifname[IF_NAMESIZE];
+
+ /*
+ * Netlink failed; this may happen with an old
+ * ib_core kernel driver (before 4.16). We can
+ * assume an old driver here because we are
+ * processing single-port IB devices. Let's try
+ * sysfs to retrieve the ifindex. The method
+ * works for the master device only.
+ */
+ if (nd > 1) {
+ /*
+ * Multiple devices found; assume
+ * representors. We cannot distinguish
+ * master/representor nor retrieve the
+ * ifindex via sysfs.
+ */
+ continue;
+ }
+ ret = mlx5_get_master_ifname
+ (ibv_match[i]->ibdev_path, &ifname);
+ if (!ret)
+ list[ns].ifindex =
+ if_nametoindex(ifname);
+ if (!list[ns].ifindex) {
+ /*
+ * No network interface index was found
+ * for the specified device; it means
+ * the device is neither a representor
+ * nor a master.
+ */
+ continue;
+ }
+ }
+ ret = -1;
+ if (nl_route >= 0)
+ ret = mlx5_nl_switch_info
+ (nl_route,
+ list[ns].ifindex,
+ &list[ns].info);
+ if (ret || (!list[ns].info.representor &&
+ !list[ns].info.master)) {
+ /*
+ * We failed to recognize representors with
+ * Netlink, let's try to perform the task
+ * with sysfs.
+ */
+ ret = mlx5_sysfs_switch_info
+ (list[ns].ifindex,
+ &list[ns].info);
+ }
+ if (!ret && (list[ns].info.representor ^
+ list[ns].info.master)) {
+ ns++;
+ } else if ((nd == 1) &&
+ !list[ns].info.representor &&
+ !list[ns].info.master) {
+ /*
+ * A single IB device with
+ * one physical port and an
+ * attached network device.
+ * Maybe SR-IOV is not enabled,
+ * or there are no representors.
+ */
+ DRV_LOG(INFO, "no E-Switch support detected");
+ ns++;
+ break;
+ }
+ }
+ if (!ns) {
+ DRV_LOG(ERR,
+ "unable to recognize master/representors"
+ " on the multiple IB devices");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ }
+ MLX5_ASSERT(ns);
+ /*
+ * Sort the list to probe devices in natural order for users' convenience
+ * (i.e. master first, then representors from lowest to highest ID).
+ */
+ qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
+ /* Default configuration. */
+ dev_config = (struct mlx5_dev_config){
+ .hw_padding = 0,
+ .mps = MLX5_ARG_UNSET,
+ .dbnc = MLX5_ARG_UNSET,
+ .rx_vec_en = 1,
+ .txq_inline_max = MLX5_ARG_UNSET,
+ .txq_inline_min = MLX5_ARG_UNSET,
+ .txq_inline_mpw = MLX5_ARG_UNSET,
+ .txqs_inline = MLX5_ARG_UNSET,
+ .vf_nl_en = 1,
+ .mr_ext_memseg_en = 1,
+ .mprq = {
+ .enabled = 0, /* Disabled by default. */
+ .stride_num_n = 0,
+ .stride_size_n = 0,
+ .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
+ .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
+ },
+ .dv_esw_en = 1,
+ .dv_flow_en = 1,
+ .log_hp_size = MLX5_ARG_UNSET,
+ };
+ /* Device specific configuration. */
+ switch (pci_dev->id.device_id) {
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
+ dev_config.vf = 1;
+ break;
+ default:
+ break;
+ }
+ for (i = 0; i != ns; ++i) {
+ uint32_t restore;
+
+ list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
+ &list[i],
+ dev_config);
+ if (!list[i].eth_dev) {
+ if (rte_errno != EBUSY && rte_errno != EEXIST)
+ break;
+ /* Device is disabled or already spawned. Ignore it. */
+ continue;
+ }
+ restore = list[i].eth_dev->data->dev_flags;
+ rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
+ /* Restore non-PCI flags cleared by the above call. */
+ list[i].eth_dev->data->dev_flags |= restore;
+ mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
+ rte_eth_dev_probing_finish(list[i].eth_dev);
+ }
+ if (i != ns) {
+ DRV_LOG(ERR,
+ "probe of PCI device " PCI_PRI_FMT " aborted after"
+ " encountering an error: %s",
+ pci_dev->addr.domain, pci_dev->addr.bus,
+ pci_dev->addr.devid, pci_dev->addr.function,
+ strerror(rte_errno));
+ ret = -rte_errno;
+ /* Roll back. */
+ while (i--) {
+ if (!list[i].eth_dev)
+ continue;
+ mlx5_dev_close(list[i].eth_dev);
+ /* mac_addrs must not be freed because it is part of dev_private. */
+ list[i].eth_dev->data->mac_addrs = NULL;
+ claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
+ }
+ /* Restore original error. */
+ rte_errno = -ret;
+ } else {
+ ret = 0;
+ }
+exit:
+ /*
+ * Do the routine cleanup:
+ * - close opened Netlink sockets
+ * - free allocated spawn data array
+ * - free the Infiniband device list
+ */
+ if (nl_rdma >= 0)
+ close(nl_rdma);
+ if (nl_route >= 0)
+ close(nl_route);
+ if (list)
+ rte_free(list);
+ MLX5_ASSERT(ibv_list);
+ mlx5_glue->free_device_list(ibv_list);
+ return ret;
+}
+
+/**
+ * Look for the ethernet device belonging to mlx5 driver.
+ *
+ * @param[in] port_id
+ * port_id to start looking for device.
+ * @param[in] pci_dev
+ * Pointer to the hint PCI device. While a device is being probed,
+ * its siblings (master and preceding representors) might not have
+ * an assigned driver yet (because mlx5_pci_probe() has not completed
+ * yet); in this case a match on the hint PCI device may be used to
+ * detect a sibling device.
+ *
+ * @return
+ * port_id of the found device, RTE_MAX_ETHPORTS if not found.
+ */
+uint16_t
+mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
+{
+ while (port_id < RTE_MAX_ETHPORTS) {
+ struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+ if (dev->state != RTE_ETH_DEV_UNUSED &&
+ dev->device &&
+ (dev->device == &pci_dev->device ||
+ (dev->device->driver &&
+ dev->device->driver->name &&
+ !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
+ break;
+ port_id++;
+ }
+ if (port_id >= RTE_MAX_ETHPORTS)
+ return RTE_MAX_ETHPORTS;
+ return port_id;
+}
+
+/**
+ * DPDK callback to remove a PCI device.
+ *
+ * This function removes all Ethernet devices belonging to a given PCI device.
+ *
+ * @param[in] pci_dev
+ * Pointer to the PCI device.
+ *
+ * @return
+ * 0 on success, the function cannot fail.
+ */
+static int
+mlx5_pci_remove(struct rte_pci_device *pci_dev)
+{
+ uint16_t port_id;
+
+ RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
+ rte_eth_dev_close(port_id);
+ return 0;
+}
+
+static const struct rte_pci_id mlx5_pci_id_map[] = {
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
+ },
+ {
+ .vendor_id = 0
+ }
+};
+
+static struct rte_pci_driver mlx5_driver = {
+ .driver = {
+ .name = MLX5_DRIVER_NAME
+ },
+ .id_table = mlx5_pci_id_map,
+ .probe = mlx5_pci_probe,
+ .remove = mlx5_pci_remove,
+ .dma_map = mlx5_dma_map,
+ .dma_unmap = mlx5_dma_unmap,
+ .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
+ RTE_PCI_DRV_PROBE_AGAIN,
+};
+
+/**
+ * Driver initialization routine.
+ */
+RTE_INIT(rte_mlx5_pmd_init)
+{
+ /* Initialize driver log type. */
+ mlx5_logtype = rte_log_register("pmd.net.mlx5");
+ if (mlx5_logtype >= 0)
+ rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
+
+ /* Build the static tables for Verbs conversion. */
+ mlx5_set_ptype_table();
+ mlx5_set_cksum_table();
+ mlx5_set_swp_types_table();
+ if (mlx5_glue)
+ rte_pci_register(&mlx5_driver);
+}
+
+RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h
new file mode 100644
index 000000000..d9f5d816f
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h
@@ -0,0 +1,848 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_H_
+#define RTE_PMD_MLX5_H_
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <limits.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/queue.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_pci.h>
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_rwlock.h>
+#include <rte_interrupts.h>
+#include <rte_errno.h>
+#include <rte_flow.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+#include <mlx5_nl.h>
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+
+#include "mlx5_defs.h"
+#include "mlx5_utils.h"
+#include "mlx5_autoconf.h"
+
+
+enum mlx5_ipool_index {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ MLX5_IPOOL_DECAP_ENCAP = 0, /* Pool for encap/decap resource. */
+ MLX5_IPOOL_PUSH_VLAN, /* Pool for push vlan resource. */
+ MLX5_IPOOL_TAG, /* Pool for tag resource. */
+ MLX5_IPOOL_PORT_ID, /* Pool for port id resource. */
+ MLX5_IPOOL_JUMP, /* Pool for jump resource. */
+#endif
+ MLX5_IPOOL_MTR, /* Pool for meter resource. */
+ MLX5_IPOOL_MCP, /* Pool for metadata resource. */
+ MLX5_IPOOL_HRXQ, /* Pool for hrxq resource. */
+ MLX5_IPOOL_MLX5_FLOW, /* Pool for mlx5 flow handle. */
+ MLX5_IPOOL_RTE_FLOW, /* Pool for rte_flow. */
+ MLX5_IPOOL_MAX,
+};
+
+/** Key string for IPC. */
+#define MLX5_MP_NAME "net_mlx5_mp"
+
+
+LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared);
+
+/* Shared data between primary and secondary processes. */
+struct mlx5_shared_data {
+ rte_spinlock_t lock;
+ /* Global spinlock for primary and secondary processes. */
+ int init_done; /* Whether primary has done initialization. */
+ unsigned int secondary_cnt; /* Number of secondary processes init'd. */
+ struct mlx5_dev_list mem_event_cb_list;
+ rte_rwlock_t mem_event_rwlock;
+};
+
+/* Per-process data structure, not visible to other processes. */
+struct mlx5_local_data {
+ int init_done; /* Whether a secondary has done initialization. */
+};
+
+extern struct mlx5_shared_data *mlx5_shared_data;
+
+struct mlx5_counter_ctrl {
+ /* Name of the counter. */
+ char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE];
+ /* Name of the counter on the device table. */
+ char ctr_name[RTE_ETH_XSTATS_NAME_SIZE];
+ uint32_t ib:1; /**< Nonzero for IB counters. */
+};
+
+struct mlx5_xstats_ctrl {
+ /* Number of device stats. */
+ uint16_t stats_n;
+ /* Number of device stats identified by PMD. */
+ uint16_t mlx5_stats_n;
+ /* Index in the device counters table. */
+ uint16_t dev_table_idx[MLX5_MAX_XSTATS];
+ uint64_t base[MLX5_MAX_XSTATS];
+ uint64_t xstats[MLX5_MAX_XSTATS];
+ uint64_t hw_stats[MLX5_MAX_XSTATS];
+ struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS];
+};
+
+struct mlx5_stats_ctrl {
+ /* Base for imissed counter. */
+ uint64_t imissed_base;
+ uint64_t imissed;
+};
+
+/* Default PMD specific parameter value. */
+#define MLX5_ARG_UNSET (-1)
+
+#define MLX5_LRO_SUPPORTED(dev) \
+ (((struct mlx5_priv *)((dev)->data->dev_private))->config.lro.supported)
+
+/* The maximal size of a coalesced LRO segment is set in chunks of 256 bytes. */
+#define MLX5_LRO_SEG_CHUNK_SIZE 256u
+
+/* Maximal size of aggregated LRO packet. */
+#define MLX5_MAX_LRO_SIZE (UINT8_MAX * MLX5_LRO_SEG_CHUNK_SIZE)
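As a quick worked check of the two macros above (UINT8_MAX being 255), the
maximal aggregated LRO packet size comes out to:

    MLX5_MAX_LRO_SIZE = UINT8_MAX * MLX5_LRO_SEG_CHUNK_SIZE
                      = 255 * 256 = 65280 bytes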
+
+/* LRO configurations structure. */
+struct mlx5_lro_config {
+ uint32_t supported:1; /* Whether LRO is supported. */
+ uint32_t timeout; /* User configuration. */
+};
+
+/*
+ * Device configuration structure.
+ *
+ * Merged configuration from:
+ *
+ * - Device capabilities,
+ * - User device parameters (which may disable some features).
+ */
+struct mlx5_dev_config {
+ unsigned int hw_csum:1; /* Checksum offload is supported. */
+ unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */
+ unsigned int hw_vlan_insert:1; /* VLAN insertion in WQE is supported. */
+ unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
+ unsigned int hw_padding:1; /* End alignment padding is supported. */
+ unsigned int vf:1; /* This is a VF. */
+ unsigned int tunnel_en:1;
+ /* Whether tunnel stateless offloads are supported. */
+ unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
+ unsigned int cqe_comp:1; /* CQE compression is enabled. */
+ unsigned int cqe_pad:1; /* CQE padding is enabled. */
+ unsigned int tso:1; /* Whether TSO is supported. */
+ unsigned int rx_vec_en:1; /* Rx vector is enabled. */
+ unsigned int mr_ext_memseg_en:1;
+ /* Whether memseg should be extended for MR creation. */
+ unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
+ unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
+ unsigned int dv_esw_en:1; /* Enable E-Switch DV flow. */
+ unsigned int dv_flow_en:1; /* Enable DV flow. */
+ unsigned int dv_xmeta_en:2; /* Enable extensive flow metadata. */
+ unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
+ unsigned int devx:1; /* Whether devx interface is available or not. */
+ unsigned int dest_tir:1; /* Whether advanced DR API is available. */
+ struct {
+ unsigned int enabled:1; /* Whether MPRQ is enabled. */
+ unsigned int stride_num_n; /* Number of strides. */
+ unsigned int stride_size_n; /* Size of a stride. */
+ unsigned int min_stride_size_n; /* Min size of a stride. */
+ unsigned int max_stride_size_n; /* Max size of a stride. */
+ unsigned int max_memcpy_len;
+ /* Maximum packet size to memcpy Rx packets. */
+ unsigned int min_rxqs_num;
+ /* Rx queue count threshold to enable MPRQ. */
+ } mprq; /* Configurations for Multi-Packet RQ. */
+ int mps; /* Multi-packet send supported mode. */
+ int dbnc; /* Skip doorbell register write barrier. */
+ unsigned int flow_prio; /* Number of flow priorities. */
+ enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM];
+	/* Availability of mreg_c's. */
+ unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
+ unsigned int ind_table_max_size; /* Maximum indirection table size. */
+ unsigned int max_dump_files_num; /* Maximum dump files per queue. */
+ unsigned int log_hp_size; /* Single hairpin queue data size in total. */
+ int txqs_inline; /* Queue number threshold for inlining. */
+ int txq_inline_min; /* Minimal amount of data bytes to inline. */
+ int txq_inline_max; /* Max packet size for inlining with SEND. */
+ int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
+ struct mlx5_hca_attr hca_attr; /* HCA attributes. */
+ struct mlx5_lro_config lro; /* LRO configuration. */
+};
+
+
+/**
+ * Type of object being allocated.
+ */
+enum mlx5_verbs_alloc_type {
+ MLX5_VERBS_ALLOC_TYPE_NONE,
+ MLX5_VERBS_ALLOC_TYPE_TX_QUEUE,
+ MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
+};
+
+/* Structure for VF VLAN workaround. */
+struct mlx5_vf_vlan {
+ uint32_t tag:12;
+ uint32_t created:1;
+};
+
+/**
+ * Verbs allocator needs a context to know in the callback which kind of
+ * resources it is allocating.
+ */
+struct mlx5_verbs_alloc_ctx {
+ enum mlx5_verbs_alloc_type type; /* Kind of object being allocated. */
+ const void *obj; /* Pointer to the DPDK object. */
+};
+
+/* Flow drop context necessary due to Verbs API. */
+struct mlx5_drop {
+	struct mlx5_hrxq *hrxq; /* Hash Rx queue. */
+ struct mlx5_rxq_obj *rxq; /* Rx queue object. */
+};
+
+#define MLX5_COUNTERS_PER_POOL 512
+#define MLX5_MAX_PENDING_QUERIES 4
+#define MLX5_CNT_CONTAINER_RESIZE 64
+#define MLX5_CNT_AGE_OFFSET 0x80000000
+#define CNT_SIZE (sizeof(struct mlx5_flow_counter))
+#define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
+#define AGE_SIZE (sizeof(struct mlx5_age_param))
+#define MLX5_AGING_TIME_DELAY 7
+#define CNT_POOL_TYPE_EXT (1 << 0)
+#define CNT_POOL_TYPE_AGE (1 << 1)
+#define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
+#define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
+#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0)
+#define MLX5_CNT_LEN(pool) \
+ (CNT_SIZE + \
+ (IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
+ (IS_EXT_POOL(pool) ? CNTEXT_SIZE : 0))
+#define MLX5_POOL_GET_CNT(pool, index) \
+ ((struct mlx5_flow_counter *) \
+ ((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool))))
+#define MLX5_CNT_ARRAY_IDX(pool, cnt) \
+ ((int)(((uint8_t *)(cnt) - (uint8_t *)((pool) + 1)) / \
+ MLX5_CNT_LEN(pool)))
+/*
+ * The pool index and the counter offset within the pool array make up
+ * the counter index. The index is incremented by 1 so that a counter
+ * from pool 0 at offset 0 does not yield index 0, since 0 currently
+ * means an invalid counter index.
+ */
+#define MLX5_MAKE_CNT_IDX(pi, offset) \
+ ((pi) * MLX5_COUNTERS_PER_POOL + (offset) + 1)
+#define MLX5_CNT_TO_CNT_EXT(pool, cnt) \
+ ((struct mlx5_flow_counter_ext *)\
+ ((uint8_t *)((cnt) + 1) + \
+ (IS_AGE_POOL(pool) ? AGE_SIZE : 0)))
+#define MLX5_GET_POOL_CNT_EXT(pool, offset) \
+ MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset)))
+#define MLX5_CNT_TO_AGE(cnt) \
+ ((struct mlx5_age_param *)((cnt) + 1))
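The index/layout macros above can be hard to read at a glance. The sketch below
is illustrative only (the helper name cnt_idx_decode is hypothetical, not part
of the driver) and shows the arithmetic implied by MLX5_MAKE_CNT_IDX() with
MLX5_COUNTERS_PER_POOL == 512 as defined above:

/* Illustrative sketch: invert MLX5_MAKE_CNT_IDX(). */
static inline void
cnt_idx_decode(uint32_t idx, uint32_t *pool_idx, uint32_t *offset)
{
	/* Index 0 is reserved as "invalid", so valid indexes start at 1. */
	idx -= 1;
	*pool_idx = idx / MLX5_COUNTERS_PER_POOL;
	*offset = idx % MLX5_COUNTERS_PER_POOL;
}
/* Example: MLX5_MAKE_CNT_IDX(2, 7) == 2 * 512 + 7 + 1 == 1032. */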
+
+struct mlx5_flow_counter_pool;
+
+/* Age status. */
+enum {
+ AGE_FREE, /* Initialized state. */
+ AGE_CANDIDATE, /* Counter assigned to flows. */
+ AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
+};
+
+#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \
+ [(batch) * 2 + (age)])
+
+enum {
+ MLX5_CCONT_TYPE_SINGLE,
+ MLX5_CCONT_TYPE_SINGLE_FOR_AGE,
+ MLX5_CCONT_TYPE_BATCH,
+ MLX5_CCONT_TYPE_BATCH_FOR_AGE,
+ MLX5_CCONT_TYPE_MAX,
+};
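For reference, MLX5_CNT_CONTAINER() defined above selects a container as
ccont[batch * 2 + age], which lines up with this enum as follows:

    batch = 0, age = 0  ->  ccont[0]  (MLX5_CCONT_TYPE_SINGLE)
    batch = 0, age = 1  ->  ccont[1]  (MLX5_CCONT_TYPE_SINGLE_FOR_AGE)
    batch = 1, age = 0  ->  ccont[2]  (MLX5_CCONT_TYPE_BATCH)
    batch = 1, age = 1  ->  ccont[3]  (MLX5_CCONT_TYPE_BATCH_FOR_AGE)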
+
+/* Counter age parameter. */
+struct mlx5_age_param {
+ rte_atomic16_t state; /**< Age state. */
+ uint16_t port_id; /**< Port id of the counter. */
+	uint32_t timeout:15; /**< Age timeout in units of 0.1 sec. */
+	uint32_t expire:16; /**< Expiration time (in 0.1 sec units) in the future. */
+ void *context; /**< Flow counter age context. */
+};
+
+struct flow_counter_stats {
+ uint64_t hits;
+ uint64_t bytes;
+};
+
+/* Generic counters information. */
+struct mlx5_flow_counter {
+ TAILQ_ENTRY(mlx5_flow_counter) next;
+ /**< Pointer to the next flow counter structure. */
+ union {
+		uint64_t hits; /**< Reset value of hit packets. */
+ int64_t query_gen; /**< Generation of the last release. */
+ };
+ uint64_t bytes; /**< Reset value of bytes. */
+ void *action; /**< Pointer to the dv action. */
+};
+
+/* Extended counter information for non-batch counters. */
+struct mlx5_flow_counter_ext {
+ uint32_t shared:1; /**< Share counter ID with other flow rules. */
+ uint32_t batch: 1;
+ /**< Whether the counter was allocated by batch command. */
+ uint32_t ref_cnt:30; /**< Reference counter. */
+ uint32_t id; /**< User counter ID. */
+ union { /**< Holds the counters for the rule. */
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+ struct ibv_counter_set *cs;
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ struct ibv_counters *cs;
+#endif
+ struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
+ };
+};
+
+TAILQ_HEAD(mlx5_counters, mlx5_flow_counter);
+
+/* Generic counter pool structure - query is in pool resolution. */
+struct mlx5_flow_counter_pool {
+ TAILQ_ENTRY(mlx5_flow_counter_pool) next;
+ struct mlx5_counters counters; /* Free counter list. */
+ union {
+ struct mlx5_devx_obj *min_dcs;
+ rte_atomic64_t a64_dcs;
+ };
+ /* The devx object of the minimum counter ID. */
+ rte_atomic64_t start_query_gen; /* Query start round. */
+ rte_atomic64_t end_query_gen; /* Query end round. */
+ uint32_t index; /* Pool index in container. */
+ uint8_t type; /* Memory type behind the counter array. */
+ rte_spinlock_t sl; /* The pool lock. */
+ struct mlx5_counter_stats_raw *raw;
+	struct mlx5_counter_stats_raw *raw_hw; /* The raw currently being updated by the HW query. */
+};
+
+struct mlx5_counter_stats_raw;
+
+/* Memory management structure for group of counter statistics raws. */
+struct mlx5_counter_stats_mem_mng {
+ LIST_ENTRY(mlx5_counter_stats_mem_mng) next;
+ struct mlx5_counter_stats_raw *raws;
+ struct mlx5_devx_obj *dm;
+ struct mlx5dv_devx_umem *umem;
+};
+
+/* Raw memory structure for the counter statistics values of a pool. */
+struct mlx5_counter_stats_raw {
+ LIST_ENTRY(mlx5_counter_stats_raw) next;
+ int min_dcs_id;
+ struct mlx5_counter_stats_mem_mng *mem_mng;
+ volatile struct flow_counter_stats *data;
+};
+
+TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool);
+
+/* Container structure for counter pools. */
+struct mlx5_pools_container {
+ rte_atomic16_t n_valid; /* Number of valid pools. */
+ uint16_t n; /* Number of pools. */
+ rte_spinlock_t resize_sl; /* The resize lock. */
+ struct mlx5_counter_pools pool_list; /* Counter pool list. */
+ struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
+ struct mlx5_counter_stats_mem_mng *mem_mng;
+	/* Holds the memory management for the raws of subsequently allocated pools. */
+};
+
+/* Counter global management structure. */
+struct mlx5_flow_counter_mng {
+ struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX];
+ struct mlx5_counters flow_counters; /* Legacy flow counter list. */
+ uint8_t pending_queries;
+ uint8_t batch;
+ uint16_t pool_index;
+ uint8_t age;
+ uint8_t query_thread_on;
+ LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
+ LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
+};
+
+#define MLX5_AGE_EVENT_NEW 1
+#define MLX5_AGE_TRIGGER 2
+#define MLX5_AGE_SET(age_info, BIT) \
+ ((age_info)->flags |= (1 << (BIT)))
+#define MLX5_AGE_GET(age_info, BIT) \
+ ((age_info)->flags & (1 << (BIT)))
+#define GET_PORT_AGE_INFO(priv) \
+ (&((priv)->sh->port[(priv)->ibv_port - 1].age_info))
+
+/* Per-port aging information. */
+struct mlx5_age_info {
+	uint8_t flags; /* Indicates a new event or a pending trigger. */
+ struct mlx5_counters aged_counters; /* Aged flow counter list. */
+ rte_spinlock_t aged_sl; /* Aged flow counter list lock. */
+};
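A minimal sketch of how the MLX5_AGE_SET()/MLX5_AGE_GET() helpers above operate
on this structure; the two functions below are hypothetical and only illustrate
the bit arithmetic, they are not part of the driver.

/* Hypothetical helpers illustrating the flag bit arithmetic. */
static inline void
age_info_mark_new(struct mlx5_age_info *age_info)
{
	MLX5_AGE_SET(age_info, MLX5_AGE_EVENT_NEW); /* flags |= (1 << 1) */
}

static inline bool
age_info_has_new(const struct mlx5_age_info *age_info)
{
	return MLX5_AGE_GET(age_info, MLX5_AGE_EVENT_NEW) != 0; /* flags & (1 << 1) */
}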
+
+/* Per port data of shared IB device. */
+struct mlx5_ibv_shared_port {
+ uint32_t ih_port_id;
+ uint32_t devx_ih_port_id;
+ /*
+ * Interrupt handler port_id. Used by shared interrupt
+ * handler to find the corresponding rte_eth device
+	 * by IB port index. If the value is equal to or greater than
+	 * RTE_MAX_ETHPORTS, no subhandler is installed for the
+	 * specified IB port index.
+ */
+ struct mlx5_age_info age_info;
+	/* Per-port aging information. */
+};
+
+/* Table key of the hash organization. */
+union mlx5_flow_tbl_key {
+ struct {
+ /* Table ID should be at the lowest address. */
+ uint32_t table_id; /**< ID of the table. */
+ uint16_t reserved; /**< must be zero for comparison. */
+ uint8_t domain; /**< 1 - FDB, 0 - NIC TX/RX. */
+ uint8_t direction; /**< 1 - egress, 0 - ingress. */
+ };
+	uint64_t v64; /**< Full 64-bit value of the key. */
+};
+
+/* Table structure. */
+struct mlx5_flow_tbl_resource {
+ void *obj; /**< Pointer to DR table object. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+};
+
+#define MLX5_MAX_TABLES UINT16_MAX
+#define MLX5_FLOW_TABLE_LEVEL_METER (UINT16_MAX - 3)
+#define MLX5_FLOW_TABLE_LEVEL_SUFFIX (UINT16_MAX - 2)
+#define MLX5_HAIRPIN_TX_TABLE (UINT16_MAX - 1)
+/* Reserve the last two tables for metadata register copy. */
+#define MLX5_FLOW_MREG_ACT_TABLE_GROUP (MLX5_MAX_TABLES - 1)
+#define MLX5_FLOW_MREG_CP_TABLE_GROUP (MLX5_MAX_TABLES - 2)
+/* Tables for metering splits should be added here. */
+#define MLX5_MAX_TABLES_EXTERNAL (MLX5_MAX_TABLES - 3)
+#define MLX5_MAX_TABLES_FDB UINT16_MAX
+
+#define MLX5_DBR_PAGE_SIZE 4096 /* Must be >= 512. */
+#define MLX5_DBR_SIZE 8
+#define MLX5_DBR_PER_PAGE (MLX5_DBR_PAGE_SIZE / MLX5_DBR_SIZE)
+#define MLX5_DBR_BITMAP_SIZE (MLX5_DBR_PER_PAGE / 64)
+
+struct mlx5_devx_dbr_page {
+ /* Door-bell records, must be first member in structure. */
+ uint8_t dbrs[MLX5_DBR_PAGE_SIZE];
+ LIST_ENTRY(mlx5_devx_dbr_page) next; /* Pointer to the next element. */
+ struct mlx5dv_devx_umem *umem;
+ uint32_t dbr_count; /* Number of door-bell records in use. */
+	/* Each set bit marks the matching door-bell record as in use. */
+ uint64_t dbr_bitmap[MLX5_DBR_BITMAP_SIZE];
+};
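The bitmap above tracks the MLX5_DBR_PER_PAGE (4096 / 8 = 512) door-bell
records of one page, using MLX5_DBR_BITMAP_SIZE (512 / 64 = 8) 64-bit words.
The sketch below is purely illustrative (the helper name dbr_slot_offset is
hypothetical) and shows the slot arithmetic:

/* Hypothetical helper: byte offset of door-bell record 'slot' within a page. */
static inline uint32_t
dbr_slot_offset(uint32_t slot)
{
	/* slot is in [0, MLX5_DBR_PER_PAGE); each record is 8 bytes. */
	return slot * MLX5_DBR_SIZE;
}
/* The matching bitmap position is word slot / 64, bit slot % 64. */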
+
+/* ID generation structure. */
+struct mlx5_flow_id_pool {
+	uint32_t *free_arr; /**< Pointer to the array of free values. */
+ uint32_t base_index;
+ /**< The next index that can be used without any free elements. */
+ uint32_t *curr; /**< Pointer to the index to pop. */
+	uint32_t *last; /**< Pointer to the last element in the empty array. */
+	uint32_t max_id; /**< Maximum ID that can be allocated from the pool. */
+};
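A minimal sketch of how such an ID pool is typically consumed: recycled values
are popped from free_arr when available, otherwise base_index is handed out and
advanced. The function below is hypothetical and only illustrates the intent of
the fields; the actual allocator lives in mlx5.c.

/* Hypothetical sketch, not the driver implementation. */
static inline int
flow_id_pool_get_sketch(struct mlx5_flow_id_pool *pool, uint32_t *id)
{
	if (pool->curr == pool->free_arr) {
		/* No recycled IDs left; allocate a fresh one. */
		if (pool->base_index == pool->max_id)
			return -1; /* Pool exhausted. */
		*id = pool->base_index++;
		return 0;
	}
	*id = *(--pool->curr); /* Pop the most recently released ID. */
	return 0;
}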
+
+/*
+ * Shared InfiniBand device context for master/representor ports
+ * which belong to the same IB device with multiple IB ports.
+ */
+struct mlx5_ibv_shared {
+ LIST_ENTRY(mlx5_ibv_shared) next;
+ uint32_t refcnt;
+ uint32_t devx:1; /* Opened with DV. */
+ uint32_t max_port; /* Maximal IB device port index. */
+ struct ibv_context *ctx; /* Verbs/DV context. */
+ struct ibv_pd *pd; /* Protection Domain. */
+ uint32_t pdn; /* Protection Domain number. */
+ uint32_t tdn; /* Transport Domain number. */
+ char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
+	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary processes. */
+ struct ibv_device_attr_ex device_attr; /* Device properties. */
+ LIST_ENTRY(mlx5_ibv_shared) mem_event_cb;
+ /**< Called by memory event callback. */
+ struct mlx5_mr_share_cache share_cache;
+ /* Shared DV/DR flow data section. */
+ pthread_mutex_t dv_mutex; /* DV context mutex. */
+ uint32_t dv_meta_mask; /* flow META metadata supported mask. */
+ uint32_t dv_mark_mask; /* flow MARK metadata supported mask. */
+	uint32_t dv_regc0_mask; /* Available bits of metadata reg_c[0]. */
+ uint32_t dv_refcnt; /* DV/DR data reference counter. */
+ void *fdb_domain; /* FDB Direct Rules name space handle. */
+ void *rx_domain; /* RX Direct Rules name space handle. */
+ void *tx_domain; /* TX Direct Rules name space handle. */
+ struct mlx5_hlist *flow_tbls;
+ /* Direct Rules tables for FDB, NIC TX+RX */
+ void *esw_drop_action; /* Pointer to DR E-Switch drop action. */
+ void *pop_vlan_action; /* Pointer to DR pop VLAN action. */
+ uint32_t encaps_decaps; /* Encap/decap action indexed memory list. */
+ LIST_HEAD(modify_cmd, mlx5_flow_dv_modify_hdr_resource) modify_cmds;
+ struct mlx5_hlist *tag_table;
+ uint32_t port_id_action_list; /* List of port ID actions. */
+ uint32_t push_vlan_action_list; /* List of push VLAN actions. */
+ struct mlx5_flow_counter_mng cmng; /* Counters management structure. */
+ struct mlx5_indexed_pool *ipool[MLX5_IPOOL_MAX];
+ /* Memory Pool for mlx5 flow resources. */
+ /* Shared interrupt handler section. */
+ pthread_mutex_t intr_mutex; /* Interrupt config mutex. */
+ uint32_t intr_cnt; /* Interrupt handler reference counter. */
+ struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
+ uint32_t devx_intr_cnt; /* Devx interrupt handler reference counter. */
+ struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
+ struct mlx5dv_devx_cmd_comp *devx_comp; /* DEVX async comp obj. */
+ struct mlx5_devx_obj *tis; /* TIS object. */
+ struct mlx5_devx_obj *td; /* Transport domain. */
+ struct mlx5_flow_id_pool *flow_id_pool; /* Flow ID pool. */
+ struct mlx5_ibv_shared_port port[]; /* per device port data array. */
+};
+
+/* Per-process private structure. */
+struct mlx5_proc_priv {
+ size_t uar_table_sz;
+ /* Size of UAR register table. */
+ void *uar_table[];
+ /* Table of UAR registers for each process. */
+};
+
+/* MTR profile list. */
+TAILQ_HEAD(mlx5_mtr_profiles, mlx5_flow_meter_profile);
+/* MTR list. */
+TAILQ_HEAD(mlx5_flow_meters, mlx5_flow_meter);
+
+#define MLX5_PROC_PRIV(port_id) \
+ ((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
+
+struct mlx5_priv {
+ struct rte_eth_dev_data *dev_data; /* Pointer to device data. */
+ struct mlx5_ibv_shared *sh; /* Shared IB device context. */
+ uint32_t ibv_port; /* IB device port number. */
+ struct rte_pci_device *pci_dev; /* Backend PCI device. */
+ struct rte_ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
+ BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES);
+ /* Bit-field of MAC addresses owned by the PMD. */
+ uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */
+ unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
+ /* Device properties. */
+ uint16_t mtu; /* Configured MTU. */
+ unsigned int isolated:1; /* Whether isolated mode is enabled. */
+ unsigned int representor:1; /* Device is a port representor. */
+	unsigned int master:1; /* Device is an E-Switch master. */
+ unsigned int dr_shared:1; /* DV/DR data is shared. */
+ unsigned int counter_fallback:1; /* Use counter fallback management. */
+	unsigned int mtr_en:1; /* Whether meter is supported. */
+	unsigned int mtr_reg_share:1; /* Whether meter REG_C sharing is supported. */
+ uint16_t domain_id; /* Switch domain identifier. */
+ uint16_t vport_id; /* Associated VF vport index (if any). */
+	uint32_t vport_meta_tag; /* Used for vport index match over VF LAG. */
+ uint32_t vport_meta_mask; /* Used for vport index field match mask. */
+ int32_t representor_id; /* Port representor identifier. */
+ int32_t pf_bond; /* >=0 means PF index in bonding configuration. */
+ unsigned int if_index; /* Associated kernel network device index. */
+ /* RX/TX queues. */
+ unsigned int rxqs_n; /* RX queues array size. */
+ unsigned int txqs_n; /* TX queues array size. */
+ struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */
+ struct mlx5_txq_data *(*txqs)[]; /* TX queues. */
+ struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
+ struct rte_eth_rss_conf rss_conf; /* RSS configuration. */
+ unsigned int (*reta_idx)[]; /* RETA index table. */
+ unsigned int reta_idx_n; /* RETA index size. */
+ struct mlx5_drop drop_queue; /* Flow drop queues. */
+ uint32_t flows; /* RTE Flow rules. */
+ uint32_t ctrl_flows; /* Control flow rules. */
+ void *inter_flows; /* Intermediate resources for flow creation. */
+ void *rss_desc; /* Intermediate rss description resources. */
+ int flow_idx; /* Intermediate device flow index. */
+ int flow_nested_idx; /* Intermediate device flow index, nested. */
+ LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
+ LIST_HEAD(rxqobj, mlx5_rxq_obj) rxqsobj; /* Verbs/DevX Rx queues. */
+ uint32_t hrxqs; /* Verbs Hash Rx queues. */
+ LIST_HEAD(txq, mlx5_txq_ctrl) txqsctrl; /* DPDK Tx queues. */
+ LIST_HEAD(txqobj, mlx5_txq_obj) txqsobj; /* Verbs/DevX Tx queues. */
+ /* Indirection tables. */
+ LIST_HEAD(ind_tables, mlx5_ind_table_obj) ind_tbls;
+ /* Pointer to next element. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ struct ibv_flow_action *verbs_action;
+ /**< Verbs modify header action object. */
+ uint8_t ft_type; /**< Flow table type, Rx or Tx. */
+ uint8_t max_lro_msg_size;
+ /* Tags resources cache. */
+ uint32_t link_speed_capa; /* Link speed capabilities. */
+ struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
+ struct mlx5_stats_ctrl stats_ctrl; /* Stats control. */
+ struct mlx5_dev_config config; /* Device configuration. */
+ struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
+ /* Context for Verbs allocator. */
+ int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
+ int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
+ LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */
+ struct mlx5_nl_vlan_vmwa_context *vmwa_context; /* VLAN WA context. */
+ struct mlx5_flow_id_pool *qrss_id_pool;
+ struct mlx5_hlist *mreg_cp_tbl;
+ /* Hash table of Rx metadata register copy table. */
+ uint8_t mtr_sfx_reg; /* Meter prefix-suffix flow match REG_C. */
+ uint8_t mtr_color_reg; /* Meter color match REG_C. */
+ struct mlx5_mtr_profiles flow_meter_profiles; /* MTR profile list. */
+ struct mlx5_flow_meters flow_meters; /* MTR list. */
+#ifndef RTE_ARCH_64
+ rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
+ rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
+ /* UAR same-page access control required in 32bit implementations. */
+#endif
+ uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */
+ uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */
+	struct mlx5_mp_id mp_id; /* ID of this process in the multi-process context. */
+ LIST_HEAD(fdir, mlx5_fdir_flow) fdir_flows; /* fdir flows. */
+};
+
+#define PORT_ID(priv) ((priv)->dev_data->port_id)
+#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)])
+
+/* mlx5.c */
+
+int mlx5_getenv_int(const char *);
+int mlx5_proc_priv_init(struct rte_eth_dev *dev);
+int64_t mlx5_get_dbr(struct rte_eth_dev *dev,
+ struct mlx5_devx_dbr_page **dbr_page);
+int32_t mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id,
+ uint64_t offset);
+int mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev,
+ struct rte_eth_udp_tunnel *udp_tunnel);
+uint16_t mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev);
+
+/* Macro to iterate over all valid ports for mlx5 driver. */
+#define MLX5_ETH_FOREACH_DEV(port_id, pci_dev) \
+ for (port_id = mlx5_eth_find_next(0, pci_dev); \
+ port_id < RTE_MAX_ETHPORTS; \
+ port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
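A minimal usage sketch of the iterator macro above; the function
count_mlx5_ports_sketch is hypothetical and only shows how the macro pairs
with mlx5_eth_find_next().

/* Hypothetical example: count mlx5 ports matching 'pci_dev'. */
static inline unsigned int
count_mlx5_ports_sketch(struct rte_pci_device *pci_dev)
{
	uint16_t port_id;
	unsigned int n = 0;

	MLX5_ETH_FOREACH_DEV(port_id, pci_dev)
		n++;
	return n;
}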
+
+/* mlx5_ethdev.c */
+
+int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
+int mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]);
+unsigned int mlx5_ifindex(const struct rte_eth_dev *dev);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
+int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
+ unsigned int flags);
+int mlx5_dev_configure(struct rte_eth_dev *dev);
+int mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info);
+int mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock);
+int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size);
+const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete);
+int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status);
+int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
+int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev,
+ struct rte_eth_fc_conf *fc_conf);
+int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev,
+ struct rte_eth_fc_conf *fc_conf);
+void mlx5_dev_link_status_handler(void *arg);
+void mlx5_dev_interrupt_handler(void *arg);
+void mlx5_dev_interrupt_handler_devx(void *arg);
+void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev);
+void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev);
+void mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev);
+void mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev);
+int mlx5_set_link_down(struct rte_eth_dev *dev);
+int mlx5_set_link_up(struct rte_eth_dev *dev);
+int mlx5_is_removed(struct rte_eth_dev *dev);
+eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev);
+eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev);
+struct mlx5_priv *mlx5_port_to_eswitch_info(uint16_t port, bool valid);
+struct mlx5_priv *mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev);
+int mlx5_sysfs_switch_info(unsigned int ifindex,
+ struct mlx5_switch_info *info);
+void mlx5_sysfs_check_switch_info(bool device_dir,
+ struct mlx5_switch_info *switch_info);
+void mlx5_translate_port_name(const char *port_name_in,
+ struct mlx5_switch_info *port_info_out);
+void mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg);
+int mlx5_get_module_info(struct rte_eth_dev *dev,
+ struct rte_eth_dev_module_info *modinfo);
+int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
+ struct rte_dev_eeprom_info *info);
+int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
+ struct rte_eth_hairpin_cap *cap);
+int mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev);
+
+/* mlx5_mac.c */
+
+int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]);
+void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
+int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
+ uint32_t index, uint32_t vmdq);
+struct mlx5_nl_vlan_vmwa_context *mlx5_vlan_vmwa_init
+ (struct rte_eth_dev *dev, uint32_t ifindex);
+int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr);
+int mlx5_set_mc_addr_list(struct rte_eth_dev *dev,
+ struct rte_ether_addr *mc_addr_set,
+ uint32_t nb_mc_addr);
+
+/* mlx5_rss.c */
+
+int mlx5_rss_hash_update(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf);
+int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf);
+int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size);
+int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size);
+int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size);
+
+/* mlx5_rxmode.c */
+
+int mlx5_promiscuous_enable(struct rte_eth_dev *dev);
+int mlx5_promiscuous_disable(struct rte_eth_dev *dev);
+int mlx5_allmulticast_enable(struct rte_eth_dev *dev);
+int mlx5_allmulticast_disable(struct rte_eth_dev *dev);
+
+/* mlx5_stats.c */
+
+void mlx5_stats_init(struct rte_eth_dev *dev);
+int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
+int mlx5_stats_reset(struct rte_eth_dev *dev);
+int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats,
+ unsigned int n);
+int mlx5_xstats_reset(struct rte_eth_dev *dev);
+int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
+ struct rte_eth_xstat_name *xstats_names,
+ unsigned int n);
+
+/* mlx5_vlan.c */
+
+int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
+void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on);
+int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask);
+void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *ctx);
+void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vf_vlan);
+
+/* mlx5_trigger.c */
+
+int mlx5_dev_start(struct rte_eth_dev *dev);
+void mlx5_dev_stop(struct rte_eth_dev *dev);
+int mlx5_traffic_enable(struct rte_eth_dev *dev);
+void mlx5_traffic_disable(struct rte_eth_dev *dev);
+int mlx5_traffic_restart(struct rte_eth_dev *dev);
+
+/* mlx5_flow.c */
+
+int mlx5_flow_discover_mreg_c(struct rte_eth_dev *eth_dev);
+bool mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev);
+int mlx5_flow_discover_priorities(struct rte_eth_dev *dev);
+void mlx5_flow_print(struct rte_flow *flow);
+int mlx5_flow_validate(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error);
+struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error);
+int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow,
+ struct rte_flow_error *error);
+void mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active);
+int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
+int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow,
+ const struct rte_flow_action *action, void *data,
+ struct rte_flow_error *error);
+int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable,
+ struct rte_flow_error *error);
+int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
+ enum rte_filter_type filter_type,
+ enum rte_filter_op filter_op,
+ void *arg);
+int mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list);
+void mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list);
+int mlx5_flow_start_default(struct rte_eth_dev *dev);
+void mlx5_flow_stop_default(struct rte_eth_dev *dev);
+void mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev);
+void mlx5_flow_free_intermediate(struct rte_eth_dev *dev);
+int mlx5_flow_verify(struct rte_eth_dev *dev);
+int mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, uint32_t queue);
+int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
+ struct rte_flow_item_eth *eth_spec,
+ struct rte_flow_item_eth *eth_mask,
+ struct rte_flow_item_vlan *vlan_spec,
+ struct rte_flow_item_vlan *vlan_mask);
+int mlx5_ctrl_flow(struct rte_eth_dev *dev,
+ struct rte_flow_item_eth *eth_spec,
+ struct rte_flow_item_eth *eth_mask);
+struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev);
+int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev);
+void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev);
+void mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh,
+ uint64_t async_id, int status);
+void mlx5_set_query_alarm(struct mlx5_ibv_shared *sh);
+void mlx5_flow_query_alarm(void *arg);
+uint32_t mlx5_counter_alloc(struct rte_eth_dev *dev);
+void mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt);
+int mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt,
+ bool clear, uint64_t *pkts, uint64_t *bytes);
+int mlx5_flow_dev_dump(struct rte_eth_dev *dev, FILE *file,
+ struct rte_flow_error *error);
+void mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev);
+int mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
+ uint32_t nb_contexts, struct rte_flow_error *error);
+
+/* mlx5_mp.c */
+int mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer);
+int mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer);
+void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev);
+void mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev);
+
+/* mlx5_socket.c */
+
+int mlx5_pmd_socket_init(void);
+
+/* mlx5_flow_meter.c */
+
+int mlx5_flow_meter_ops_get(struct rte_eth_dev *dev, void *arg);
+struct mlx5_flow_meter *mlx5_flow_meter_find(struct mlx5_priv *priv,
+ uint32_t meter_id);
+struct mlx5_flow_meter *mlx5_flow_meter_attach
+ (struct mlx5_priv *priv,
+ uint32_t meter_id,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+void mlx5_flow_meter_detach(struct mlx5_flow_meter *fm);
+
+#endif /* RTE_PMD_MLX5_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h
new file mode 100644
index 000000000..260f58429
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_DEFS_H_
+#define RTE_PMD_MLX5_DEFS_H_
+
+#include <rte_ethdev_driver.h>
+#include <rte_vxlan.h>
+
+#include "mlx5_autoconf.h"
+
+/* Reported driver name. */
+#define MLX5_DRIVER_NAME "net_mlx5"
+
+/* Maximum number of simultaneous VLAN filters. */
+#define MLX5_MAX_VLAN_IDS 128
+
+/*
+ * Request TX completion every time descriptors reach this threshold since
+ * the previous request. Must be a power of two for performance reasons.
+ */
+#define MLX5_TX_COMP_THRESH 32u
+
+/*
+ * Request TX completion every time the total number of WQEBBs used for inlining
+ * packets exceeds the size of the WQ divided by this divisor. Should be a power
+ * of two for performance.
+ */
+#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3)
+
+/*
+ * Maximal amount of normal completion CQEs
+ * processed in one call of tx_burst() routine.
+ */
+#define MLX5_TX_COMP_MAX_CQE 2u
+
+
+/* Size of per-queue MR cache array for linear search. */
+#define MLX5_MR_CACHE_N 8
+
+/* Size of MR cache table for binary search. */
+#define MLX5_MR_BTREE_CACHE_N 256
+
+/*
+ * If defined, only use software counters. The PMD will never ask the hardware
+ * for these, and many of them won't be available.
+ */
+#ifndef MLX5_PMD_SOFT_COUNTERS
+#define MLX5_PMD_SOFT_COUNTERS 1
+#endif
+
+/* Switch port ID parameters for bonding configurations. */
+#define MLX5_PORT_ID_BONDING_PF_MASK 0xf
+#define MLX5_PORT_ID_BONDING_PF_SHIFT 0xf
+
+/* Alarm timeout. */
+#define MLX5_ALARM_TIMEOUT_US 100000
+
+/* Maximum number of extended statistics counters. */
+#define MLX5_MAX_XSTATS 32
+
+/* Maximum Packet headers size (L2+L3+L4) for TSO. */
+#define MLX5_MAX_TSO_HEADER (128u + 34u)
+
+/* Inline data size required by NICs. */
+#define MLX5_INLINE_HSIZE_NONE 0
+#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \
+ sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \
+ sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \
+ sizeof(struct rte_tcp_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \
+ sizeof(struct rte_udp_hdr) + \
+ sizeof(struct rte_vxlan_hdr) + \
+ sizeof(struct rte_ether_hdr) + \
+ sizeof(struct rte_vlan_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \
+ sizeof(struct rte_ipv6_hdr))
+#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \
+ sizeof(struct rte_tcp_hdr))
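For orientation, with the usual DPDK header sizes (Ethernet 14, VLAN 4,
IPv6 40, TCP 20, UDP 8, VXLAN 8 bytes) the definitions above expand to the
following byte counts; these numbers are derived here for illustration only:

    MLX5_INLINE_HSIZE_L2       = 14 + 4              = 18
    MLX5_INLINE_HSIZE_L3       = 18 + 40             = 58
    MLX5_INLINE_HSIZE_L4       = 58 + 20             = 78
    MLX5_INLINE_HSIZE_INNER_L2 = 58 + 8 + 8 + 14 + 4 = 92
    MLX5_INLINE_HSIZE_INNER_L3 = 92 + 40             = 132
    MLX5_INLINE_HSIZE_INNER_L4 = 132 + 20            = 152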
+
+/* Threshold of buffer replenishment for vectorized Rx. */
+#define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \
+ (RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2))
+
+/* Maximum size of burst for vectorized Rx. */
+#define MLX5_VPMD_RX_MAX_BURST 64U
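A worked example of the replenishment threshold defined above:

    MLX5_VPMD_RXQ_RPLNSH_THRESH(512) = min(64, 512 >> 2) = min(64, 128) = 64
    MLX5_VPMD_RXQ_RPLNSH_THRESH(128) = min(64, 128 >> 2) = min(64, 32)  = 32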
+
+/* Recommended optimal burst size. */
+#define MLX5_RX_DEFAULT_BURST 64U
+#define MLX5_TX_DEFAULT_BURST 64U
+
+/* Number of packets vectorized Rx can simultaneously process in a loop. */
+#define MLX5_VPMD_DESCS_PER_LOOP 4
+
+/* Mask of RSS on source only or destination only. */
+#define MLX5_RSS_SRC_DST_ONLY (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY | \
+ ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY)
+
+/* Supported RSS */
+#define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \
+ MLX5_RSS_SRC_DST_ONLY))
+
+/* Timeout in seconds to get a valid link status. */
+#define MLX5_LINK_STATUS_TIMEOUT 10
+
+/* Number of times to retry retrieving the physical link information. */
+#define MLX5_GET_LINK_STATUS_RETRY_COUNT 3
+
+/* Maximum number of UAR pages used by a port.
+ * These are the size and mask for an array of mutexes used to synchronize
+ * access to a port's UARs on platforms that do not support 64-bit writes.
+ * On such systems the 64-bit DoorBells are issued through two consecutive
+ * 32-bit writes. Access to a UAR page (which can be accessed by all threads
+ * in the process) must therefore be synchronized (for example, using a
+ * semaphore). Such synchronization is not required when ringing DoorBells
+ * on different UAR pages.
+ * A port with 512 Tx queues uses 8 UAR pages of 4 KB each, which are shared
+ * among the ports.
+ */
+#define MLX5_UAR_PAGE_NUM_MAX 64
+#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1)
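A minimal sketch of how the mask above can map a door-bell address to one of
the MLX5_UAR_PAGE_NUM_MAX locks on 32-bit platforms; the helper name and the
use of the raw address as the hash input are assumptions for illustration, not
the driver's exact code.

/* Hypothetical sketch: same UAR page -> same lock index. */
static inline unsigned int
uar_lock_index_sketch(const void *db_addr, unsigned int uar_page_size)
{
	return ((uintptr_t)db_addr / uar_page_size) & MLX5_UAR_PAGE_NUM_MASK;
}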
+
+/* Fields of memory mapping type in offset parameter of mmap() */
+#define MLX5_UAR_MMAP_CMD_SHIFT 8
+#define MLX5_UAR_MMAP_CMD_MASK 0xff
+
+/* Environment variable to control the doorbell register mapping. */
+#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF"
+#if defined(RTE_ARCH_ARM64)
+#define MLX5_SHUT_UP_BF_DEFAULT "0"
+#else
+#define MLX5_SHUT_UP_BF_DEFAULT "1"
+#endif
+
+#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD
+#define MLX5_MMAP_GET_NC_PAGES_CMD 3
+#endif
+
+/* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
+#define MLX5_MPRQ_STRIDE_NUM_N 6U
+
+/* Log 2 of the default size of a stride per WQE for Multi-Packet RQ. */
+#define MLX5_MPRQ_STRIDE_SIZE_N 11U
+
+/* Two-byte shift is disabled for Multi-Packet RQ. */
+#define MLX5_MPRQ_TWO_BYTE_SHIFT 0
+
+/*
+ * Minimum size of packet to be memcpy'd instead of being attached as an
+ * external buffer.
+ */
+#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128
+
+/* Minimum number of Rx queues to enable Multi-Packet RQ. */
+#define MLX5_MPRQ_MIN_RXQS 12
+
+/* Cache size of mempool for Multi-Packet RQ. */
+#define MLX5_MPRQ_MP_CACHE_SZ 32U
+
+/* MLX5_DV_XMETA_EN supported values. */
+#define MLX5_XMETA_MODE_LEGACY 0
+#define MLX5_XMETA_MODE_META16 1
+#define MLX5_XMETA_MODE_META32 2
+
+/* MLX5_TX_DB_NC supported values. */
+#define MLX5_TXDB_CACHED 0
+#define MLX5_TXDB_NCACHED 1
+#define MLX5_TXDB_HEURISTIC 2
+
+/* Size of the simple hash table for metadata register table. */
+#define MLX5_FLOW_MREG_HTABLE_SZ 4096
+#define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE"
+#define MLX5_DEFAULT_COPY_ID UINT32_MAX
+
+/* Hairpin TX/RX queue configuration parameters. */
+#define MLX5_HAIRPIN_QUEUE_STRIDE 6
+#define MLX5_HAIRPIN_JUMBO_LOG_SIZE (14 + 2)
+
+/* Definition of static_assert found in /usr/include/assert.h */
+#ifndef HAVE_STATIC_ASSERT
+#define static_assert _Static_assert
+#endif
+
+#endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c
new file mode 100644
index 000000000..47f11b963
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c
@@ -0,0 +1,2071 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <fcntl.h>
+#include <stdalign.h>
+#include <sys/un.h>
+#include <time.h>
+
+#include <rte_atomic.h>
+#include <rte_ethdev_driver.h>
+#include <rte_bus_pci.h>
+#include <rte_mbuf.h>
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+#include <rte_rwlock.h>
+#include <rte_cycles.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+/* Supported speed values found in /usr/include/linux/ethtool.h */
+#ifndef HAVE_SUPPORTED_40000baseKR4_Full
+#define SUPPORTED_40000baseKR4_Full (1 << 23)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseCR4_Full
+#define SUPPORTED_40000baseCR4_Full (1 << 24)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseSR4_Full
+#define SUPPORTED_40000baseSR4_Full (1 << 25)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseLR4_Full
+#define SUPPORTED_40000baseLR4_Full (1 << 26)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseKR4_Full
+#define SUPPORTED_56000baseKR4_Full (1 << 27)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseCR4_Full
+#define SUPPORTED_56000baseCR4_Full (1 << 28)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseSR4_Full
+#define SUPPORTED_56000baseSR4_Full (1 << 29)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseLR4_Full
+#define SUPPORTED_56000baseLR4_Full (1 << 30)
+#endif
+
+/* Add defines in case the running kernel is not the same as user headers. */
+#ifndef ETHTOOL_GLINKSETTINGS
+struct ethtool_link_settings {
+ uint32_t cmd;
+ uint32_t speed;
+ uint8_t duplex;
+ uint8_t port;
+ uint8_t phy_address;
+ uint8_t autoneg;
+ uint8_t mdio_support;
+ uint8_t eth_to_mdix;
+ uint8_t eth_tp_mdix_ctrl;
+ int8_t link_mode_masks_nwords;
+ uint32_t reserved[8];
+ uint32_t link_mode_masks[];
+};
+
+/* The kernel values can be found in /include/uapi/linux/ethtool.h */
+#define ETHTOOL_GLINKSETTINGS 0x0000004c
+#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
+#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
+#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
+#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
+#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
+#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
+#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
+#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
+#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
+#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
+#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
+#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
+#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
+#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
+#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
+#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_25G
+#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
+#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
+#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_50G
+#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
+#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_100G
+#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
+#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
+#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
+#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_200G
+#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
+#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
+#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
+#endif
+
+/**
+ * Get the master interface name from the IB device path.
+ *
+ * @param[in] ibdev_path
+ *   Pointer to the IB device path.
+ * @param[out] ifname
+ * Interface name output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
+{
+ DIR *dir;
+ struct dirent *dent;
+ unsigned int dev_type = 0;
+ unsigned int dev_port_prev = ~0u;
+ char match[IF_NAMESIZE] = "";
+
+ MLX5_ASSERT(ibdev_path);
+ {
+ MKSTR(path, "%s/device/net", ibdev_path);
+
+ dir = opendir(path);
+ if (dir == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ }
+ while ((dent = readdir(dir)) != NULL) {
+ char *name = dent->d_name;
+ FILE *file;
+ unsigned int dev_port;
+ int r;
+
+ if ((name[0] == '.') &&
+ ((name[1] == '\0') ||
+ ((name[1] == '.') && (name[2] == '\0'))))
+ continue;
+
+ MKSTR(path, "%s/device/net/%s/%s",
+ ibdev_path, name,
+ (dev_type ? "dev_id" : "dev_port"));
+
+ file = fopen(path, "rb");
+ if (file == NULL) {
+ if (errno != ENOENT)
+ continue;
+ /*
+ * Switch to dev_id when dev_port does not exist as
+ * is the case with Linux kernel versions < 3.15.
+ */
+try_dev_id:
+ match[0] = '\0';
+ if (dev_type)
+ break;
+ dev_type = 1;
+ dev_port_prev = ~0u;
+ rewinddir(dir);
+ continue;
+ }
+ r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
+ fclose(file);
+ if (r != 1)
+ continue;
+ /*
+ * Switch to dev_id when dev_port returns the same value for
+ * all ports. May happen when using a MOFED release older than
+ * 3.0 with a Linux kernel >= 3.15.
+ */
+ if (dev_port == dev_port_prev)
+ goto try_dev_id;
+ dev_port_prev = dev_port;
+ if (dev_port == 0)
+ strlcpy(match, name, sizeof(match));
+ }
+ closedir(dir);
+ if (match[0] == '\0') {
+ rte_errno = ENOENT;
+ return -rte_errno;
+ }
+ strncpy(*ifname, match, sizeof(*ifname));
+ return 0;
+}
+
+/**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] ifname
+ * Interface name output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int ifindex;
+
+ MLX5_ASSERT(priv);
+ MLX5_ASSERT(priv->sh);
+ ifindex = mlx5_ifindex(dev);
+ if (!ifindex) {
+ if (!priv->representor)
+ return mlx5_get_master_ifname(priv->sh->ibdev_path,
+ ifname);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ if (if_indextoname(ifindex, &(*ifname)[0]))
+ return 0;
+ rte_errno = errno;
+ return -rte_errno;
+}
+
+/**
+ * Get the interface index from device name.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Nonzero interface index on success, zero otherwise and rte_errno is set.
+ */
+unsigned int
+mlx5_ifindex(const struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int ifindex;
+
+ MLX5_ASSERT(priv);
+ MLX5_ASSERT(priv->if_index);
+ ifindex = priv->if_index;
+ if (!ifindex)
+ rte_errno = ENXIO;
+ return ifindex;
+}
+
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param req
+ * Request number to pass to ioctl().
+ * @param[out] ifr
+ * Interface request structure output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+{
+ int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ int ret = 0;
+
+ if (sock == -1) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+ if (ret)
+ goto error;
+ ret = ioctl(sock, req, ifr);
+ if (ret == -1) {
+ rte_errno = errno;
+ goto error;
+ }
+ close(sock);
+ return 0;
+error:
+ close(sock);
+ return -rte_errno;
+}
+
+/**
+ * Get device MTU.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] mtu
+ * MTU value output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
+{
+ struct ifreq request;
+ int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+
+ if (ret)
+ return ret;
+ *mtu = request.ifr_mtu;
+ return 0;
+}
+
+/**
+ * Set device MTU.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mtu
+ * MTU value to set.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct ifreq request = { .ifr_mtu = mtu, };
+
+ return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+}
+
+/**
+ * Set device flags.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param keep
+ * Bitmask for flags that must remain untouched.
+ * @param flags
+ * Bitmask for flags to modify.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
+{
+ struct ifreq request;
+ int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+
+ if (ret)
+ return ret;
+ request.ifr_flags &= keep;
+ request.ifr_flags |= flags & ~keep;
+ return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+}
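A minimal usage sketch of mlx5_set_flags(): administratively bringing the
interface up by keeping every flag except IFF_UP and then setting IFF_UP. The
wrapper name below is hypothetical; it only illustrates the keep/flags
semantics.

/* Hypothetical illustration of the keep/flags arguments. */
static int
set_link_up_sketch(struct rte_eth_dev *dev)
{
	return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
}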
+
+/**
+ * DPDK callback for Ethernet device configuration.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_configure(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int rxqs_n = dev->data->nb_rx_queues;
+ unsigned int txqs_n = dev->data->nb_tx_queues;
+ const uint8_t use_app_rss_key =
+ !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
+ int ret = 0;
+
+ if (use_app_rss_key &&
+ (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
+ MLX5_RSS_HASH_KEY_LEN)) {
+ DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
+ dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ priv->rss_conf.rss_key =
+ rte_realloc(priv->rss_conf.rss_key,
+ MLX5_RSS_HASH_KEY_LEN, 0);
+ if (!priv->rss_conf.rss_key) {
+ DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
+ dev->data->port_id, rxqs_n);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+
+ if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
+ dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
+
+ memcpy(priv->rss_conf.rss_key,
+ use_app_rss_key ?
+ dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
+ rss_hash_default_key,
+ MLX5_RSS_HASH_KEY_LEN);
+ priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
+ priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
+ priv->rxqs = (void *)dev->data->rx_queues;
+ priv->txqs = (void *)dev->data->tx_queues;
+ if (txqs_n != priv->txqs_n) {
+ DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
+ dev->data->port_id, priv->txqs_n, txqs_n);
+ priv->txqs_n = txqs_n;
+ }
+ if (rxqs_n > priv->config.ind_table_max_size) {
+ DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
+ dev->data->port_id, rxqs_n);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (rxqs_n != priv->rxqs_n) {
+ DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
+ dev->data->port_id, priv->rxqs_n, rxqs_n);
+ priv->rxqs_n = rxqs_n;
+ }
+ priv->skip_default_rss_reta = 0;
+ ret = mlx5_proc_priv_init(dev);
+ if (ret)
+ return ret;
+ return 0;
+}
+
+/**
+ * Configure default RSS reta.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int rxqs_n = dev->data->nb_rx_queues;
+ unsigned int i;
+ unsigned int j;
+ unsigned int reta_idx_n;
+ int ret = 0;
+ unsigned int *rss_queue_arr = NULL;
+ unsigned int rss_queue_n = 0;
+
+ if (priv->skip_default_rss_reta)
+ return ret;
+ rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0);
+ if (!rss_queue_arr) {
+ DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)",
+ dev->data->port_id, rxqs_n);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ for (i = 0, j = 0; i < rxqs_n; i++) {
+ struct mlx5_rxq_data *rxq_data;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+
+ rxq_data = (*priv->rxqs)[i];
+ rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD)
+ rss_queue_arr[j++] = i;
+ }
+ rss_queue_n = j;
+ if (rss_queue_n > priv->config.ind_table_max_size) {
+ DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
+ dev->data->port_id, rss_queue_n);
+ rte_errno = EINVAL;
+ rte_free(rss_queue_arr);
+ return -rte_errno;
+ }
+ DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
+ dev->data->port_id, priv->rxqs_n, rxqs_n);
+ priv->rxqs_n = rxqs_n;
+ /*
+ * If the requested number of RX queues is not a power of two,
+ * use the maximum indirection table size for better balancing.
+ * The result is always rounded to the next power of two.
+ */
+ reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ?
+ priv->config.ind_table_max_size :
+ rss_queue_n));
+ ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
+ if (ret) {
+ rte_free(rss_queue_arr);
+ return ret;
+ }
+ /*
+ * When the number of RX queues is not a power of two,
+ * the remaining table entries are padded with reused WQs
+ * and hashes are not spread uniformly.
+ */
+ for (i = 0, j = 0; (i != reta_idx_n); ++i) {
+ (*priv->reta_idx)[i] = rss_queue_arr[j];
+ if (++j == rss_queue_n)
+ j = 0;
+ }
+ rte_free(rss_queue_arr);
+ return ret;
+}
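A worked example of the RETA sizing logic above, assuming
ind_table_max_size = 512 (an assumption chosen for illustration):

    rss_queue_n = 8:  8 & 7 == 0  ->  reta_idx_n = 1 << log2above(8)   = 8
    rss_queue_n = 6:  6 & 5 != 0  ->  reta_idx_n = 1 << log2above(512) = 512
                      (512 entries filled round-robin over 6 queues:
                       queues 0-1 get 86 slots each, queues 2-5 get 85,
                       hence the slightly uneven hash spread noted above.)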
+
+/**
+ * Sets default tuning parameters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] info
+ * Info structure output buffer.
+ */
+static void
+mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ /* Minimum CPU utilization. */
+ info->default_rxportconf.ring_size = 256;
+ info->default_txportconf.ring_size = 256;
+ info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST;
+ info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST;
+ if ((priv->link_speed_capa & ETH_LINK_SPEED_200G) |
+ (priv->link_speed_capa & ETH_LINK_SPEED_100G)) {
+ info->default_rxportconf.nb_queues = 16;
+ info->default_txportconf.nb_queues = 16;
+ if (dev->data->nb_rx_queues > 2 ||
+ dev->data->nb_tx_queues > 2) {
+ /* Max Throughput. */
+ info->default_rxportconf.ring_size = 2048;
+ info->default_txportconf.ring_size = 2048;
+ }
+ } else {
+ info->default_rxportconf.nb_queues = 8;
+ info->default_txportconf.nb_queues = 8;
+ if (dev->data->nb_rx_queues > 2 ||
+ dev->data->nb_tx_queues > 2) {
+ /* Max Throughput. */
+ info->default_rxportconf.ring_size = 4096;
+ info->default_txportconf.ring_size = 4096;
+ }
+ }
+}
+
+/**
+ * Sets tx mbuf limiting parameters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] info
+ * Info structure output buffer.
+ */
+static void
+mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ unsigned int inlen;
+ uint16_t nb_max;
+
+ inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+ MLX5_SEND_DEF_INLINE_LEN :
+ (unsigned int)config->txq_inline_max;
+ MLX5_ASSERT(config->txq_inline_min >= 0);
+ inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
+ inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
+ MLX5_ESEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE * 2);
+ nb_max = (MLX5_WQE_SIZE_MAX +
+ MLX5_ESEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE -
+ inlen) / MLX5_WSEG_SIZE;
+ info->tx_desc_lim.nb_seg_max = nb_max;
+ info->tx_desc_lim.nb_mtu_seg_max = nb_max;
+}
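+
+/*
+ * Illustrative sketch (not part of the driver): the function above derives
+ * the maximum number of mbuf segments per Tx descriptor from the WQE size
+ * budget left after the control/ethernet/data segments and the inline data.
+ * The constants below are hypothetical placeholders; the real MLX5_WQE_* and
+ * MLX5_WSEG_SIZE values come from the driver headers.
+ */
+#if 0 /* example only */
+#define EX_WQE_SIZE_MAX    1024u /* assumed WQE budget */
+#define EX_ESEG_MIN_INLINE 18u
+#define EX_WQE_CSEG_SIZE   16u
+#define EX_WQE_ESEG_SIZE   16u
+#define EX_WQE_DSEG_SIZE   16u
+#define EX_WSEG_SIZE       16u
+
+static unsigned int
+example_nb_seg_max(unsigned int inlen)
+{
+	return (EX_WQE_SIZE_MAX + EX_ESEG_MIN_INLINE -
+		EX_WQE_CSEG_SIZE - EX_WQE_ESEG_SIZE -
+		EX_WQE_DSEG_SIZE - inlen) / EX_WSEG_SIZE;
+}
+/* With inlen == 18: (1024 + 18 - 16 - 16 - 16 - 18) / 16 == 61 segments. */
+#endif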
+
+/**
+ * DPDK callback to get information about the device.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] info
+ * Info structure output buffer.
+ */
+int
+mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ unsigned int max;
+
+ /* FIXME: we should ask the device for these values. */
+ info->min_rx_bufsize = 32;
+ info->max_rx_pktlen = 65536;
+ info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE;
+ /*
+ * Since we need one CQ per QP, the limit is the minimum number
+ * between the two values.
+ */
+ max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
+ priv->sh->device_attr.orig_attr.max_qp);
+ /* max_rx_queues is uint16_t. */
+ max = RTE_MIN(max, (unsigned int)UINT16_MAX);
+ info->max_rx_queues = max;
+ info->max_tx_queues = max;
+ info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
+ info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
+ info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
+ info->rx_queue_offload_capa);
+ info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
+ info->if_index = mlx5_ifindex(dev);
+ info->reta_size = priv->reta_idx_n ?
+ priv->reta_idx_n : config->ind_table_max_size;
+ info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
+ info->speed_capa = priv->link_speed_capa;
+ info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
+ mlx5_set_default_params(dev, info);
+ mlx5_set_txlimit_params(dev, info);
+ info->switch_info.name = dev->data->name;
+ info->switch_info.domain_id = priv->domain_id;
+ info->switch_info.port_id = priv->representor_id;
+ if (priv->representor) {
+ uint16_t port_id;
+
+ if (priv->pf_bond >= 0) {
+ /*
+			 * The switch port ID is an opaque value with a
+			 * driver-defined format. In bonding configurations
+			 * the PF index is pushed into the upper four bits
+			 * of the port ID. If we ever get too many
+			 * representors (more than 4K) or PFs (more than 15)
+			 * this approach must be reconsidered.
+ */
+ if ((info->switch_info.port_id >>
+ MLX5_PORT_ID_BONDING_PF_SHIFT) ||
+ priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) {
+ DRV_LOG(ERR, "can't update switch port ID"
+ " for bonding device");
+ MLX5_ASSERT(false);
+ return -ENODEV;
+ }
+ info->switch_info.port_id |=
+ priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT;
+ }
+ MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (!opriv ||
+ opriv->representor ||
+ opriv->sh != priv->sh ||
+ opriv->domain_id != priv->domain_id)
+ continue;
+ /*
+ * Override switch name with that of the master
+ * device.
+ */
+ info->switch_info.name = opriv->dev_data->name;
+ break;
+ }
+ }
+ return 0;
+}
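+
+/*
+ * Illustrative sketch (not part of the driver): how the PF index is packed
+ * into the representor switch port ID above. The shift and mask below are
+ * local assumptions matching the "upper four bits" comment; the driver uses
+ * its own MLX5_PORT_ID_BONDING_PF_* macros for this.
+ */
+#if 0 /* example only */
+#include <stdint.h>
+#include <stdio.h>
+
+#define EXAMPLE_PF_SHIFT 12  /* assumed: upper 4 bits of a 16-bit ID */
+#define EXAMPLE_PF_MASK  0xf
+
+int
+main(void)
+{
+	uint16_t representor_id = 37; /* hypothetical representor port */
+	unsigned int pf_bond = 2;     /* hypothetical PF index in the bond */
+	uint16_t switch_port_id;
+
+	/* Reject values that do not fit, as the driver does. */
+	if ((representor_id >> EXAMPLE_PF_SHIFT) || pf_bond > EXAMPLE_PF_MASK)
+		return 1;
+	switch_port_id = representor_id | (pf_bond << EXAMPLE_PF_SHIFT);
+	printf("switch port id 0x%04x: pf %u, representor %u\n",
+	       switch_port_id, switch_port_id >> EXAMPLE_PF_SHIFT,
+	       switch_port_id & ((1u << EXAMPLE_PF_SHIFT) - 1));
+	return 0; /* prints "switch port id 0x2025: pf 2, representor 37" */
+}
+#endif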
+
+/**
+ * Get device current raw clock counter
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] time
+ * Current raw clock counter of the device.
+ *
+ * @return
+ * 0 if the clock has correctly been read
+ * The value of errno in case of error
+ */
+int
+mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct ibv_values_ex values;
+ int err = 0;
+
+ values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
+ err = mlx5_glue->query_rt_values_ex(ctx, &values);
+ if (err != 0) {
+		DRV_LOG(WARNING, "Could not query the clock!");
+ return err;
+ }
+ *clock = values.raw_clock.tv_nsec;
+ return 0;
+}
+
+/**
+ * Get firmware version of a device.
+ *
+ * @param dev
+ * Ethernet device port.
+ * @param fw_ver
+ * String output allocated by caller.
+ * @param fw_size
+ * Size of the output string, including terminating null byte.
+ *
+ * @return
+ *   0 on success, or the size of the non-truncated string if too big.
+ */
+int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
+ size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
+
+ if (fw_size < size)
+ return size;
+ if (fw_ver != NULL)
+ strlcpy(fw_ver, attr->fw_ver, fw_size);
+ return 0;
+}
+
+/**
+ * Get supported packet types.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * A pointer to the supported Packet types array.
+ */
+const uint32_t *
+mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
+{
+ static const uint32_t ptypes[] = {
+ /* refers to rxq_cq_to_pkt_type() */
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_L4_NONFRAG,
+ RTE_PTYPE_L4_FRAG,
+ RTE_PTYPE_L4_TCP,
+ RTE_PTYPE_L4_UDP,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ RTE_PTYPE_INNER_L4_FRAG,
+ RTE_PTYPE_INNER_L4_TCP,
+ RTE_PTYPE_INNER_L4_UDP,
+ RTE_PTYPE_UNKNOWN
+ };
+
+ if (dev->rx_pkt_burst == mlx5_rx_burst ||
+ dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
+ dev->rx_pkt_burst == mlx5_rx_burst_vec)
+ return ptypes;
+ return NULL;
+}
+
+/**
+ * Retrieve the master device for representor in the same switch domain.
+ *
+ * @param dev
+ * Pointer to representor Ethernet device structure.
+ *
+ * @return
+ * Master device structure on success, NULL otherwise.
+ */
+static struct rte_eth_dev *
+mlx5_find_master_dev(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv;
+ uint16_t port_id;
+ uint16_t domain_id;
+
+ priv = dev->data->dev_private;
+ domain_id = priv->domain_id;
+ MLX5_ASSERT(priv->representor);
+ MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+ if (opriv &&
+ opriv->master &&
+ opriv->domain_id == domain_id &&
+ opriv->sh == priv->sh)
+ return &rte_eth_devices[port_id];
+ }
+ return NULL;
+}
+
+/**
+ * Retrieve physical link information (unlocked version using legacy ioctl).
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] link
+ * Storage for current link status.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
+ struct rte_eth_link *link)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ethtool_cmd edata = {
+ .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
+ };
+ struct ifreq ifr;
+ struct rte_eth_link dev_link;
+ int link_speed = 0;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link = (struct rte_eth_link) {
+ .link_status = ((ifr.ifr_flags & IFF_UP) &&
+ (ifr.ifr_flags & IFF_RUNNING)),
+ };
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&edata,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ if (ret == -ENOTSUP && priv->representor) {
+ struct rte_eth_dev *master;
+
+ /*
+			 * For representors we can try to inherit link
+			 * settings from the master device. Link settings
+			 * do not make much sense for representors anyway,
+			 * since they have no physical link. The old kernel
+			 * drivers supported an emulated settings query for
+			 * representors, the new ones do not, so we keep
+			 * this code for compatibility.
+ */
+ master = mlx5_find_master_dev(dev);
+ if (master) {
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&edata,
+ };
+ ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+ }
+ }
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL,"
+ " ETHTOOL_GSET) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ }
+ link_speed = ethtool_cmd_speed(&edata);
+ if (link_speed == -1)
+ dev_link.link_speed = ETH_SPEED_NUM_NONE;
+ else
+ dev_link.link_speed = link_speed;
+ priv->link_speed_capa = 0;
+ if (edata.supported & SUPPORTED_Autoneg)
+ priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
+ if (edata.supported & (SUPPORTED_1000baseT_Full |
+ SUPPORTED_1000baseKX_Full))
+ priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+ if (edata.supported & SUPPORTED_10000baseKR_Full)
+ priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+ if (edata.supported & (SUPPORTED_40000baseKR4_Full |
+ SUPPORTED_40000baseCR4_Full |
+ SUPPORTED_40000baseSR4_Full |
+ SUPPORTED_40000baseLR4_Full))
+ priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+ dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
+ ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+ dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+ ETH_LINK_SPEED_FIXED);
+ if (((dev_link.link_speed && !dev_link.link_status) ||
+ (!dev_link.link_speed && dev_link.link_status))) {
+ rte_errno = EAGAIN;
+ return -rte_errno;
+ }
+ *link = dev_link;
+ return 0;
+}
+
+/**
+ * Retrieve physical link information (unlocked version using new ioctl).
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] link
+ * Storage for current link status.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
+ struct rte_eth_link *link)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
+ struct ifreq ifr;
+ struct rte_eth_link dev_link;
+ struct rte_eth_dev *master = NULL;
+ uint64_t sc;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link = (struct rte_eth_link) {
+ .link_status = ((ifr.ifr_flags & IFF_UP) &&
+ (ifr.ifr_flags & IFF_RUNNING)),
+ };
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&gcmd,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ if (ret == -ENOTSUP && priv->representor) {
+ /*
+			 * For representors we can try to inherit link
+			 * settings from the master device. Link settings
+			 * do not make much sense for representors anyway,
+			 * since they have no physical link. The old kernel
+			 * drivers supported an emulated settings query for
+			 * representors, the new ones do not, so we keep
+			 * this code for compatibility.
+ */
+ master = mlx5_find_master_dev(dev);
+ if (master) {
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&gcmd,
+ };
+ ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+ }
+ }
+ if (ret) {
+ DRV_LOG(DEBUG,
+ "port %u ioctl(SIOCETHTOOL,"
+ " ETHTOOL_GLINKSETTINGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+
+ }
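+	/*
+	 * ETHTOOL_GLINKSETTINGS handshake: the request above was issued with
+	 * link_mode_masks_nwords == 0, so the kernel reports the required
+	 * number of 32-bit mask words as a negative value. Negate it and
+	 * repeat the request with room for the supported, advertising and
+	 * lp_advertising masks (hence the "* 3" below).
+	 */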
+ gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
+
+ alignas(struct ethtool_link_settings)
+ uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
+ sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
+ struct ethtool_link_settings *ecmd = (void *)data;
+
+ *ecmd = gcmd;
+ ifr.ifr_data = (void *)ecmd;
+ ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(DEBUG,
+ "port %u ioctl(SIOCETHTOOL,"
+			" ETHTOOL_GLINKSETTINGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
+ ecmd->speed;
+ sc = ecmd->link_mode_masks[0] |
+ ((uint64_t)ecmd->link_mode_masks[1] << 32);
+ priv->link_speed_capa = 0;
+ if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
+ priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_20G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_56G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_25G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_50G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_100G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+
+ sc = ecmd->link_mode_masks[2] |
+ ((uint64_t)ecmd->link_mode_masks[3] << 32);
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(
+ ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+ dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
+ ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+ dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+ ETH_LINK_SPEED_FIXED);
+ if (((dev_link.link_speed && !dev_link.link_status) ||
+ (!dev_link.link_speed && dev_link.link_status))) {
+ rte_errno = EAGAIN;
+ return -rte_errno;
+ }
+ *link = dev_link;
+ return 0;
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ * Wait for request completion.
+ *
+ * @return
+ * 0 if link status was not updated, positive if it was, a negative errno
+ * value otherwise and rte_errno is set.
+ */
+int
+mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+ int ret;
+ struct rte_eth_link dev_link;
+ time_t start_time = time(NULL);
+ int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
+
+ do {
+ ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
+ if (ret == -ENOTSUP)
+ ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
+ if (ret == 0)
+ break;
+ /* Handle wait to complete situation. */
+ if ((wait_to_complete || retry) && ret == -EAGAIN) {
+ if (abs((int)difftime(time(NULL), start_time)) <
+ MLX5_LINK_STATUS_TIMEOUT) {
+ usleep(0);
+ continue;
+ } else {
+ rte_errno = EBUSY;
+ return -rte_errno;
+ }
+ } else if (ret < 0) {
+ return ret;
+ }
+ } while (wait_to_complete || retry-- > 0);
+ ret = !!memcmp(&dev->data->dev_link, &dev_link,
+ sizeof(struct rte_eth_link));
+ dev->data->dev_link = dev_link;
+ return ret;
+}
+
+/**
+ * DPDK callback to change the MTU.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mtu
+ * New MTU.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint16_t kern_mtu = 0;
+ int ret;
+
+ ret = mlx5_get_mtu(dev, &kern_mtu);
+ if (ret)
+ return ret;
+ /* Set kernel interface MTU first. */
+ ret = mlx5_set_mtu(dev, mtu);
+ if (ret)
+ return ret;
+ ret = mlx5_get_mtu(dev, &kern_mtu);
+ if (ret)
+ return ret;
+ if (kern_mtu == mtu) {
+ priv->mtu = mtu;
+ DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
+ dev->data->port_id, mtu);
+ return 0;
+ }
+ rte_errno = EAGAIN;
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback to get flow control status.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ * Flow control output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_GPAUSEPARAM
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)&ethpause;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
+ " %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ fc_conf->autoneg = ethpause.autoneg;
+ if (ethpause.rx_pause && ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_FULL;
+ else if (ethpause.rx_pause)
+ fc_conf->mode = RTE_FC_RX_PAUSE;
+ else if (ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_TX_PAUSE;
+ else
+ fc_conf->mode = RTE_FC_NONE;
+ return 0;
+}
+
+/**
+ * DPDK callback to modify flow control parameters.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ * Flow control parameters.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_SPAUSEPARAM
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)&ethpause;
+ ethpause.autoneg = fc_conf->autoneg;
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_RX_PAUSE))
+ ethpause.rx_pause = 1;
+ else
+ ethpause.rx_pause = 0;
+
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_TX_PAUSE))
+ ethpause.tx_pause = 1;
+ else
+ ethpause.tx_pause = 0;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+ " failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * Handle asynchronous removal event for the entire multiport device.
+ *
+ * @param sh
+ * Infiniband device shared context.
+ */
+static void
+mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
+{
+ uint32_t i;
+
+ for (i = 0; i < sh->max_port; ++i) {
+ struct rte_eth_dev *dev;
+
+ if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
+ /*
+			 * Either the port does not exist or no
+			 * handler is installed for this port.
+ */
+ continue;
+ }
+ dev = &rte_eth_devices[sh->port[i].ih_port_id];
+ MLX5_ASSERT(dev);
+ if (dev->data->dev_conf.intr_conf.rmv)
+ _rte_eth_dev_callback_process
+ (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
+ }
+}
+
+/**
+ * Handle shared asynchronous events from the NIC (removal event
+ * and link status change). Supports multiport IB devices.
+ *
+ * @param cb_arg
+ * Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler(void *cb_arg)
+{
+ struct mlx5_ibv_shared *sh = cb_arg;
+ struct ibv_async_event event;
+
+ /* Read all message from the IB device and acknowledge them. */
+ for (;;) {
+ struct rte_eth_dev *dev;
+ uint32_t tmp;
+
+ if (mlx5_glue->get_async_event(sh->ctx, &event))
+ break;
+ /* Retrieve and check IB port index. */
+ tmp = (uint32_t)event.element.port_num;
+ if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
+ /*
+			 * The DEVICE_FATAL event is reported once for the
+			 * entire device without specifying a port.
+ * We should notify all existing ports.
+ */
+ mlx5_glue->ack_async_event(&event);
+ mlx5_dev_interrupt_device_fatal(sh);
+ continue;
+ }
+ MLX5_ASSERT(tmp && (tmp <= sh->max_port));
+ if (!tmp) {
+			/* Unsupported device-level event. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+ "unsupported common event (type %d)",
+ event.event_type);
+ continue;
+ }
+ if (tmp > sh->max_port) {
+ /* Invalid IB port index. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+				"cannot handle an event (type %d) "
+ "due to invalid IB port index (%u)",
+ event.event_type, tmp);
+ continue;
+ }
+ if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+ /* No handler installed. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+				"cannot handle an event (type %d) "
+ "due to no handler installed for port %u",
+ event.event_type, tmp);
+ continue;
+ }
+ /* Retrieve ethernet device descriptor. */
+ tmp = sh->port[tmp - 1].ih_port_id;
+ dev = &rte_eth_devices[tmp];
+ MLX5_ASSERT(dev);
+ if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+ event.event_type == IBV_EVENT_PORT_ERR) &&
+ dev->data->dev_conf.intr_conf.lsc) {
+ mlx5_glue->ack_async_event(&event);
+ if (mlx5_link_update(dev, 0) == -EAGAIN) {
+ usleep(0);
+ continue;
+ }
+ _rte_eth_dev_callback_process
+ (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
+ continue;
+ }
+ DRV_LOG(DEBUG,
+ "port %u cannot handle an unknown event (type %d)",
+ dev->data->port_id, event.event_type);
+ mlx5_glue->ack_async_event(&event);
+ }
+}
+
+/*
+ * Unregister the callback handler safely. The handler may be active
+ * while we are trying to unregister it; in this case the code -EAGAIN
+ * is returned by rte_intr_callback_unregister(). This routine checks
+ * the return code and tries to unregister the handler again.
+ *
+ * @param handle
+ *   Interrupt handle.
+ * @param cb_fn
+ *   Pointer to the callback routine.
+ * @param cb_arg
+ *   Opaque callback parameter.
+ */
+void
+mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+ /*
+ * Try to reduce timeout management overhead by not calling
+	 * the timer-related routines on the first iteration. If the
+	 * unregistering succeeds on the first call there will be no
+ * timer calls at all.
+ */
+ uint64_t twait = 0;
+ uint64_t start = 0;
+
+ do {
+ int ret;
+
+ ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
+ if (ret >= 0)
+ return;
+ if (ret != -EAGAIN) {
+ DRV_LOG(INFO, "failed to unregister interrupt"
+ " handler (error: %d)", ret);
+ MLX5_ASSERT(false);
+ return;
+ }
+ if (twait) {
+ struct timespec onems;
+
+ /* Wait one millisecond and try again. */
+ onems.tv_sec = 0;
+ onems.tv_nsec = NS_PER_S / MS_PER_S;
+ nanosleep(&onems, 0);
+ /* Check whether one second elapsed. */
+ if ((rte_get_timer_cycles() - start) <= twait)
+ continue;
+ } else {
+ /*
+			 * Get the number of timer ticks in one second.
+			 * If this amount has elapsed it means we have spent
+			 * one second waiting. This branch is executed once,
+			 * on the first iteration.
+ */
+ twait = rte_get_timer_hz();
+ MLX5_ASSERT(twait);
+ }
+ /*
+		 * Timeout elapsed, show a message (once a second) and retry.
+		 * We have no other acceptable option here; if we ignored
+		 * the unregistering return code the handler would not
+		 * be unregistered, the fd would be closed and we might
+		 * get a crash. Looping with a message seems to be the
+		 * least bad choice.
+ */
+ DRV_LOG(INFO, "Retrying to unregister interrupt handler");
+ start = rte_get_timer_cycles();
+ } while (true);
+}
+
+/**
+ * Handle DEVX interrupts from the NIC.
+ * This function is probably called from the DPDK host thread.
+ *
+ * @param cb_arg
+ * Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler_devx(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_ASYNC
+ (void)cb_arg;
+ return;
+#else
+ struct mlx5_ibv_shared *sh = cb_arg;
+ union {
+ struct mlx5dv_devx_async_cmd_hdr cmd_resp;
+ uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+ MLX5_ST_SZ_BYTES(traffic_counter) +
+ sizeof(struct mlx5dv_devx_async_cmd_hdr)];
+ } out;
+ uint8_t *buf = out.buf + sizeof(out.cmd_resp);
+
+ while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
+ &out.cmd_resp,
+ sizeof(out.buf)))
+ mlx5_flow_async_pool_query_handle
+ (sh, (uint64_t)out.cmd_resp.wr_id,
+ mlx5_devx_get_out_command_status(buf));
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+/**
+ * Uninstall shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of a single IB device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return;
+ pthread_mutex_lock(&sh->intr_mutex);
+ MLX5_ASSERT(priv->ibv_port);
+ MLX5_ASSERT(priv->ibv_port <= sh->max_port);
+ MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
+ if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
+ goto exit;
+ MLX5_ASSERT(sh->port[priv->ibv_port - 1].ih_port_id ==
+ (uint32_t)dev->data->port_id);
+ MLX5_ASSERT(sh->intr_cnt);
+ sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
+ if (!sh->intr_cnt || --sh->intr_cnt)
+ goto exit;
+ mlx5_intr_callback_unregister(&sh->intr_handle,
+ mlx5_dev_interrupt_handler, sh);
+ sh->intr_handle.fd = 0;
+ sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+exit:
+ pthread_mutex_unlock(&sh->intr_mutex);
+}
+
+/**
+ * Uninstall devx shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of a single IB device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_dev_shared_handler_devx_uninstall(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return;
+ pthread_mutex_lock(&sh->intr_mutex);
+ MLX5_ASSERT(priv->ibv_port);
+ MLX5_ASSERT(priv->ibv_port <= sh->max_port);
+ MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
+ if (sh->port[priv->ibv_port - 1].devx_ih_port_id >= RTE_MAX_ETHPORTS)
+ goto exit;
+ MLX5_ASSERT(sh->port[priv->ibv_port - 1].devx_ih_port_id ==
+ (uint32_t)dev->data->port_id);
+ sh->port[priv->ibv_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
+ if (!sh->devx_intr_cnt || --sh->devx_intr_cnt)
+ goto exit;
+ if (sh->intr_handle_devx.fd) {
+ rte_intr_callback_unregister(&sh->intr_handle_devx,
+ mlx5_dev_interrupt_handler_devx,
+ sh);
+ sh->intr_handle_devx.fd = 0;
+ sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
+ }
+ if (sh->devx_comp) {
+ mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
+ sh->devx_comp = NULL;
+ }
+exit:
+ pthread_mutex_unlock(&sh->intr_mutex);
+}
+
+/**
+ * Install shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of a single IB device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ int ret;
+ int flags;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return;
+ pthread_mutex_lock(&sh->intr_mutex);
+ MLX5_ASSERT(priv->ibv_port);
+ MLX5_ASSERT(priv->ibv_port <= sh->max_port);
+ MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
+ if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
+ /* The handler is already installed for this port. */
+ MLX5_ASSERT(sh->intr_cnt);
+ goto exit;
+ }
+ if (sh->intr_cnt) {
+ sh->port[priv->ibv_port - 1].ih_port_id =
+ (uint32_t)dev->data->port_id;
+ sh->intr_cnt++;
+ goto exit;
+ }
+ /* No shared handler installed. */
+ MLX5_ASSERT(sh->ctx->async_fd > 0);
+ flags = fcntl(sh->ctx->async_fd, F_GETFL);
+ ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
+ if (ret) {
+		DRV_LOG(INFO, "failed to change the async event queue"
+			" file descriptor");
+ /* Indicate there will be no interrupts. */
+ dev->data->dev_conf.intr_conf.lsc = 0;
+ dev->data->dev_conf.intr_conf.rmv = 0;
+ } else {
+ sh->intr_handle.fd = sh->ctx->async_fd;
+ sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
+ rte_intr_callback_register(&sh->intr_handle,
+ mlx5_dev_interrupt_handler, sh);
+ sh->intr_cnt++;
+ sh->port[priv->ibv_port - 1].ih_port_id =
+ (uint32_t)dev->data->port_id;
+ }
+exit:
+ pthread_mutex_unlock(&sh->intr_mutex);
+}
+
+/**
+ * Install devx shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of a single IB device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_dev_shared_handler_devx_install(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return;
+ pthread_mutex_lock(&sh->intr_mutex);
+ MLX5_ASSERT(priv->ibv_port);
+ MLX5_ASSERT(priv->ibv_port <= sh->max_port);
+ MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
+ if (sh->port[priv->ibv_port - 1].devx_ih_port_id < RTE_MAX_ETHPORTS) {
+ /* The handler is already installed for this port. */
+ MLX5_ASSERT(sh->devx_intr_cnt);
+ goto exit;
+ }
+ if (sh->devx_intr_cnt) {
+ sh->devx_intr_cnt++;
+ sh->port[priv->ibv_port - 1].devx_ih_port_id =
+ (uint32_t)dev->data->port_id;
+ goto exit;
+ }
+ if (priv->config.devx) {
+#ifndef HAVE_IBV_DEVX_ASYNC
+ goto exit;
+#else
+ sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
+ if (sh->devx_comp) {
+ int flags = fcntl(sh->devx_comp->fd, F_GETFL);
+ int ret = fcntl(sh->devx_comp->fd, F_SETFL,
+ flags | O_NONBLOCK);
+
+ if (ret) {
+				DRV_LOG(INFO, "failed to change the devx async"
+					" event queue file descriptor");
+ } else {
+ sh->intr_handle_devx.fd = sh->devx_comp->fd;
+ sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
+ rte_intr_callback_register
+ (&sh->intr_handle_devx,
+ mlx5_dev_interrupt_handler_devx, sh);
+ sh->devx_intr_cnt++;
+ sh->port[priv->ibv_port - 1].devx_ih_port_id =
+ (uint32_t)dev->data->port_id;
+ }
+ }
+#endif /* HAVE_IBV_DEVX_ASYNC */
+ }
+exit:
+ pthread_mutex_unlock(&sh->intr_mutex);
+}
+
+/**
+ * Uninstall interrupt handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
+{
+ mlx5_dev_shared_handler_uninstall(dev);
+}
+
+/**
+ * Install interrupt handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
+{
+ mlx5_dev_shared_handler_install(dev);
+}
+
+/**
+ * Devx uninstall interrupt handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev)
+{
+ mlx5_dev_shared_handler_devx_uninstall(dev);
+}
+
+/**
+ * Devx install interrupt handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev)
+{
+ mlx5_dev_shared_handler_devx_install(dev);
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_down(struct rte_eth_dev *dev)
+{
+ return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_up(struct rte_eth_dev *dev)
+{
+ return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
+}
+
+/**
+ * Configure the RX function to use.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ * Pointer to selected Rx burst function.
+ */
+eth_rx_burst_t
+mlx5_select_rx_function(struct rte_eth_dev *dev)
+{
+ eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
+
+ MLX5_ASSERT(dev != NULL);
+ if (mlx5_check_vec_rx_support(dev) > 0) {
+ rx_pkt_burst = mlx5_rx_burst_vec;
+ DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
+ dev->data->port_id);
+ } else if (mlx5_mprq_enabled(dev)) {
+ rx_pkt_burst = mlx5_rx_burst_mprq;
+ }
+ return rx_pkt_burst;
+}
+
+/**
+ * Check if mlx5 device was removed.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 1 when device is removed, otherwise 0.
+ */
+int
+mlx5_is_removed(struct rte_eth_dev *dev)
+{
+ struct ibv_device_attr device_attr;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
+ return 1;
+ return 0;
+}
+
+/**
+ * Get the E-Switch parameters by port id.
+ *
+ * @param[in] port
+ * Device port id.
+ * @param[in] valid
+ *   Device port ID is known to be valid, skip the check. This flag is
+ *   useful when the call is made from probing and the device is not
+ *   flagged as valid yet (in the attaching process).
+ *
+ * @return
+ * pointer to device private data structure containing data needed
+ * on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_priv *
+mlx5_port_to_eswitch_info(uint16_t port, bool valid)
+{
+ struct rte_eth_dev *dev;
+ struct mlx5_priv *priv;
+
+ if (port >= RTE_MAX_ETHPORTS) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ if (!valid && !rte_eth_dev_is_valid_port(port)) {
+ rte_errno = ENODEV;
+ return NULL;
+ }
+ dev = &rte_eth_devices[port];
+ priv = dev->data->dev_private;
+ if (!(priv->representor || priv->master)) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ return priv;
+}
+
+/**
+ * Get the E-Switch parameters by device instance.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ * pointer to device private data structure containing data needed
+ * on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_priv *
+mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv;
+
+ priv = dev->data->dev_private;
+ if (!(priv->representor || priv->master)) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ return priv;
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * @param ifindex
+ * Network interface index.
+ * @param[out] info
+ * Switch information object, populated in case of success.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
+{
+ char ifname[IF_NAMESIZE];
+ char port_name[IF_NAMESIZE];
+ FILE *file;
+ struct mlx5_switch_info data = {
+ .master = 0,
+ .representor = 0,
+ .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
+ .port_name = 0,
+ .switch_id = 0,
+ };
+ DIR *dir;
+ bool port_switch_id_set = false;
+ bool device_dir = false;
+ char c;
+ int ret;
+
+ if (!if_indextoname(ifindex, ifname)) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+
+ MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
+ ifname);
+ MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
+ ifname);
+ MKSTR(pci_device, "/sys/class/net/%s/device",
+ ifname);
+
+ file = fopen(phys_port_name, "rb");
+ if (file != NULL) {
+ ret = fscanf(file, "%s", port_name);
+ fclose(file);
+ if (ret == 1)
+ mlx5_translate_port_name(port_name, &data);
+ }
+ file = fopen(phys_switch_id, "rb");
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ port_switch_id_set =
+ fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
+ c == '\n';
+ fclose(file);
+ dir = opendir(pci_device);
+ if (dir != NULL) {
+ closedir(dir);
+ device_dir = true;
+ }
+ if (port_switch_id_set) {
+ /* We have some E-Switch configuration. */
+ mlx5_sysfs_check_switch_info(device_dir, &data);
+ }
+ *info = data;
+ MLX5_ASSERT(!(data.master && data.representor));
+ if (data.master && data.representor) {
+ DRV_LOG(ERR, "ifindex %u device is recognized as master"
+ " and as representor", ifindex);
+ rte_errno = ENODEV;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Analyze gathered port parameters via sysfs to recognize master
+ * and representor devices for E-Switch configuration.
+ *
+ * @param[in] device_dir
+ *   Flag indicating the presence of the "device" directory under the
+ *   port device key.
+ * @param[inout] switch_info
+ *   Port information, including the port name as a number and the port
+ *   name type if recognized.
+ *
+ * @return
+ *   The master and representor flags are set in switch_info according
+ *   to the recognized parameters (if any).
+ */
+void
+mlx5_sysfs_check_switch_info(bool device_dir,
+ struct mlx5_switch_info *switch_info)
+{
+ switch (switch_info->name_type) {
+ case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+ /*
+		 * The name is not recognized; assume the master and
+		 * check for the device directory presence.
+ */
+ switch_info->master = device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+ /*
+		 * The name is not set; this assumes the legacy naming
+		 * schema for the master, so just check whether there is
+		 * a device directory.
+ */
+ switch_info->master = device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+ /* New uplink naming schema recognized. */
+ switch_info->master = 1;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+ /* Legacy representors naming schema. */
+ switch_info->representor = !device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+ /* New representors naming schema. */
+ switch_info->representor = 1;
+ break;
+ }
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM information (type and size).
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] modinfo
+ * Storage for plug-in module EEPROM information.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_module_info(struct rte_eth_dev *dev,
+ struct rte_eth_dev_module_info *modinfo)
+{
+ struct ethtool_modinfo info = {
+ .cmd = ETHTOOL_GMODULEINFO,
+ };
+ struct ifreq ifr = (struct ifreq) {
+ .ifr_data = (void *)&info,
+ };
+ int ret = 0;
+
+ if (!dev || !modinfo) {
+ DRV_LOG(WARNING, "missing argument, cannot get module info");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ modinfo->type = info.type;
+ modinfo->eeprom_len = info.eeprom_len;
+ return ret;
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM data.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] info
+ * Storage for plug-in module EEPROM data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
+ struct rte_dev_eeprom_info *info)
+{
+ struct ethtool_eeprom *eeprom;
+ struct ifreq ifr;
+ int ret = 0;
+
+ if (!dev || !info) {
+ DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ eeprom = rte_calloc(__func__, 1,
+ (sizeof(struct ethtool_eeprom) + info->length), 0);
+ if (!eeprom) {
+ DRV_LOG(WARNING, "port %u cannot allocate memory for "
+ "eeprom data", dev->data->port_id);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ eeprom->cmd = ETHTOOL_GMODULEEEPROM;
+ eeprom->offset = info->offset;
+ eeprom->len = info->length;
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)eeprom,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret)
+ DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ else
+ rte_memcpy(info->data, eeprom->data, info->length);
+ rte_free(eeprom);
+ return ret;
+}
+
+/**
+ * DPDK callback to retrieve hairpin capabilities.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] cap
+ * Storage for hairpin capability data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
+ struct rte_eth_hairpin_cap *cap)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->sh->devx == 0) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ cap->max_nb_queues = UINT16_MAX;
+ cap->max_rx_2_tx = 1;
+ cap->max_tx_2_rx = 1;
+ cap->max_nb_desc = 8192;
+ return 0;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c
new file mode 100644
index 000000000..ae478a510
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c
@@ -0,0 +1,6204 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox Technologies, Ltd
+ */
+
+#include <netinet/in.h>
+#include <sys/queue.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_flow.h>
+#include <rte_cycles.h>
+#include <rte_flow_driver.h>
+#include <rte_malloc.h>
+#include <rte_ip.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_flow.h"
+#include "mlx5_rxtx.h"
+
+/* Dev ops structure defined in mlx5.c */
+extern const struct eth_dev_ops mlx5_dev_ops;
+extern const struct eth_dev_ops mlx5_dev_ops_isolate;
+
+/** Device flow drivers. */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops;
+#endif
+extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
+
+const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops;
+
+const struct mlx5_flow_driver_ops *flow_drv_ops[] = {
+ [MLX5_FLOW_TYPE_MIN] = &mlx5_flow_null_drv_ops,
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ [MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops,
+#endif
+ [MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops,
+ [MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops
+};
+
+enum mlx5_expansion {
+ MLX5_EXPANSION_ROOT,
+ MLX5_EXPANSION_ROOT_OUTER,
+ MLX5_EXPANSION_ROOT_ETH_VLAN,
+ MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN,
+ MLX5_EXPANSION_OUTER_ETH,
+ MLX5_EXPANSION_OUTER_ETH_VLAN,
+ MLX5_EXPANSION_OUTER_VLAN,
+ MLX5_EXPANSION_OUTER_IPV4,
+ MLX5_EXPANSION_OUTER_IPV4_UDP,
+ MLX5_EXPANSION_OUTER_IPV4_TCP,
+ MLX5_EXPANSION_OUTER_IPV6,
+ MLX5_EXPANSION_OUTER_IPV6_UDP,
+ MLX5_EXPANSION_OUTER_IPV6_TCP,
+ MLX5_EXPANSION_VXLAN,
+ MLX5_EXPANSION_VXLAN_GPE,
+ MLX5_EXPANSION_GRE,
+ MLX5_EXPANSION_MPLS,
+ MLX5_EXPANSION_ETH,
+ MLX5_EXPANSION_ETH_VLAN,
+ MLX5_EXPANSION_VLAN,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV4_UDP,
+ MLX5_EXPANSION_IPV4_TCP,
+ MLX5_EXPANSION_IPV6,
+ MLX5_EXPANSION_IPV6_UDP,
+ MLX5_EXPANSION_IPV6_TCP,
+};
+
+/** Supported expansion of items. */
+static const struct rte_flow_expand_node mlx5_support_expansion[] = {
+ [MLX5_EXPANSION_ROOT] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ [MLX5_EXPANSION_ROOT_OUTER] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH,
+ MLX5_EXPANSION_OUTER_IPV4,
+ MLX5_EXPANSION_OUTER_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ [MLX5_EXPANSION_ROOT_ETH_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH_VLAN),
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ [MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH_VLAN),
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ [MLX5_EXPANSION_OUTER_ETH] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4,
+ MLX5_EXPANSION_OUTER_IPV6,
+ MLX5_EXPANSION_MPLS),
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .rss_types = 0,
+ },
+ [MLX5_EXPANSION_OUTER_ETH_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_VLAN),
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .rss_types = 0,
+ },
+ [MLX5_EXPANSION_OUTER_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4,
+ MLX5_EXPANSION_OUTER_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_VLAN,
+ },
+ [MLX5_EXPANSION_OUTER_IPV4] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT
+ (MLX5_EXPANSION_OUTER_IPV4_UDP,
+ MLX5_EXPANSION_OUTER_IPV4_TCP,
+ MLX5_EXPANSION_GRE,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_IPV4,
+ .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
+ ETH_RSS_NONFRAG_IPV4_OTHER,
+ },
+ [MLX5_EXPANSION_OUTER_IPV4_UDP] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
+ MLX5_EXPANSION_VXLAN_GPE),
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
+ },
+ [MLX5_EXPANSION_OUTER_IPV4_TCP] = {
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
+ },
+ [MLX5_EXPANSION_OUTER_IPV6] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT
+ (MLX5_EXPANSION_OUTER_IPV6_UDP,
+ MLX5_EXPANSION_OUTER_IPV6_TCP,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_IPV6,
+ .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
+ ETH_RSS_NONFRAG_IPV6_OTHER,
+ },
+ [MLX5_EXPANSION_OUTER_IPV6_UDP] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
+ MLX5_EXPANSION_VXLAN_GPE),
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
+ },
+ [MLX5_EXPANSION_OUTER_IPV6_TCP] = {
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
+ },
+ [MLX5_EXPANSION_VXLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_VXLAN,
+ },
+ [MLX5_EXPANSION_VXLAN_GPE] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH,
+ MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
+ },
+ [MLX5_EXPANSION_GRE] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4),
+ .type = RTE_FLOW_ITEM_TYPE_GRE,
+ },
+ [MLX5_EXPANSION_MPLS] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_MPLS,
+ },
+ [MLX5_EXPANSION_ETH] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ },
+ [MLX5_EXPANSION_ETH_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VLAN),
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ },
+ [MLX5_EXPANSION_VLAN] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
+ MLX5_EXPANSION_IPV6),
+ .type = RTE_FLOW_ITEM_TYPE_VLAN,
+ },
+ [MLX5_EXPANSION_IPV4] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4_UDP,
+ MLX5_EXPANSION_IPV4_TCP),
+ .type = RTE_FLOW_ITEM_TYPE_IPV4,
+ .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
+ ETH_RSS_NONFRAG_IPV4_OTHER,
+ },
+ [MLX5_EXPANSION_IPV4_UDP] = {
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
+ },
+ [MLX5_EXPANSION_IPV4_TCP] = {
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
+ },
+ [MLX5_EXPANSION_IPV6] = {
+ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV6_UDP,
+ MLX5_EXPANSION_IPV6_TCP),
+ .type = RTE_FLOW_ITEM_TYPE_IPV6,
+ .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
+ ETH_RSS_NONFRAG_IPV6_OTHER,
+ },
+ [MLX5_EXPANSION_IPV6_UDP] = {
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
+ },
+ [MLX5_EXPANSION_IPV6_TCP] = {
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
+ },
+};
+
+static const struct rte_flow_ops mlx5_flow_ops = {
+ .validate = mlx5_flow_validate,
+ .create = mlx5_flow_create,
+ .destroy = mlx5_flow_destroy,
+ .flush = mlx5_flow_flush,
+ .isolate = mlx5_flow_isolate,
+ .query = mlx5_flow_query,
+ .dev_dump = mlx5_flow_dev_dump,
+ .get_aged_flows = mlx5_flow_get_aged_flows,
+};
+
+/* Convert FDIR request to Generic flow. */
+struct mlx5_fdir {
+ struct rte_flow_attr attr;
+ struct rte_flow_item items[4];
+ struct rte_flow_item_eth l2;
+ struct rte_flow_item_eth l2_mask;
+ union {
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv6 ipv6;
+ } l3;
+ union {
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv6 ipv6;
+ } l3_mask;
+ union {
+ struct rte_flow_item_udp udp;
+ struct rte_flow_item_tcp tcp;
+ } l4;
+ union {
+ struct rte_flow_item_udp udp;
+ struct rte_flow_item_tcp tcp;
+ } l4_mask;
+ struct rte_flow_action actions[2];
+ struct rte_flow_action_queue queue;
+};
+
+/* Map of Verbs to Flow priority with 8 Verbs priorities. */
+static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = {
+ { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 },
+};
+
+/* Map of Verbs to Flow priority with 16 Verbs priorities. */
+static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = {
+ { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 },
+ { 9, 10, 11 }, { 12, 13, 14 },
+};
+
+/* Tunnel information. */
+struct mlx5_flow_tunnel_info {
+ uint64_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */
+ uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */
+};
+
+static struct mlx5_flow_tunnel_info tunnels_info[] = {
+ {
+ .tunnel = MLX5_FLOW_LAYER_VXLAN,
+ .ptype = RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_GENEVE,
+ .ptype = RTE_PTYPE_TUNNEL_GENEVE | RTE_PTYPE_L4_UDP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_VXLAN_GPE,
+ .ptype = RTE_PTYPE_TUNNEL_VXLAN_GPE | RTE_PTYPE_L4_UDP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_GRE,
+ .ptype = RTE_PTYPE_TUNNEL_GRE,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_MPLS | MLX5_FLOW_LAYER_OUTER_L4_UDP,
+ .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_UDP | RTE_PTYPE_L4_UDP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_MPLS,
+ .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_NVGRE,
+ .ptype = RTE_PTYPE_TUNNEL_NVGRE,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_IPIP,
+ .ptype = RTE_PTYPE_TUNNEL_IP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_IPV6_ENCAP,
+ .ptype = RTE_PTYPE_TUNNEL_IP,
+ },
+ {
+ .tunnel = MLX5_FLOW_LAYER_GTP,
+ .ptype = RTE_PTYPE_TUNNEL_GTPU,
+ },
+};
+
+/**
+ * Translate tag ID to register.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] feature
+ *   The feature that requests the register.
+ * @param[in] id
+ *   The requested register ID.
+ * @param[out] error
+ *   Error description in case of any.
+ *
+ * @return
+ *   The requested register on success, a negative errno
+ * value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
+ enum mlx5_feature_name feature,
+ uint32_t id,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ enum modify_reg start_reg;
+ bool skip_mtr_reg = false;
+
+ switch (feature) {
+ case MLX5_HAIRPIN_RX:
+ return REG_B;
+ case MLX5_HAIRPIN_TX:
+ return REG_A;
+ case MLX5_METADATA_RX:
+ switch (config->dv_xmeta_en) {
+ case MLX5_XMETA_MODE_LEGACY:
+ return REG_B;
+ case MLX5_XMETA_MODE_META16:
+ return REG_C_0;
+ case MLX5_XMETA_MODE_META32:
+ return REG_C_1;
+ }
+ break;
+ case MLX5_METADATA_TX:
+ return REG_A;
+ case MLX5_METADATA_FDB:
+ switch (config->dv_xmeta_en) {
+ case MLX5_XMETA_MODE_LEGACY:
+ return REG_NONE;
+ case MLX5_XMETA_MODE_META16:
+ return REG_C_0;
+ case MLX5_XMETA_MODE_META32:
+ return REG_C_1;
+ }
+ break;
+ case MLX5_FLOW_MARK:
+ switch (config->dv_xmeta_en) {
+ case MLX5_XMETA_MODE_LEGACY:
+ return REG_NONE;
+ case MLX5_XMETA_MODE_META16:
+ return REG_C_1;
+ case MLX5_XMETA_MODE_META32:
+ return REG_C_0;
+ }
+ break;
+ case MLX5_MTR_SFX:
+ /*
+ * If meter color and flow match share one register, flow match
+ * should use the meter color register for match.
+ */
+ if (priv->mtr_reg_share)
+ return priv->mtr_color_reg;
+ else
+ return priv->mtr_color_reg != REG_C_2 ? REG_C_2 :
+ REG_C_3;
+ case MLX5_MTR_COLOR:
+ MLX5_ASSERT(priv->mtr_color_reg != REG_NONE);
+ return priv->mtr_color_reg;
+ case MLX5_COPY_MARK:
+ /*
+		 * The metadata COPY_MARK register is used in the meter
+		 * suffix sub-flow when a meter is present. It is safe to
+		 * share the same register.
+ */
+ return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : REG_C_3;
+ case MLX5_APP_TAG:
+ /*
+		 * If the meter is enabled, it engages a register for color
+		 * match and flow match. If the meter color match does not
+		 * use REG_C_2, the REG_C_x used by the meter color match
+		 * must be skipped.
+		 * If the meter is disabled, all available registers are
+		 * free to use.
+ */
+ start_reg = priv->mtr_color_reg != REG_C_2 ? REG_C_2 :
+ (priv->mtr_reg_share ? REG_C_3 : REG_C_4);
+ skip_mtr_reg = !!(priv->mtr_en && start_reg == REG_C_2);
+ if (id > (REG_C_7 - start_reg))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "invalid tag id");
+ if (config->flow_mreg_c[id + start_reg - REG_C_0] == REG_NONE)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "unsupported tag id");
+ /*
+		 * This case means the meter is using a REG_C_x greater
+		 * than 2. Take care not to conflict with the meter color
+		 * REG_C_x. If the available index REG_C_y >= REG_C_x,
+		 * skip the color register.
+ */
+ if (skip_mtr_reg && config->flow_mreg_c
+ [id + start_reg - REG_C_0] >= priv->mtr_color_reg) {
+ if (id >= (REG_C_7 - start_reg))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "invalid tag id");
+ if (config->flow_mreg_c
+ [id + 1 + start_reg - REG_C_0] != REG_NONE)
+ return config->flow_mreg_c
+ [id + 1 + start_reg - REG_C_0];
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "unsupported tag id");
+ }
+ return config->flow_mreg_c[id + start_reg - REG_C_0];
+ }
+ MLX5_ASSERT(false);
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "invalid feature name");
+}
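+
+/*
+ * Illustrative sketch (not part of the driver): how a caller resolves the
+ * metadata register for a feature before building a flow. The feature name
+ * and the function signature come from the code above; the wrapping helper
+ * is hypothetical.
+ */
+#if 0 /* example only */
+static int
+example_resolve_mark_reg(struct rte_eth_dev *dev)
+{
+	struct rte_flow_error error;
+	int reg;
+
+	/* Which REG_C_x carries the flow mark depends on dv_xmeta_en. */
+	reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, &error);
+	if (reg < 0)
+		return reg; /* rte_errno is already set by the helper */
+	/* ... use 'reg' when building matcher items and actions ... */
+	return reg;
+}
+#endif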
+
+/**
+ * Check extensive flow metadata register support.
+ *
+ * @param dev
+ * Pointer to rte_eth_dev structure.
+ *
+ * @return
+ * True if device supports extensive flow metadata register, otherwise false.
+ */
+bool
+mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+
+ /*
+	 * Having an available reg_c can be regarded as supporting
+	 * extensive flow metadata registers, which implies:
+	 * - metadata register copy action by modify header.
+	 * - 16 modify header actions are supported.
+	 * - reg_c's are preserved across different domains (FDB and NIC)
+	 *   on packet loopback by flow lookup miss.
+ */
+ return config->flow_mreg_c[2] != REG_NONE;
+}
+
+/**
+ * Discover the maximum number of priorities available.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ *
+ * @return
+ *   number of supported flow priorities on success, a negative errno
+ * value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_discover_priorities(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct {
+ struct ibv_flow_attr attr;
+ struct ibv_flow_spec_eth eth;
+ struct ibv_flow_spec_action_drop drop;
+ } flow_attr = {
+ .attr = {
+ .num_of_specs = 2,
+ .port = (uint8_t)priv->ibv_port,
+ },
+ .eth = {
+ .type = IBV_FLOW_SPEC_ETH,
+ .size = sizeof(struct ibv_flow_spec_eth),
+ },
+ .drop = {
+ .size = sizeof(struct ibv_flow_spec_action_drop),
+ .type = IBV_FLOW_SPEC_ACTION_DROP,
+ },
+ };
+ struct ibv_flow *flow;
+ struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
+ uint16_t vprio[] = { 8, 16 };
+ int i;
+ int priority = 0;
+
+ if (!drop) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ for (i = 0; i != RTE_DIM(vprio); i++) {
+ flow_attr.attr.priority = vprio[i] - 1;
+ flow = mlx5_glue->create_flow(drop->qp, &flow_attr.attr);
+ if (!flow)
+ break;
+ claim_zero(mlx5_glue->destroy_flow(flow));
+ priority = vprio[i];
+ }
+ mlx5_hrxq_drop_release(dev);
+ switch (priority) {
+ case 8:
+ priority = RTE_DIM(priority_map_3);
+ break;
+ case 16:
+ priority = RTE_DIM(priority_map_5);
+ break;
+ default:
+ rte_errno = ENOTSUP;
+ DRV_LOG(ERR,
+ "port %u verbs maximum priority: %d expected 8/16",
+ dev->data->port_id, priority);
+ return -rte_errno;
+ }
+ DRV_LOG(INFO, "port %u flow maximum priority: %d",
+ dev->data->port_id, priority);
+ return priority;
+}
+
+/**
+ * Adjust flow priority based on the highest layer and the request priority.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] priority
+ * The rule base priority.
+ * @param[in] subpriority
+ * The priority based on the items.
+ *
+ * @return
+ * The new priority.
+ */
+uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
+ uint32_t subpriority)
+{
+ uint32_t res = 0;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ switch (priv->config.flow_prio) {
+ case RTE_DIM(priority_map_3):
+ res = priority_map_3[priority][subpriority];
+ break;
+ case RTE_DIM(priority_map_5):
+ res = priority_map_5[priority][subpriority];
+ break;
+ }
+ return res;
+}
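+
+/*
+ * A minimal usage sketch of the two helpers above (illustrative only, not a
+ * driver entry point): mlx5_flow_discover_priorities() is called once at
+ * start-up and its result is cached in priv->config.flow_prio, then the
+ * translation code maps a rule priority plus an item-derived subpriority to
+ * the final priority. The subpriority 0 below is just an example value.
+ */
+static __rte_unused uint32_t
+flow_priority_usage_sketch(struct rte_eth_dev *dev, int32_t rule_priority)
+{
+	/* Final priority as programmed to the Verbs/DV layer. */
+	return mlx5_flow_adjust_priority(dev, rule_priority, 0);
+}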
+
+/**
+ * Verify the @p item specifications (spec, last, mask) are compatible with the
+ * NIC capabilities.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] mask
+ * @p item->mask or flow default bit-masks.
+ * @param[in] nic_mask
+ * Bit-masks covering supported fields by the NIC to compare with user mask.
+ * @param[in] size
+ * Bit-masks size in bytes.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_item_acceptable(const struct rte_flow_item *item,
+ const uint8_t *mask,
+ const uint8_t *nic_mask,
+ unsigned int size,
+ struct rte_flow_error *error)
+{
+ unsigned int i;
+
+ MLX5_ASSERT(nic_mask);
+ for (i = 0; i < size; ++i)
+ if ((nic_mask[i] | mask[i]) != nic_mask[i])
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "mask enables non supported"
+ " bits");
+ if (!item->spec && (item->mask || item->last))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "mask/last without a spec is not"
+ " supported");
+ if (item->spec && item->last) {
+ uint8_t spec[size];
+ uint8_t last[size];
+ unsigned int i;
+ int ret;
+
+ for (i = 0; i < size; ++i) {
+ spec[i] = ((const uint8_t *)item->spec)[i] & mask[i];
+ last[i] = ((const uint8_t *)item->last)[i] & mask[i];
+ }
+ ret = memcmp(spec, last, size);
+ if (ret != 0)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "range is not valid");
+ }
+ return 0;
+}
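+
+/*
+ * Illustrative sketch of a typical mlx5_flow_item_acceptable() call from an
+ * item validator. The UDP item and the generic rte_flow_item_udp_mask are
+ * used here only as an example; real validators pass a NIC-specific mask.
+ */
+static __rte_unused int
+flow_item_acceptable_usage_sketch(const struct rte_flow_item *item,
+				  struct rte_flow_error *error)
+{
+	const struct rte_flow_item_udp *mask = item->mask;
+
+	if (!mask)
+		mask = &rte_flow_item_udp_mask;
+	return mlx5_flow_item_acceptable
+		(item, (const uint8_t *)mask,
+		 (const uint8_t *)&rte_flow_item_udp_mask,
+		 sizeof(struct rte_flow_item_udp), error);
+}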
+
+/**
+ * Adjust the hash fields according to the @p flow information.
+ *
+ * @param[in] rss_desc
+ *   Pointer to the RSS flow descriptor (struct mlx5_flow_rss_desc).
+ * @param[in] tunnel
+ * 1 when the hash field is for a tunnel item.
+ * @param[in] layer_types
+ * ETH_RSS_* types.
+ * @param[in] hash_fields
+ * Item hash fields.
+ *
+ * @return
+ * The hash fields that should be used.
+ */
+uint64_t
+mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc,
+ int tunnel __rte_unused, uint64_t layer_types,
+ uint64_t hash_fields)
+{
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ int rss_request_inner = rss_desc->level >= 2;
+
+ /* Check RSS hash level for tunnel. */
+ if (tunnel && rss_request_inner)
+ hash_fields |= IBV_RX_HASH_INNER;
+ else if (tunnel || rss_request_inner)
+ return 0;
+#endif
+ /* Check if requested layer matches RSS hash fields. */
+ if (!(rss_desc->types & layer_types))
+ return 0;
+ return hash_fields;
+}
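+
+/*
+ * Illustrative sketch: adjusting the IPv4 hash fields for a possibly
+ * tunneled flow with the helper above. ETH_RSS_IPV4 and the Verbs
+ * IBV_RX_HASH_* constants are standard definitions and serve only as an
+ * example field selection.
+ */
+static __rte_unused uint64_t
+flow_hashfields_usage_sketch(struct mlx5_flow_rss_desc *rss_desc, int tunnel)
+{
+	return mlx5_flow_hashfields_adjust(rss_desc, tunnel, ETH_RSS_IPV4,
+					   IBV_RX_HASH_SRC_IPV4 |
+					   IBV_RX_HASH_DST_IPV4);
+}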
+
+/**
+ * Look up and set the tunnel ptype in the Rx queue data. A single ptype can
+ * be used; if several tunnel rules are used on this queue, the tunnel ptype
+ * is cleared.
+ *
+ * @param rxq_ctrl
+ * Rx queue to update.
+ */
+static void
+flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ unsigned int i;
+ uint32_t tunnel_ptype = 0;
+
+ /* Look up for the ptype to use. */
+ for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) {
+ if (!rxq_ctrl->flow_tunnels_n[i])
+ continue;
+ if (!tunnel_ptype) {
+ tunnel_ptype = tunnels_info[i].ptype;
+ } else {
+ tunnel_ptype = 0;
+ break;
+ }
+ }
+ rxq_ctrl->rxq.tunnel = tunnel_ptype;
+}
+
+/**
+ * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the device
+ * flow.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] dev_handle
+ * Pointer to device flow handle structure.
+ */
+static void
+flow_drv_rxq_flags_set(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *dev_handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int mark = dev_handle->mark;
+ const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL);
+ struct mlx5_hrxq *hrxq;
+ unsigned int i;
+
+ if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE)
+ return;
+ hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ],
+ dev_handle->rix_hrxq);
+ if (!hrxq)
+ return;
+ for (i = 0; i != hrxq->ind_table->queues_n; ++i) {
+ int idx = hrxq->ind_table->queues[i];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[idx],
+ struct mlx5_rxq_ctrl, rxq);
+
+ /*
+		 * To support metadata register copy on Tx loopback,
+		 * this must always be enabled (metadata may arrive
+		 * from another port, not only from local flows).
+ */
+ if (priv->config.dv_flow_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ mlx5_flow_ext_mreg_supported(dev)) {
+ rxq_ctrl->rxq.mark = 1;
+ rxq_ctrl->flow_mark_n = 1;
+ } else if (mark) {
+ rxq_ctrl->rxq.mark = 1;
+ rxq_ctrl->flow_mark_n++;
+ }
+ if (tunnel) {
+ unsigned int j;
+
+ /* Increase the counter matching the flow. */
+ for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) {
+ if ((tunnels_info[j].tunnel &
+ dev_handle->layers) ==
+ tunnels_info[j].tunnel) {
+ rxq_ctrl->flow_tunnels_n[j]++;
+ break;
+ }
+ }
+ flow_rxq_tunnel_ptype_update(rxq_ctrl);
+ }
+ }
+}
+
+/**
+ * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) for a flow
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] flow
+ * Pointer to flow structure.
+ */
+static void
+flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t handle_idx;
+ struct mlx5_flow_handle *dev_handle;
+
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ handle_idx, dev_handle, next)
+ flow_drv_rxq_flags_set(dev, dev_handle);
+}
+
+/**
+ * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the
+ * device flow if no other flow uses it with the same kind of request.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] dev_handle
+ * Pointer to the device flow handle structure.
+ */
+static void
+flow_drv_rxq_flags_trim(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *dev_handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int mark = dev_handle->mark;
+ const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL);
+ struct mlx5_hrxq *hrxq;
+ unsigned int i;
+
+ if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE)
+ return;
+ hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ],
+ dev_handle->rix_hrxq);
+ if (!hrxq)
+ return;
+ MLX5_ASSERT(dev->data->dev_started);
+ for (i = 0; i != hrxq->ind_table->queues_n; ++i) {
+ int idx = hrxq->ind_table->queues[i];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[idx],
+ struct mlx5_rxq_ctrl, rxq);
+
+ if (priv->config.dv_flow_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ mlx5_flow_ext_mreg_supported(dev)) {
+ rxq_ctrl->rxq.mark = 1;
+ rxq_ctrl->flow_mark_n = 1;
+ } else if (mark) {
+ rxq_ctrl->flow_mark_n--;
+ rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n;
+ }
+ if (tunnel) {
+ unsigned int j;
+
+ /* Decrease the counter matching the flow. */
+ for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) {
+ if ((tunnels_info[j].tunnel &
+ dev_handle->layers) ==
+ tunnels_info[j].tunnel) {
+ rxq_ctrl->flow_tunnels_n[j]--;
+ break;
+ }
+ }
+ flow_rxq_tunnel_ptype_update(rxq_ctrl);
+ }
+ }
+}
+
+/**
+ * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the
+ * @p flow if no other flow uses it with the same kind of request.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Pointer to the flow.
+ */
+static void
+flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t handle_idx;
+ struct mlx5_flow_handle *dev_handle;
+
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ handle_idx, dev_handle, next)
+ flow_drv_rxq_flags_trim(dev, dev_handle);
+}
+
+/**
+ * Clear the Mark/Flag and Tunnel ptype information in all Rx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+flow_rxq_flags_clear(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ unsigned int j;
+
+ if (!(*priv->rxqs)[i])
+ continue;
+ rxq_ctrl = container_of((*priv->rxqs)[i],
+ struct mlx5_rxq_ctrl, rxq);
+ rxq_ctrl->flow_mark_n = 0;
+ rxq_ctrl->rxq.mark = 0;
+ for (j = 0; j != MLX5_FLOW_TUNNEL; ++j)
+ rxq_ctrl->flow_tunnels_n[j] = 0;
+ rxq_ctrl->rxq.tunnel = 0;
+ }
+}
+
+/**
+ * Set the Rx queue dynamic metadata (mask and offset) for a flow
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ */
+void
+mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *data;
+ unsigned int i;
+
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ if (!(*priv->rxqs)[i])
+ continue;
+ data = (*priv->rxqs)[i];
+ if (!rte_flow_dynf_metadata_avail()) {
+ data->dynf_meta = 0;
+ data->flow_meta_mask = 0;
+ data->flow_meta_offset = -1;
+ } else {
+ data->dynf_meta = 1;
+ data->flow_meta_mask = rte_flow_dynf_metadata_mask;
+ data->flow_meta_offset = rte_flow_dynf_metadata_offs;
+ }
+ }
+}
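+
+/*
+ * Illustrative application-side sketch: the dynamic metadata field and flag
+ * must be registered with rte_flow_dynf_metadata_register() before the port
+ * is started, otherwise the per-queue setup above disables metadata
+ * delivery. Reading the value back from a received mbuf uses the rte_flow
+ * dynamic field helpers; the function below only shows the intended order.
+ */
+static __rte_unused uint32_t
+flow_dynf_metadata_usage_sketch(struct rte_mbuf *m)
+{
+	if (!rte_flow_dynf_metadata_avail() &&
+	    rte_flow_dynf_metadata_register() < 0)
+		return 0; /* Registration failed, no metadata available. */
+	return *RTE_FLOW_DYNF_METADATA(m);
+}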
+
+/*
+ * Return a pointer to the desired action in the list of actions.
+ *
+ * @param[in] actions
+ * The list of actions to search the action in.
+ * @param[in] action
+ * The action to find.
+ *
+ * @return
+ * Pointer to the action in the list, if found. NULL otherwise.
+ */
+const struct rte_flow_action *
+mlx5_flow_find_action(const struct rte_flow_action *actions,
+ enum rte_flow_action_type action)
+{
+ if (actions == NULL)
+ return NULL;
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++)
+ if (actions->type == action)
+ return actions;
+ return NULL;
+}
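+
+/*
+ * Illustrative sketch: using the helper above to check whether an action
+ * list contains a COUNT action. The action type is only an example.
+ */
+static __rte_unused int
+flow_find_action_usage_sketch(const struct rte_flow_action actions[])
+{
+	return mlx5_flow_find_action(actions,
+				     RTE_FLOW_ACTION_TYPE_COUNT) != NULL;
+}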
+
+/*
+ * Validate the flag action.
+ *
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_flag(uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ if (action_flags & MLX5_FLOW_ACTION_MARK)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't mark and flag in same flow");
+ if (action_flags & MLX5_FLOW_ACTION_FLAG)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 flag"
+ " actions in same flow");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "flag action not supported for "
+ "egress");
+ return 0;
+}
+
+/*
+ * Validate the mark action.
+ *
+ * @param[in] action
+ * Pointer to the queue action.
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_mark(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_mark *mark = action->conf;
+
+ if (!mark)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ action,
+ "configuration cannot be null");
+ if (mark->id >= MLX5_FLOW_MARK_MAX)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &mark->id,
+					  "mark id must be in 0 <= id < "
+ RTE_STR(MLX5_FLOW_MARK_MAX));
+ if (action_flags & MLX5_FLOW_ACTION_FLAG)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't flag and mark in same flow");
+ if (action_flags & MLX5_FLOW_ACTION_MARK)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 mark actions in same"
+ " flow");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "mark action not supported for "
+ "egress");
+ return 0;
+}
+
+/*
+ * Validate the drop action.
+ *
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_drop(uint64_t action_flags __rte_unused,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "drop action not supported for "
+ "egress");
+ return 0;
+}
+
+/*
+ * Validate the queue action.
+ *
+ * @param[in] action
+ * Pointer to the queue action.
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_queue(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_action_queue *queue = action->conf;
+
+ if (action_flags & MLX5_FLOW_FATE_ACTIONS)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 fate actions in"
+ " same flow");
+ if (!priv->rxqs_n)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "No Rx queues configured");
+ if (queue->index >= priv->rxqs_n)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &queue->index,
+ "queue index out of range");
+ if (!(*priv->rxqs)[queue->index])
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &queue->index,
+ "queue is not configured");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "queue action not supported for "
+ "egress");
+ return 0;
+}
+
+/*
+ * Validate the rss action.
+ *
+ * @param[in] action
+ * Pointer to the queue action.
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[in] item_flags
+ * Items that were detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_action_rss *rss = action->conf;
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ unsigned int i;
+
+ if (action_flags & MLX5_FLOW_FATE_ACTIONS)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 fate actions"
+ " in same flow");
+ if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT &&
+ rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->func,
+ "RSS hash function not supported");
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ if (rss->level > 2)
+#else
+ if (rss->level > 1)
+#endif
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->level,
+ "tunnel RSS is not supported");
+ /* allow RSS key_len 0 in case of NULL (default) RSS key. */
+ if (rss->key_len == 0 && rss->key != NULL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->key_len,
+ "RSS hash key length 0");
+ if (rss->key_len > 0 && rss->key_len < MLX5_RSS_HASH_KEY_LEN)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->key_len,
+ "RSS hash key too small");
+ if (rss->key_len > MLX5_RSS_HASH_KEY_LEN)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->key_len,
+ "RSS hash key too large");
+ if (rss->queue_num > priv->config.ind_table_max_size)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->queue_num,
+ "number of queues too large");
+ if (rss->types & MLX5_RSS_HF_MASK)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->types,
+ "some RSS protocols are not"
+ " supported");
+ if ((rss->types & (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY)) &&
+ !(rss->types & ETH_RSS_IP))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+ "L3 partial RSS requested but L3 RSS"
+ " type not specified");
+ if ((rss->types & (ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY)) &&
+ !(rss->types & (ETH_RSS_UDP | ETH_RSS_TCP)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+ "L4 partial RSS requested but L4 RSS"
+ " type not specified");
+ if (!priv->rxqs_n)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "No Rx queues configured");
+ if (!rss->queue_num)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "No queues configured");
+ for (i = 0; i != rss->queue_num; ++i) {
+ if (rss->queue[i] >= priv->rxqs_n)
+ return rte_flow_error_set
+ (error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->queue[i], "queue index out of range");
+ if (!(*priv->rxqs)[rss->queue[i]])
+ return rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &rss->queue[i], "queue is not configured");
+ }
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "rss action not supported for "
+ "egress");
+ if (rss->level > 1 && !tunnel)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+ "inner RSS is not supported for "
+ "non-tunnel flows");
+ return 0;
+}
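+
+/*
+ * Illustrative application-side sketch: an RSS action configuration that
+ * passes the checks above - default hash function, outer-level hashing,
+ * default key and IP/UDP hash types. The queue array is caller-provided
+ * and must reference configured Rx queues; all values are examples only.
+ */
+static __rte_unused struct rte_flow_action_rss
+flow_rss_action_usage_sketch(const uint16_t *queues, uint32_t queue_num)
+{
+	struct rte_flow_action_rss rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 1,		/* Outer headers only. */
+		.types = ETH_RSS_IP | ETH_RSS_UDP,
+		.key = NULL,		/* Use the default RSS key. */
+		.key_len = 0,
+		.queue = queues,
+		.queue_num = queue_num,
+	};
+
+	return rss;
+}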
+
+/*
+ * Validate the count action.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_action_count(struct rte_eth_dev *dev __rte_unused,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "count action not supported for "
+ "egress");
+ return 0;
+}
+
+/**
+ * Verify the @p attributes will be correctly understood by the NIC and store
+ * them in the @p flow if everything is correct.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] attributes
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attributes,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t priority_max = priv->config.flow_prio - 1;
+
+ if (attributes->group)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+ NULL, "groups is not supported");
+ if (attributes->priority != MLX5_FLOW_PRIO_RSVD &&
+ attributes->priority >= priority_max)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+ NULL, "priority out of range");
+ if (attributes->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL,
+ "egress is not supported");
+ if (attributes->transfer && !priv->config.dv_esw_en)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+ NULL, "transfer is not supported");
+ if (!attributes->ingress)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+ NULL,
+ "ingress attribute is mandatory");
+ return 0;
+}
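+
+/*
+ * Illustrative sketch: flow attributes accepted by the check above - group
+ * 0, a priority below the discovered maximum and the ingress direction
+ * only. The values are examples.
+ */
+static __rte_unused struct rte_flow_attr
+flow_attributes_usage_sketch(void)
+{
+	struct rte_flow_attr attr = {
+		.group = 0,	/* Non-zero groups are rejected here. */
+		.priority = 0,	/* Must stay below priv->config.flow_prio. */
+		.ingress = 1,	/* Ingress is mandatory, egress is rejected. */
+	};
+
+	return attr;
+}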
+
+/**
+ * Validate ICMP6 item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ *   Bit-fields that holds the items detected until now.
+ * @param[in] target_protocol
+ *   The next protocol in the previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_icmp6 *mask = item->mask;
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+
+ if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMPV6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with ICMP6 layer");
+ if (!(item_flags & l3m))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "IPv6 is mandatory to filter on"
+ " ICMP6");
+ if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L4 layers not supported");
+ if (!mask)
+ mask = &rte_flow_item_icmp6_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_icmp6_mask,
+ sizeof(struct rte_flow_item_icmp6), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate ICMP item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ *   Bit-fields that holds the items detected until now.
+ * @param[in] target_protocol
+ *   The next protocol in the previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_icmp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_icmp *mask = item->mask;
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+
+ if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with ICMP layer");
+ if (!(item_flags & l3m))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "IPv4 is mandatory to filter"
+ " on ICMP");
+ if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L4 layers not supported");
+ if (!mask)
+ mask = &rte_flow_item_icmp_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_icmp_mask,
+ sizeof(struct rte_flow_item_icmp), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate Ethernet item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_eth *mask = item->mask;
+ const struct rte_flow_item_eth nic_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .type = RTE_BE16(0xffff),
+ };
+ int ret;
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t ethm = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+
+ if (item_flags & ethm)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L2 layers not supported");
+ if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_L3)) ||
+ (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_L3)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L2 layer should not follow "
+ "L3 layers");
+ if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) ||
+ (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_VLAN)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L2 layer should not follow VLAN");
+ if (!mask)
+ mask = &rte_flow_item_eth_mask;
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_eth),
+ error);
+ return ret;
+}
+
+/**
+ * Validate VLAN item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ * Ethernet device flow is being created on.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_vlan *spec = item->spec;
+ const struct rte_flow_item_vlan *mask = item->mask;
+ const struct rte_flow_item_vlan nic_mask = {
+ .tci = RTE_BE16(UINT16_MAX),
+ .inner_type = RTE_BE16(UINT16_MAX),
+ };
+ uint16_t vlan_tag = 0;
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ int ret;
+ const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 |
+ MLX5_FLOW_LAYER_INNER_L4) :
+ (MLX5_FLOW_LAYER_OUTER_L3 |
+ MLX5_FLOW_LAYER_OUTER_L4);
+ const uint64_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
+ MLX5_FLOW_LAYER_OUTER_VLAN;
+
+ if (item_flags & vlanm)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple VLAN layers not supported");
+ else if ((item_flags & l34m) != 0)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "VLAN cannot follow L3/L4 layer");
+ if (!mask)
+ mask = &rte_flow_item_vlan_mask;
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_vlan),
+ error);
+ if (ret)
+ return ret;
+ if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->vmwa_context) {
+ /*
+ * Non-NULL context means we have a virtual machine
+ * and SR-IOV enabled, we have to create VLAN interface
+ * to make hypervisor to setup E-Switch vport
+ * context correctly. We avoid creating the multiple
+ * VLAN interfaces, so we cannot support VLAN tag mask.
+ */
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "VLAN tag mask is not"
+ " supported in virtual"
+ " environment");
+ }
+ }
+ if (spec) {
+ vlan_tag = spec->tci;
+ vlan_tag &= mask->tci;
+ }
+ /*
+ * From verbs perspective an empty VLAN is equivalent
+ * to a packet without VLAN layer.
+ */
+ if (!vlan_tag)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+ item->spec,
+ "VLAN cannot be empty");
+ return 0;
+}
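+
+/*
+ * Illustrative sketch: a VLAN item spec/mask pair that satisfies the check
+ * above - the VLAN ID bits are fully masked and the resulting TCI is not
+ * zero, so the rule does not degenerate into a "no VLAN" match. The VLAN
+ * ID 100 is an example value.
+ */
+static __rte_unused void
+flow_vlan_item_usage_sketch(struct rte_flow_item *item,
+			    struct rte_flow_item_vlan *spec,
+			    struct rte_flow_item_vlan *mask)
+{
+	spec->tci = RTE_BE16(100);	/* VLAN ID 100, PCP/DEI zero. */
+	mask->tci = RTE_BE16(0x0fff);	/* Match the VLAN ID bits only. */
+	item->type = RTE_FLOW_ITEM_TYPE_VLAN;
+	item->spec = spec;
+	item->last = NULL;
+	item->mask = mask;
+}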
+
+/**
+ * Validate IPV4 item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] acc_mask
+ *   Acceptable mask; if NULL, the default internal mask is used to check
+ *   whether the item fields are supported.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint64_t last_item,
+ uint16_t ether_type,
+ const struct rte_flow_item_ipv4 *acc_mask,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_ipv4 *mask = item->mask;
+ const struct rte_flow_item_ipv4 *spec = item->spec;
+ const struct rte_flow_item_ipv4 nic_mask = {
+ .hdr = {
+ .src_addr = RTE_BE32(0xffffffff),
+ .dst_addr = RTE_BE32(0xffffffff),
+ .type_of_service = 0xff,
+ .next_proto_id = 0xff,
+ },
+ };
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
+ MLX5_FLOW_LAYER_OUTER_L3;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+ uint8_t next_proto = 0xFF;
+ const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 |
+ MLX5_FLOW_LAYER_OUTER_VLAN |
+ MLX5_FLOW_LAYER_INNER_VLAN);
+
+ if ((last_item & l2_vlan) && ether_type &&
+ ether_type != RTE_ETHER_TYPE_IPV4)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "IPv4 cannot follow L2/VLAN layer "
+					  "whose ether type is not IPv4");
+ if (item_flags & MLX5_FLOW_LAYER_IPIP) {
+ if (mask && spec)
+ next_proto = mask->hdr.next_proto_id &
+ spec->hdr.next_proto_id;
+ if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "multiple tunnel "
+ "not supported");
+ }
+ if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "wrong tunnel type - IPv6 specified "
+ "but IPv4 item provided");
+ if (item_flags & l3m)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L3 layers not supported");
+ else if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 cannot follow an L4 layer.");
+ else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) &&
+ !(item_flags & MLX5_FLOW_LAYER_INNER_L2))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 cannot follow an NVGRE layer.");
+ if (!mask)
+ mask = &rte_flow_item_ipv4_mask;
+ else if (mask->hdr.next_proto_id != 0 &&
+ mask->hdr.next_proto_id != 0xff)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
+ "partial mask is not supported"
+ " for protocol");
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ acc_mask ? (const uint8_t *)acc_mask
+ : (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_ipv4),
+ error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate IPV6 item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] acc_mask
+ *   Acceptable mask; if NULL, the default internal mask is used to check
+ *   whether the item fields are supported.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint64_t last_item,
+ uint16_t ether_type,
+ const struct rte_flow_item_ipv6 *acc_mask,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_ipv6 *mask = item->mask;
+ const struct rte_flow_item_ipv6 *spec = item->spec;
+ const struct rte_flow_item_ipv6 nic_mask = {
+ .hdr = {
+ .src_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .dst_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .vtc_flow = RTE_BE32(0xffffffff),
+ .proto = 0xff,
+ },
+ };
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
+ MLX5_FLOW_LAYER_OUTER_L3;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+ uint8_t next_proto = 0xFF;
+ const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 |
+ MLX5_FLOW_LAYER_OUTER_VLAN |
+ MLX5_FLOW_LAYER_INNER_VLAN);
+
+ if ((last_item & l2_vlan) && ether_type &&
+ ether_type != RTE_ETHER_TYPE_IPV6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "IPv6 cannot follow L2/VLAN layer "
+					  "whose ether type is not IPv6");
+ if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP) {
+ if (mask && spec)
+ next_proto = mask->hdr.proto & spec->hdr.proto;
+ if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "multiple tunnel "
+ "not supported");
+ }
+ if (item_flags & MLX5_FLOW_LAYER_IPIP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "wrong tunnel type - IPv4 specified "
+ "but IPv6 item provided");
+ if (item_flags & l3m)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L3 layers not supported");
+ else if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 cannot follow an L4 layer.");
+ else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) &&
+ !(item_flags & MLX5_FLOW_LAYER_INNER_L2))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 cannot follow an NVGRE layer.");
+ if (!mask)
+ mask = &rte_flow_item_ipv6_mask;
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ acc_mask ? (const uint8_t *)acc_mask
+ : (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_ipv6),
+ error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate UDP item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] target_protocol
+ * The next protocol in the previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_udp *mask = item->mask;
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
+ MLX5_FLOW_LAYER_OUTER_L3;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+
+ if (target_protocol != 0xff && target_protocol != IPPROTO_UDP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with UDP layer");
+ if (!(item_flags & l3m))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 is mandatory to filter on L4");
+ if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L4 layers not supported");
+ if (!mask)
+ mask = &rte_flow_item_udp_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_udp_mask,
+ sizeof(struct rte_flow_item_udp), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate TCP item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] target_protocol
+ *   The next protocol in the previous item.
+ * @param[in] flow_mask
+ *   mlx5 flow-specific (DV, verbs, etc.) supported header fields mask.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_tcp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ const struct rte_flow_item_tcp *flow_mask,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_tcp *mask = item->mask;
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 :
+ MLX5_FLOW_LAYER_OUTER_L3;
+ const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ int ret;
+
+ MLX5_ASSERT(flow_mask);
+ if (target_protocol != 0xff && target_protocol != IPPROTO_TCP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with TCP layer");
+ if (!(item_flags & l3m))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 is mandatory to filter on L4");
+ if (item_flags & l4m)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple L4 layers not supported");
+ if (!mask)
+ mask = &rte_flow_item_tcp_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)flow_mask,
+ sizeof(struct rte_flow_item_tcp), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate VXLAN item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_vxlan *spec = item->spec;
+ const struct rte_flow_item_vxlan *mask = item->mask;
+ int ret;
+ union vni {
+ uint32_t vlan_id;
+ uint8_t vni[4];
+ } id = { .vlan_id = 0, };
+
+
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ /*
+ * Verify only UDPv4 is present as defined in
+ * https://tools.ietf.org/html/rfc7348
+ */
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "no outer UDP layer found");
+ if (!mask)
+ mask = &rte_flow_item_vxlan_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_vxlan_mask,
+ sizeof(struct rte_flow_item_vxlan),
+ error);
+ if (ret < 0)
+ return ret;
+ if (spec) {
+ memcpy(&id.vni[1], spec->vni, 3);
+ memcpy(&id.vni[1], mask->vni, 3);
+ }
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "VXLAN tunnel must be fully defined");
+ return 0;
+}
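+
+/*
+ * Illustrative sketch: the item sequence expected by the check above - a
+ * VXLAN item is accepted only when the outer L2/L3/L4 chain is present
+ * before it. Only the item types are shown; specs and masks are left
+ * empty for brevity.
+ */
+static __rte_unused void
+flow_vxlan_pattern_usage_sketch(struct rte_flow_item pattern[5])
+{
+	memset(pattern, 0, 5 * sizeof(pattern[0]));
+	pattern[0].type = RTE_FLOW_ITEM_TYPE_ETH;
+	pattern[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
+	pattern[2].type = RTE_FLOW_ITEM_TYPE_UDP;	/* Outer UDP. */
+	pattern[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
+	pattern[4].type = RTE_FLOW_ITEM_TYPE_END;
+}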
+
+/**
+ * Validate VXLAN_GPE item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_item_vxlan_gpe *spec = item->spec;
+ const struct rte_flow_item_vxlan_gpe *mask = item->mask;
+ int ret;
+ union vni {
+ uint32_t vlan_id;
+ uint8_t vni[4];
+ } id = { .vlan_id = 0, };
+
+ if (!priv->config.l3_vxlan_en)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 VXLAN is not enabled by device"
+ " parameter and/or not configured in"
+ " firmware");
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ /*
+ * Verify only UDPv4 is present as defined in
+ * https://tools.ietf.org/html/rfc7348
+ */
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "no outer UDP layer found");
+ if (!mask)
+ mask = &rte_flow_item_vxlan_gpe_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_vxlan_gpe_mask,
+ sizeof(struct rte_flow_item_vxlan_gpe),
+ error);
+ if (ret < 0)
+ return ret;
+ if (spec) {
+ if (spec->protocol)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "VxLAN-GPE protocol"
+ " not supported");
+ memcpy(&id.vni[1], spec->vni, 3);
+ memcpy(&id.vni[1], mask->vni, 3);
+ }
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "VXLAN-GPE tunnel must be fully"
+ " defined");
+ return 0;
+}
+
+/**
+ * Validate GRE Key item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit flags to mark detected items.
+ * @param[in] gre_item
+ * Pointer to gre_item
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ const struct rte_flow_item *gre_item,
+ struct rte_flow_error *error)
+{
+ const rte_be32_t *mask = item->mask;
+ int ret = 0;
+ rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX);
+ const struct rte_flow_item_gre *gre_spec;
+ const struct rte_flow_item_gre *gre_mask;
+
+ if (item_flags & MLX5_FLOW_LAYER_GRE_KEY)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+					  "Multiple GRE keys not supported");
+ if (!(item_flags & MLX5_FLOW_LAYER_GRE))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "No preceding GRE header");
+ if (item_flags & MLX5_FLOW_LAYER_INNER)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "GRE key following a wrong item");
+ gre_mask = gre_item->mask;
+ if (!gre_mask)
+ gre_mask = &rte_flow_item_gre_mask;
+ gre_spec = gre_item->spec;
+ if (gre_spec && (gre_mask->c_rsvd0_ver & RTE_BE16(0x2000)) &&
+ !(gre_spec->c_rsvd0_ver & RTE_BE16(0x2000)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "Key bit must be on");
+
+ if (!mask)
+ mask = &gre_key_default_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&gre_key_default_mask,
+ sizeof(rte_be32_t), error);
+ return ret;
+}
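+
+/*
+ * Illustrative sketch: a GRE spec/mask pair that satisfies the key-bit
+ * check above when a GRE_KEY item follows - the K bit (0x2000 in
+ * c_rsvd0_ver) is set in both the spec and the mask.
+ */
+static __rte_unused void
+flow_gre_key_usage_sketch(struct rte_flow_item_gre *spec,
+			  struct rte_flow_item_gre *mask)
+{
+	spec->c_rsvd0_ver = RTE_BE16(0x2000);	/* Key present (K) bit. */
+	mask->c_rsvd0_ver = RTE_BE16(0x2000);	/* Match on the K bit. */
+}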
+
+/**
+ * Validate GRE item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit flags to mark detected items.
+ * @param[in] target_protocol
+ * The next protocol in the previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_gre(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_gre *spec __rte_unused = item->spec;
+ const struct rte_flow_item_gre *mask = item->mask;
+ int ret;
+ const struct rte_flow_item_gre nic_mask = {
+ .c_rsvd0_ver = RTE_BE16(0xB000),
+ .protocol = RTE_BE16(UINT16_MAX),
+ };
+
+ if (target_protocol != 0xff && target_protocol != IPPROTO_GRE)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with this GRE layer");
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 Layer is missing");
+ if (!mask)
+ mask = &rte_flow_item_gre_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_gre), error);
+ if (ret < 0)
+ return ret;
+#ifndef HAVE_MLX5DV_DR
+#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ if (spec && (spec->protocol & mask->protocol))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "without MPLS support the"
+ " specification cannot be used for"
+ " filtering");
+#endif
+#endif
+ return 0;
+}
+
+/**
+ * Validate Geneve item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ *   Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+
+int
+mlx5_flow_validate_item_geneve(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_item_geneve *spec = item->spec;
+ const struct rte_flow_item_geneve *mask = item->mask;
+ int ret;
+ uint16_t gbhdr;
+ uint8_t opt_len = priv->config.hca_attr.geneve_max_opt_len ?
+ MLX5_GENEVE_OPT_LEN_1 : MLX5_GENEVE_OPT_LEN_0;
+ const struct rte_flow_item_geneve nic_mask = {
+ .ver_opt_len_o_c_rsvd0 = RTE_BE16(0x3f80),
+ .vni = "\xff\xff\xff",
+ .protocol = RTE_BE16(UINT16_MAX),
+ };
+
+ if (!priv->config.hca_attr.tunnel_stateless_geneve_rx)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 Geneve is not enabled by device"
+ " parameter and/or not configured in"
+ " firmware");
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ /*
+	 * Verify only UDPv4 is present as the outer transport,
+	 * Geneve is carried over UDP.
+ */
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "no outer UDP layer found");
+ if (!mask)
+ mask = &rte_flow_item_geneve_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_geneve), error);
+ if (ret)
+ return ret;
+ if (spec) {
+ gbhdr = rte_be_to_cpu_16(spec->ver_opt_len_o_c_rsvd0);
+ if (MLX5_GENEVE_VER_VAL(gbhdr) ||
+ MLX5_GENEVE_CRITO_VAL(gbhdr) ||
+ MLX5_GENEVE_RSVD_VAL(gbhdr) || spec->rsvd1)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "Geneve protocol unsupported"
+ " fields are being used");
+ if (MLX5_GENEVE_OPTLEN_VAL(gbhdr) > opt_len)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "Unsupported Geneve options length");
+ }
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER))
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "Geneve tunnel must be fully defined");
+ return 0;
+}
+
+/**
+ * Validate MPLS item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] prev_layer
+ * The protocol layer indicated in previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev __rte_unused,
+ const struct rte_flow_item *item __rte_unused,
+ uint64_t item_flags __rte_unused,
+ uint64_t prev_layer __rte_unused,
+ struct rte_flow_error *error)
+{
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ const struct rte_flow_item_mpls *mask = item->mask;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (!priv->config.mpls_en)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "MPLS not supported or"
+ " disabled in firmware"
+ " configuration.");
+ /* MPLS over IP, UDP, GRE is allowed */
+ if (!(prev_layer & (MLX5_FLOW_LAYER_OUTER_L3 |
+ MLX5_FLOW_LAYER_OUTER_L4_UDP |
+ MLX5_FLOW_LAYER_GRE)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with MPLS layer");
+ /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. */
+ if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
+ !(item_flags & MLX5_FLOW_LAYER_GRE))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ if (!mask)
+ mask = &rte_flow_item_mpls_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_mpls_mask,
+ sizeof(struct rte_flow_item_mpls), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+#endif
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "MPLS is not supported by Verbs, please"
+ " update.");
+}
+
+/**
+ * Validate NVGRE item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit flags to mark detected items.
+ * @param[in] target_protocol
+ * The next protocol in the previous item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_nvgre *mask = item->mask;
+ int ret;
+
+ if (target_protocol != 0xff && target_protocol != IPPROTO_GRE)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "protocol filtering not compatible"
+ " with this GRE layer");
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "L3 Layer is missing");
+ if (!mask)
+ mask = &rte_flow_item_nvgre_mask;
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_nvgre_mask,
+ sizeof(struct rte_flow_item_nvgre), error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/* Allocate unique ID for the split Q/RSS subflows. */
+static uint32_t
+flow_qrss_get_id(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t qrss_id, ret;
+
+ ret = mlx5_flow_id_get(priv->qrss_id_pool, &qrss_id);
+ if (ret)
+ return 0;
+ MLX5_ASSERT(qrss_id);
+ return qrss_id;
+}
+
+/* Free unique ID for the split Q/RSS subflows. */
+static void
+flow_qrss_free_id(struct rte_eth_dev *dev, uint32_t qrss_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (qrss_id)
+ mlx5_flow_id_release(priv->qrss_id_pool, qrss_id);
+}
+
+/**
+ * Release resources related to the QUEUE/RSS action split.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param flow
+ * Flow to release id's from.
+ */
+static void
+flow_mreg_split_qrss_release(struct rte_eth_dev *dev,
+ struct rte_flow *flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t handle_idx;
+ struct mlx5_flow_handle *dev_handle;
+
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ handle_idx, dev_handle, next)
+ if (dev_handle->split_flow_id)
+ flow_qrss_free_id(dev, dev_handle->split_flow_id);
+}
+
+static int
+flow_null_validate(struct rte_eth_dev *dev __rte_unused,
+ const struct rte_flow_attr *attr __rte_unused,
+ const struct rte_flow_item items[] __rte_unused,
+ const struct rte_flow_action actions[] __rte_unused,
+ bool external __rte_unused,
+ int hairpin __rte_unused,
+ struct rte_flow_error *error)
+{
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL);
+}
+
+static struct mlx5_flow *
+flow_null_prepare(struct rte_eth_dev *dev __rte_unused,
+ const struct rte_flow_attr *attr __rte_unused,
+ const struct rte_flow_item items[] __rte_unused,
+ const struct rte_flow_action actions[] __rte_unused,
+ struct rte_flow_error *error)
+{
+ rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL);
+ return NULL;
+}
+
+static int
+flow_null_translate(struct rte_eth_dev *dev __rte_unused,
+ struct mlx5_flow *dev_flow __rte_unused,
+ const struct rte_flow_attr *attr __rte_unused,
+ const struct rte_flow_item items[] __rte_unused,
+ const struct rte_flow_action actions[] __rte_unused,
+ struct rte_flow_error *error)
+{
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL);
+}
+
+static int
+flow_null_apply(struct rte_eth_dev *dev __rte_unused,
+ struct rte_flow *flow __rte_unused,
+ struct rte_flow_error *error)
+{
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL);
+}
+
+static void
+flow_null_remove(struct rte_eth_dev *dev __rte_unused,
+ struct rte_flow *flow __rte_unused)
+{
+}
+
+static void
+flow_null_destroy(struct rte_eth_dev *dev __rte_unused,
+ struct rte_flow *flow __rte_unused)
+{
+}
+
+static int
+flow_null_query(struct rte_eth_dev *dev __rte_unused,
+ struct rte_flow *flow __rte_unused,
+ const struct rte_flow_action *actions __rte_unused,
+ void *data __rte_unused,
+ struct rte_flow_error *error)
+{
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL);
+}
+
+/* Void driver to protect from null pointer reference. */
+const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops = {
+ .validate = flow_null_validate,
+ .prepare = flow_null_prepare,
+ .translate = flow_null_translate,
+ .apply = flow_null_apply,
+ .remove = flow_null_remove,
+ .destroy = flow_null_destroy,
+ .query = flow_null_query,
+};
+
+/**
+ * Select flow driver type according to flow attributes and device
+ * configuration.
+ *
+ * @param[in] dev
+ * Pointer to the dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ *
+ * @return
+ * flow driver type, MLX5_FLOW_TYPE_MAX otherwise.
+ */
+static enum mlx5_flow_drv_type
+flow_get_drv_type(struct rte_eth_dev *dev, const struct rte_flow_attr *attr)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX;
+
+ if (attr->transfer && priv->config.dv_esw_en)
+ type = MLX5_FLOW_TYPE_DV;
+ if (!attr->transfer)
+ type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV :
+ MLX5_FLOW_TYPE_VERBS;
+ return type;
+}
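+
+/*
+ * Illustrative sketch of the resulting selection: transfer (E-Switch) rules
+ * require the DV engine with dv_esw_en set, otherwise DV is chosen when
+ * dv_flow_en is set and the Verbs engine is the fallback.
+ */
+static __rte_unused const char *
+flow_drv_type_name_sketch(struct rte_eth_dev *dev,
+			  const struct rte_flow_attr *attr)
+{
+	switch (flow_get_drv_type(dev, attr)) {
+	case MLX5_FLOW_TYPE_DV:
+		return "dv";
+	case MLX5_FLOW_TYPE_VERBS:
+		return "verbs";
+	default:
+		return "none";
+	}
+}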
+
+#define flow_get_drv_ops(type) flow_drv_ops[type]
+
+/**
+ * Flow driver validation API. This abstracts calling driver specific functions.
+ * The type of flow driver is determined according to flow attributes.
+ *
+ * @param[in] dev
+ * Pointer to the dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[in] external
+ *   This flow rule is created by a request external to the PMD.
+ * @param[in] hairpin
+ * Number of hairpin TX actions, 0 means classic flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static inline int
+flow_drv_validate(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, int hairpin, struct rte_flow_error *error)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = flow_get_drv_type(dev, attr);
+
+ fops = flow_get_drv_ops(type);
+ return fops->validate(dev, attr, items, actions, external,
+ hairpin, error);
+}
+
+/**
+ * Flow driver preparation API. This abstracts calling driver specific
+ * functions. Parent flow (rte_flow) should have driver type (drv_type). It
+ * calculates the size of memory required for device flow, allocates the memory,
+ * initializes the device flow and returns the pointer.
+ *
+ * @note
+ * This function initializes the device flow structure such as dv or verbs in
+ * struct mlx5_flow. However, it is the caller's responsibility to initialize
+ * the rest. For example, adding the returned device flow to the
+ * flow->dev_flow list and setting the backward reference to the flow should
+ * be done outside of this function. The layers field is not filled either.
+ *
+ * @param[in] dev
+ * Pointer to the dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[in] flow_idx
+ * The memory pool index of the flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * Pointer to device flow on success, otherwise NULL and rte_errno is set.
+ */
+static inline struct mlx5_flow *
+flow_drv_prepare(struct rte_eth_dev *dev,
+ const struct rte_flow *flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ uint32_t flow_idx,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = flow->drv_type;
+ struct mlx5_flow *mlx5_flow = NULL;
+
+ MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(type);
+ mlx5_flow = fops->prepare(dev, attr, items, actions, error);
+ if (mlx5_flow)
+ mlx5_flow->flow_idx = flow_idx;
+ return mlx5_flow;
+}
+
+/**
+ * Flow driver translation API. This abstracts calling driver specific
+ * functions. Parent flow (rte_flow) should have driver type (drv_type). It
+ * translates a generic flow into a driver flow. flow_drv_prepare() must
+ * precede.
+ *
+ * @note
+ * dev_flow->layers could be filled as a result of parsing during translation
+ * if needed by flow_drv_apply(). dev_flow->flow->actions can also be filled
+ * if necessary. As a flow can have multiple dev_flows by RSS flow expansion,
+ * flow->actions could be overwritten even though all the expanded dev_flows
+ * have the same actions.
+ *
+ * @param[in] dev
+ * Pointer to the rte dev structure.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5 flow.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static inline int
+flow_drv_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = dev_flow->flow->drv_type;
+
+ MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(type);
+ return fops->translate(dev, dev_flow, attr, items, actions, error);
+}
+
+/**
+ * Flow driver apply API. This abstracts calling driver specific functions.
+ * Parent flow (rte_flow) should have driver type (drv_type). It applies
+ * translated driver flows on to device. flow_drv_translate() must precede.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static inline int
+flow_drv_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = flow->drv_type;
+
+ MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(type);
+ return fops->apply(dev, flow, error);
+}
+
+/**
+ * Flow driver remove API. This abstracts calling driver specific functions.
+ * Parent flow (rte_flow) should have driver type (drv_type). It removes a
+ * flow from the device. All the resources of the flow should be freed by
+ * calling flow_drv_destroy().
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static inline void
+flow_drv_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = flow->drv_type;
+
+ MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(type);
+ fops->remove(dev, flow);
+}
+
+/**
+ * Flow driver destroy API. This abstracts calling driver specific functions.
+ * Parent flow (rte_flow) should have driver type (drv_type). It removes a
+ * flow from the device and releases the resources of the flow.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static inline void
+flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ enum mlx5_flow_drv_type type = flow->drv_type;
+
+ flow_mreg_split_qrss_release(dev, flow);
+ MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(type);
+ fops->destroy(dev, flow);
+}
+
+/**
+ * Get RSS action from the action list.
+ *
+ * @param[in] actions
+ * Pointer to the list of actions.
+ *
+ * @return
+ * Pointer to the RSS action if it exists, NULL otherwise.
+ */
+static const struct rte_flow_action_rss*
+flow_get_rss_action(const struct rte_flow_action actions[])
+{
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ return (const struct rte_flow_action_rss *)
+ actions->conf;
+ default:
+ break;
+ }
+ }
+ return NULL;
+}
+
+static unsigned int
+find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level)
+{
+ const struct rte_flow_item *item;
+ unsigned int has_vlan = 0;
+
+ for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) {
+ if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) {
+ has_vlan = 1;
+ break;
+ }
+ }
+ if (has_vlan)
+ return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN :
+ MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN;
+ return rss_level < 2 ? MLX5_EXPANSION_ROOT :
+ MLX5_EXPANSION_ROOT_OUTER;
+}
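+
+/*
+ * Illustrative sketch (not part of the driver): root selection for RSS
+ * expansion. For outer RSS (rss_level < 2) a pattern containing a VLAN item
+ * starts from MLX5_EXPANSION_ROOT_ETH_VLAN, otherwise from
+ * MLX5_EXPANSION_ROOT; for inner RSS (rss_level >= 2) the corresponding
+ * MLX5_EXPANSION_ROOT_OUTER* roots are used instead.
+ */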
+
+/**
+ * Get layer flags from the prefix flow.
+ *
+ * Some flows may be split into several subflows; the prefix subflow gets the
+ * match items and the suffix subflow gets the actions.
+ * Some actions need the user-defined match item flags to get the details for
+ * the action.
+ * This function helps the suffix flow to get the item layer flags from the
+ * prefix subflow.
+ *
+ * @param[in] dev_flow
+ * Pointer to the created prefix subflow.
+ *
+ * @return
+ * The layers obtained from the prefix subflow.
+ */
+static inline uint64_t
+flow_get_prefix_layer_flags(struct mlx5_flow *dev_flow)
+{
+ uint64_t layers = 0;
+
+ /*
+ * Layers bits could be kept in a local variable, but usually the
+ * compiler will do that optimization on its own.
+ * If there are no decap actions, use the layers directly.
+ */
+ if (!(dev_flow->act_flags & MLX5_FLOW_ACTION_DECAP))
+ return dev_flow->handle->layers;
+ /* Convert L3 layers with decap action. */
+ if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV4)
+ layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV6)
+ layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ /* Convert L4 layers with decap action. */
+ if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_TCP)
+ layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_UDP)
+ layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
+ return layers;
+}
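+
+/*
+ * Illustrative sketch (not part of the driver): layer conversion performed
+ * above when the prefix subflow carries a decap action. Inner layers seen
+ * before decap become outer layers for the suffix subflow:
+ *
+ *   prefix handle->layers: MLX5_FLOW_LAYER_INNER_L3_IPV4 |
+ *                          MLX5_FLOW_LAYER_INNER_L4_UDP
+ *   returned layers:       MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
+ *                          MLX5_FLOW_LAYER_OUTER_L4_UDP
+ */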
+
+/**
+ * Get metadata split action information.
+ *
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] qrss
+ * Pointer to return the QUEUE/RSS action if one is found, left untouched
+ * otherwise.
+ * @param[out] encap_idx
+ * Pointer to the index of the encap action if it exists, otherwise the last
+ * action index.
+ *
+ * @return
+ * Total number of actions.
+ */
+static int
+flow_parse_metadata_split_actions_info(const struct rte_flow_action actions[],
+ const struct rte_flow_action **qrss,
+ int *encap_idx)
+{
+ const struct rte_flow_action_raw_encap *raw_encap;
+ int actions_n = 0;
+ int raw_decap_idx = -1;
+
+ *encap_idx = -1;
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ *encap_idx = actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ raw_decap_idx = actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ raw_encap = actions->conf;
+ if (raw_encap->size > MLX5_ENCAPSULATION_DECISION_SIZE)
+ *encap_idx = raw_decap_idx != -1 ?
+ raw_decap_idx : actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ *qrss = actions;
+ break;
+ default:
+ break;
+ }
+ actions_n++;
+ }
+ if (*encap_idx == -1)
+ *encap_idx = actions_n;
+ /* Count RTE_FLOW_ACTION_TYPE_END. */
+ return actions_n + 1;
+}
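+
+/*
+ * Illustrative sketch (not part of the driver): for the action list
+ * [RAW_DECAP, RAW_ENCAP (size > MLX5_ENCAPSULATION_DECISION_SIZE), RSS, END]
+ * the helper above sets *qrss to the RSS action, *encap_idx to 0 (the
+ * position of the matching RAW_DECAP) and returns 4 (three actions plus END).
+ */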
+
+/**
+ * Check meter action from the action list.
+ *
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] mtr
+ * Pointer to the meter exist flag.
+ *
+ * @return
+ * Total number of actions.
+ */
+static int
+flow_check_meter_action(const struct rte_flow_action actions[], uint32_t *mtr)
+{
+ int actions_n = 0;
+
+ MLX5_ASSERT(mtr);
+ *mtr = 0;
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_METER:
+ *mtr = 1;
+ break;
+ default:
+ break;
+ }
+ actions_n++;
+ }
+ /* Count RTE_FLOW_ACTION_TYPE_END. */
+ return actions_n + 1;
+}
+
+/**
+ * Check if the flow should be split due to hairpin.
+ * The reason for the split is that in current HW we can't
+ * support encap on Rx, so if a flow has encap we move it
+ * to Tx.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ *
+ * @return
+ * > 0 the number of actions and the flow should be split,
+ * 0 when no split required.
+ */
+static int
+flow_check_hairpin_split(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_action actions[])
+{
+ int queue_action = 0;
+ int action_n = 0;
+ int encap = 0;
+ const struct rte_flow_action_queue *queue;
+ const struct rte_flow_action_rss *rss;
+ const struct rte_flow_action_raw_encap *raw_encap;
+
+ if (!attr->ingress)
+ return 0;
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ queue = actions->conf;
+ if (queue == NULL)
+ return 0;
+ if (mlx5_rxq_get_type(dev, queue->index) !=
+ MLX5_RXQ_TYPE_HAIRPIN)
+ return 0;
+ queue_action = 1;
+ action_n++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ rss = actions->conf;
+ if (rss == NULL || rss->queue_num == 0)
+ return 0;
+ if (mlx5_rxq_get_type(dev, rss->queue[0]) !=
+ MLX5_RXQ_TYPE_HAIRPIN)
+ return 0;
+ queue_action = 1;
+ action_n++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ encap = 1;
+ action_n++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ raw_encap = actions->conf;
+ if (raw_encap->size >
+ (sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4)))
+ encap = 1;
+ action_n++;
+ break;
+ default:
+ action_n++;
+ break;
+ }
+ }
+ if (encap == 1 && queue_action)
+ return action_n;
+ return 0;
+}
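+
+/*
+ * Illustrative sketch (not part of the driver): an ingress rule whose
+ * terminal QUEUE targets a hairpin Rx queue and which also carries an
+ * encap action is reported for splitting:
+ *
+ *   actions = [VXLAN_ENCAP, QUEUE (hairpin queue), END]
+ *   flow_check_hairpin_split(dev, attr, actions) == 2
+ *
+ * The same rule targeting a regular Rx queue returns 0 (no split).
+ */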
+
+/* Declare flow create/destroy prototype in advance. */
+static uint32_t
+flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, struct rte_flow_error *error);
+
+static void
+flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+ uint32_t flow_idx);
+
+/**
+ * Add a flow of copying flow metadata registers in RX_CP_TBL.
+ *
+ * As mark_id is unique, if there's already a registered flow for the mark_id,
+ * return by increasing the reference counter of the resource. Otherwise, create
+ * the resource (mcp_res) and flow.
+ *
+ * Flow looks like,
+ * - If ingress port is ANY and reg_c[1] is mark_id,
+ * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL.
+ *
+ * For default flow (zero mark_id), flow is like,
+ * - If ingress port is ANY,
+ * reg_b := reg_c[0] and jump to RX_ACT_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mark_id
+ * ID of MARK action, zero means default flow for META.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * Associated resource on success, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_mreg_copy_resource *
+flow_mreg_add_copy_action(struct rte_eth_dev *dev, uint32_t mark_id,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_attr attr = {
+ .group = MLX5_FLOW_MREG_CP_TABLE_GROUP,
+ .ingress = 1,
+ };
+ struct mlx5_rte_flow_item_tag tag_spec = {
+ .data = mark_id,
+ };
+ struct rte_flow_item items[] = {
+ [1] = { .type = RTE_FLOW_ITEM_TYPE_END, },
+ };
+ struct rte_flow_action_mark ftag = {
+ .id = mark_id,
+ };
+ struct mlx5_flow_action_copy_mreg cp_mreg = {
+ .dst = REG_B,
+ .src = 0,
+ };
+ struct rte_flow_action_jump jump = {
+ .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP,
+ };
+ struct rte_flow_action actions[] = {
+ [3] = { .type = RTE_FLOW_ACTION_TYPE_END, },
+ };
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ uint32_t idx = 0;
+ int ret;
+
+ /* Fill the register fields in the flow. */
+ ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error);
+ if (ret < 0)
+ return NULL;
+ tag_spec.id = ret;
+ ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error);
+ if (ret < 0)
+ return NULL;
+ cp_mreg.src = ret;
+ /* Check if already registered. */
+ MLX5_ASSERT(priv->mreg_cp_tbl);
+ mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl, mark_id);
+ if (mcp_res) {
+ /* For non-default rule. */
+ if (mark_id != MLX5_DEFAULT_COPY_ID)
+ mcp_res->refcnt++;
+ MLX5_ASSERT(mark_id != MLX5_DEFAULT_COPY_ID ||
+ mcp_res->refcnt == 1);
+ return mcp_res;
+ }
+ /* Provide the full width of FLAG specific value. */
+ if (mark_id == (priv->sh->dv_regc0_mask & MLX5_FLOW_MARK_DEFAULT))
+ tag_spec.data = MLX5_FLOW_MARK_DEFAULT;
+ /* Build a new flow. */
+ if (mark_id != MLX5_DEFAULT_COPY_ID) {
+ items[0] = (struct rte_flow_item){
+ .type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_TAG,
+ .spec = &tag_spec,
+ };
+ items[1] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ };
+ actions[0] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_MARK,
+ .conf = &ftag,
+ };
+ actions[1] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+ .conf = &cp_mreg,
+ };
+ actions[2] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ .conf = &jump,
+ };
+ actions[3] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ };
+ } else {
+ /* Default rule, wildcard match. */
+ attr.priority = MLX5_FLOW_PRIO_RSVD;
+ items[0] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ };
+ actions[0] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+ .conf = &cp_mreg,
+ };
+ actions[1] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ .conf = &jump,
+ };
+ actions[2] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ };
+ }
+ /* Build a new entry. */
+ mcp_res = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MCP], &idx);
+ if (!mcp_res) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ mcp_res->idx = idx;
+ /*
+ * The copy Flows are not included in any list. These
+ * ones are referenced from other Flows and can not
+ * be applied, removed or deleted in arbitrary order
+ * by list traversing.
+ */
+ mcp_res->rix_flow = flow_list_create(dev, NULL, &attr, items,
+ actions, false, error);
+ if (!mcp_res->rix_flow)
+ goto error;
+ mcp_res->refcnt++;
+ mcp_res->hlist_ent.key = mark_id;
+ ret = mlx5_hlist_insert(priv->mreg_cp_tbl,
+ &mcp_res->hlist_ent);
+ MLX5_ASSERT(!ret);
+ if (ret)
+ goto error;
+ return mcp_res;
+error:
+ if (mcp_res->rix_flow)
+ flow_list_destroy(dev, NULL, mcp_res->rix_flow);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx);
+ return NULL;
+}
+
+/**
+ * Release flow in RX_CP_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param flow
+ * Parent flow for which copying is provided.
+ */
+static void
+flow_mreg_del_copy_action(struct rte_eth_dev *dev,
+ struct rte_flow *flow)
+{
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!flow->rix_mreg_copy)
+ return;
+ mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP],
+ flow->rix_mreg_copy);
+ if (!mcp_res || !priv->mreg_cp_tbl)
+ return;
+ if (flow->copy_applied) {
+ MLX5_ASSERT(mcp_res->appcnt);
+ flow->copy_applied = 0;
+ --mcp_res->appcnt;
+ if (!mcp_res->appcnt) {
+ struct rte_flow *mcp_flow = mlx5_ipool_get
+ (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
+ mcp_res->rix_flow);
+
+ if (mcp_flow)
+ flow_drv_remove(dev, mcp_flow);
+ }
+ }
+ /*
+ * We do not check availability of metadata registers here,
+ * because copy resources are not allocated in this case.
+ */
+ if (--mcp_res->refcnt)
+ return;
+ MLX5_ASSERT(mcp_res->rix_flow);
+ flow_list_destroy(dev, NULL, mcp_res->rix_flow);
+ mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx);
+ flow->rix_mreg_copy = 0;
+}
+
+/**
+ * Start flow in RX_CP_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param flow
+ * Parent flow for which copying is provided.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_mreg_start_copy_action(struct rte_eth_dev *dev,
+ struct rte_flow *flow)
+{
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (!flow->rix_mreg_copy || flow->copy_applied)
+ return 0;
+ mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP],
+ flow->rix_mreg_copy);
+ if (!mcp_res)
+ return 0;
+ if (!mcp_res->appcnt) {
+ struct rte_flow *mcp_flow = mlx5_ipool_get
+ (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
+ mcp_res->rix_flow);
+
+ if (mcp_flow) {
+ ret = flow_drv_apply(dev, mcp_flow, NULL);
+ if (ret)
+ return ret;
+ }
+ }
+ ++mcp_res->appcnt;
+ flow->copy_applied = 1;
+ return 0;
+}
+
+/**
+ * Stop flow in RX_CP_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param flow
+ * Parent flow for which copying is provided.
+ */
+static void
+flow_mreg_stop_copy_action(struct rte_eth_dev *dev,
+ struct rte_flow *flow)
+{
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!flow->rix_mreg_copy || !flow->copy_applied)
+ return;
+ mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP],
+ flow->rix_mreg_copy);
+ if (!mcp_res)
+ return;
+ MLX5_ASSERT(mcp_res->appcnt);
+ --mcp_res->appcnt;
+ flow->copy_applied = 0;
+ if (!mcp_res->appcnt) {
+ struct rte_flow *mcp_flow = mlx5_ipool_get
+ (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
+ mcp_res->rix_flow);
+
+ if (mcp_flow)
+ flow_drv_remove(dev, mcp_flow);
+ }
+}
+
+/**
+ * Remove the default copy action from RX_CP_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+flow_mreg_del_default_copy_action(struct rte_eth_dev *dev)
+{
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ /* Check if default flow is registered. */
+ if (!priv->mreg_cp_tbl)
+ return;
+ mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl,
+ MLX5_DEFAULT_COPY_ID);
+ if (!mcp_res)
+ return;
+ MLX5_ASSERT(mcp_res->rix_flow);
+ flow_list_destroy(dev, NULL, mcp_res->rix_flow);
+ mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx);
+}
+
+/**
+ * Add the default copy action in RX_CP_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 for success, negative value otherwise and rte_errno is set.
+ */
+static int
+flow_mreg_add_default_copy_action(struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+
+ /* Check whether extensive metadata feature is engaged. */
+ if (!priv->config.dv_flow_en ||
+ priv->config.dv_xmeta_en == MLX5_XMETA_MODE_LEGACY ||
+ !mlx5_flow_ext_mreg_supported(dev) ||
+ !priv->sh->dv_regc0_mask)
+ return 0;
+ mcp_res = flow_mreg_add_copy_action(dev, MLX5_DEFAULT_COPY_ID, error);
+ if (!mcp_res)
+ return -rte_errno;
+ return 0;
+}
+
+/**
+ * Add a flow of copying flow metadata registers in RX_CP_TBL.
+ *
+ * All the flows having a Q/RSS action should be split by
+ * flow_mreg_split_qrss_prep() to pass through RX_CP_TBL. A flow in the RX_CP_TBL
+ * performs the following,
+ * - CQE->flow_tag := reg_c[1] (MARK)
+ * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META)
+ * As CQE's flow_tag is not a register, it can't be simply copied from reg_c[1]
+ * but there should be a flow for each MARK ID set by the MARK action.
+ *
+ * For the aforementioned reason, if there's a MARK action in flow's action
+ * list, a corresponding flow should be added to the RX_CP_TBL in order to copy
+ * the MARK ID to CQE's flow_tag like,
+ * - If reg_c[1] is mark_id,
+ * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL.
+ *
+ * For SET_META action which stores value in reg_c[0], as the destination is
+ * also a flow metadata register (reg_b), adding a default flow is enough. Zero
+ * MARK ID means the default flow. The default flow looks like,
+ * - For all flow, reg_b := reg_c[0] and jump to RX_ACT_TBL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param flow
+ * Pointer to flow structure.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, negative value otherwise and rte_errno is set.
+ */
+static int
+flow_mreg_update_copy_table(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_action *actions,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ struct mlx5_flow_mreg_copy_resource *mcp_res;
+ const struct rte_flow_action_mark *mark;
+
+ /* Check whether extensive metadata feature is engaged. */
+ if (!config->dv_flow_en ||
+ config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY ||
+ !mlx5_flow_ext_mreg_supported(dev) ||
+ !priv->sh->dv_regc0_mask)
+ return 0;
+ /* Find MARK action. */
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ mcp_res = flow_mreg_add_copy_action
+ (dev, MLX5_FLOW_MARK_DEFAULT, error);
+ if (!mcp_res)
+ return -rte_errno;
+ flow->rix_mreg_copy = mcp_res->idx;
+ if (dev->data->dev_started) {
+ mcp_res->appcnt++;
+ flow->copy_applied = 1;
+ }
+ return 0;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ mark = (const struct rte_flow_action_mark *)
+ actions->conf;
+ mcp_res =
+ flow_mreg_add_copy_action(dev, mark->id, error);
+ if (!mcp_res)
+ return -rte_errno;
+ flow->rix_mreg_copy = mcp_res->idx;
+ if (dev->data->dev_started) {
+ mcp_res->appcnt++;
+ flow->copy_applied = 1;
+ }
+ return 0;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+#define MLX5_MAX_SPLIT_ACTIONS 24
+#define MLX5_MAX_SPLIT_ITEMS 24
+
+/**
+ * Split the hairpin flow.
+ * Since HW can't support encap on Rx we move the encap to Tx.
+ * If the count action is after the encap then we also
+ * move the count action. In this case the count will also measure
+ * the outer bytes.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[out] actions_rx
+ * Rx flow actions.
+ * @param[out] actions_tx
+ * Tx flow actions.
+ * @param[out] pattern_tx
+ * The pattern items for the Tx flow.
+ * @param[out] flow_id
+ * The flow ID connected to this flow.
+ *
+ * @return
+ * 0 on success.
+ */
+static int
+flow_hairpin_split(struct rte_eth_dev *dev,
+ const struct rte_flow_action actions[],
+ struct rte_flow_action actions_rx[],
+ struct rte_flow_action actions_tx[],
+ struct rte_flow_item pattern_tx[],
+ uint32_t *flow_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_action_raw_encap *raw_encap;
+ const struct rte_flow_action_raw_decap *raw_decap;
+ struct mlx5_rte_flow_action_set_tag *set_tag;
+ struct rte_flow_action *tag_action;
+ struct mlx5_rte_flow_item_tag *tag_item;
+ struct rte_flow_item *item;
+ char *addr;
+ int encap = 0;
+
+ mlx5_flow_id_get(priv->sh->flow_id_pool, flow_id);
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ rte_memcpy(actions_tx, actions,
+ sizeof(struct rte_flow_action));
+ actions_tx++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ if (encap) {
+ rte_memcpy(actions_tx, actions,
+ sizeof(struct rte_flow_action));
+ actions_tx++;
+ } else {
+ rte_memcpy(actions_rx, actions,
+ sizeof(struct rte_flow_action));
+ actions_rx++;
+ }
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ raw_encap = actions->conf;
+ if (raw_encap->size >
+ (sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4))) {
+ memcpy(actions_tx, actions,
+ sizeof(struct rte_flow_action));
+ actions_tx++;
+ encap = 1;
+ } else {
+ rte_memcpy(actions_rx, actions,
+ sizeof(struct rte_flow_action));
+ actions_rx++;
+ }
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ raw_decap = actions->conf;
+ if (raw_decap->size <
+ (sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4))) {
+ memcpy(actions_tx, actions,
+ sizeof(struct rte_flow_action));
+ actions_tx++;
+ } else {
+ rte_memcpy(actions_rx, actions,
+ sizeof(struct rte_flow_action));
+ actions_rx++;
+ }
+ break;
+ default:
+ rte_memcpy(actions_rx, actions,
+ sizeof(struct rte_flow_action));
+ actions_rx++;
+ break;
+ }
+ }
+ /* Add set meta action and end action for the Rx flow. */
+ tag_action = actions_rx;
+ tag_action->type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_TAG;
+ actions_rx++;
+ rte_memcpy(actions_rx, actions, sizeof(struct rte_flow_action));
+ actions_rx++;
+ set_tag = (void *)actions_rx;
+ set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_RX, 0, NULL);
+ MLX5_ASSERT(set_tag->id > REG_NONE);
+ set_tag->data = *flow_id;
+ tag_action->conf = set_tag;
+ /* Create Tx item list. */
+ rte_memcpy(actions_tx, actions, sizeof(struct rte_flow_action));
+ addr = (void *)&pattern_tx[2];
+ item = pattern_tx;
+ item->type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_TAG;
+ tag_item = (void *)addr;
+ tag_item->data = *flow_id;
+ tag_item->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_TX, 0, NULL);
+ MLX5_ASSERT(set_tag->id > REG_NONE);
+ item->spec = tag_item;
+ addr += sizeof(struct mlx5_rte_flow_item_tag);
+ tag_item = (void *)addr;
+ tag_item->data = UINT32_MAX;
+ tag_item->id = UINT16_MAX;
+ item->mask = tag_item;
+ addr += sizeof(struct mlx5_rte_flow_item_tag);
+ item->last = NULL;
+ item++;
+ item->type = RTE_FLOW_ITEM_TYPE_END;
+ return 0;
+}
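+
+/*
+ * Illustrative sketch (not part of the driver): the shape of the split for
+ * a hairpin rule with actions [RAW_ENCAP (large), COUNT, QUEUE, END]:
+ *
+ *   actions_tx: [RAW_ENCAP, COUNT, END]
+ *   actions_rx: [QUEUE, internal TAG action (reg := flow_id), END]
+ *   pattern_tx: [internal TAG item (reg == flow_id), END]
+ *
+ * The internal TAG item/action lets the Tx subflow match only the packets
+ * produced by the Rx subflow of the same hairpin rule.
+ */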
+
+/**
+ * The last stage of splitting chain, just creates the subflow
+ * without any modification.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Parent flow structure pointer.
+ * @param[in, out] sub_flow
+ * Pointer to return the created subflow, may be NULL.
+ * @param[in] prefix_layers
+ * Prefix subflow layers, may be 0.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[in] flow_idx
+ * The memory pool index of the flow.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ * @return
+ * 0 on success, negative value otherwise
+ */
+static int
+flow_create_split_inner(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ struct mlx5_flow **sub_flow,
+ uint64_t prefix_layers,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, uint32_t flow_idx,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow *dev_flow;
+
+ dev_flow = flow_drv_prepare(dev, flow, attr, items, actions,
+ flow_idx, error);
+ if (!dev_flow)
+ return -rte_errno;
+ dev_flow->flow = flow;
+ dev_flow->external = external;
+ /* Subflow object was created, we must include one in the list. */
+ SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx,
+ dev_flow->handle, next);
+ /*
+ * If dev_flow is as one of the suffix flow, some actions in suffix
+ * flow may need some user defined item layer flags.
+ */
+ if (prefix_layers)
+ dev_flow->handle->layers = prefix_layers;
+ if (sub_flow)
+ *sub_flow = dev_flow;
+ return flow_drv_translate(dev, dev_flow, attr, items, actions, error);
+}
+
+/**
+ * Split the meter flow.
+ *
+ * As the meter flow will be split into three subflows, the actions other
+ * than the meter action only make sense if the meter accepts the packet.
+ * If the packet needs to be dropped, no additional actions should be
+ * taken.
+ *
+ * One kind of special action which decapsulates the L3 tunnel
+ * header will be in the prefix subflow, so as not to take the
+ * L3 tunnel header into account.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[out] sfx_items
+ * Suffix flow match items (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[out] actions_sfx
+ * Suffix flow actions.
+ * @param[out] actions_pre
+ * Prefix flow actions.
+ *
+ * @return
+ * The flow tag ID used to match the suffix subflow, 0 on failure.
+ */
+static int
+flow_meter_split_prep(struct rte_eth_dev *dev,
+ const struct rte_flow_item items[],
+ struct rte_flow_item sfx_items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_action actions_sfx[],
+ struct rte_flow_action actions_pre[])
+{
+ struct rte_flow_action *tag_action = NULL;
+ struct rte_flow_item *tag_item;
+ struct mlx5_rte_flow_action_set_tag *set_tag;
+ struct rte_flow_error error;
+ const struct rte_flow_action_raw_encap *raw_encap;
+ const struct rte_flow_action_raw_decap *raw_decap;
+ struct mlx5_rte_flow_item_tag *tag_spec;
+ struct mlx5_rte_flow_item_tag *tag_mask;
+ uint32_t tag_id;
+ bool copy_vlan = false;
+
+ /* Prepare the actions for prefix and suffix flow. */
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ struct rte_flow_action **action_cur = NULL;
+
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_METER:
+ /* Add the extra tag action first. */
+ tag_action = actions_pre;
+ tag_action->type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_TAG;
+ actions_pre++;
+ action_cur = &actions_pre;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP:
+ action_cur = &actions_pre;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ raw_encap = actions->conf;
+ if (raw_encap->size < MLX5_ENCAPSULATION_DECISION_SIZE)
+ action_cur = &actions_pre;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ raw_decap = actions->conf;
+ if (raw_decap->size > MLX5_ENCAPSULATION_DECISION_SIZE)
+ action_cur = &actions_pre;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+ copy_vlan = true;
+ break;
+ default:
+ break;
+ }
+ if (!action_cur)
+ action_cur = &actions_sfx;
+ memcpy(*action_cur, actions, sizeof(struct rte_flow_action));
+ (*action_cur)++;
+ }
+ /* Add end action to the actions. */
+ actions_sfx->type = RTE_FLOW_ACTION_TYPE_END;
+ actions_pre->type = RTE_FLOW_ACTION_TYPE_END;
+ actions_pre++;
+ /* Set the tag. */
+ set_tag = (void *)actions_pre;
+ set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error);
+ /*
+ * Get the id from the qrss_pool to make qrss share the id with meter.
+ */
+ tag_id = flow_qrss_get_id(dev);
+ set_tag->data = tag_id << MLX5_MTR_COLOR_BITS;
+ assert(tag_action);
+ tag_action->conf = set_tag;
+ /* Prepare the suffix subflow items. */
+ tag_item = sfx_items++;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int item_type = items->type;
+
+ switch (item_type) {
+ case RTE_FLOW_ITEM_TYPE_PORT_ID:
+ memcpy(sfx_items, items, sizeof(*sfx_items));
+ sfx_items++;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ if (copy_vlan) {
+ memcpy(sfx_items, items, sizeof(*sfx_items));
+ /*
+ * Convert to internal match item, it is used
+ * for vlan push and set vid.
+ */
+ sfx_items->type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_VLAN;
+ sfx_items++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ sfx_items->type = RTE_FLOW_ITEM_TYPE_END;
+ sfx_items++;
+ tag_spec = (struct mlx5_rte_flow_item_tag *)sfx_items;
+ tag_spec->data = tag_id << MLX5_MTR_COLOR_BITS;
+ tag_spec->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error);
+ tag_mask = tag_spec + 1;
+ tag_mask->data = 0xffffff00;
+ tag_item->type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_TAG;
+ tag_item->spec = tag_spec;
+ tag_item->last = NULL;
+ tag_item->mask = tag_mask;
+ return tag_id;
+}
+
+/**
+ * Split action list having QUEUE/RSS for metadata register copy.
+ *
+ * Once Q/RSS action is detected in user's action list, the flow action
+ * should be split in order to copy metadata registers, which will happen in
+ * RX_CP_TBL like,
+ * - CQE->flow_tag := reg_c[1] (MARK)
+ * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META)
+ * The Q/RSS action will be performed on RX_ACT_TBL after passing by RX_CP_TBL.
+ * This is because the last action of each flow must be a terminal action
+ * (QUEUE, RSS or DROP).
+ *
+ * Flow ID must be allocated to identify actions in the RX_ACT_TBL and it is
+ * stored and kept in the mlx5_flow structure per each sub_flow.
+ *
+ * The Q/RSS action is replaced with,
+ * - SET_TAG, setting the allocated flow ID to reg_c[2].
+ * And the following JUMP action is added at the end,
+ * - JUMP, to RX_CP_TBL.
+ *
+ * A flow to perform remained Q/RSS action will be created in RX_ACT_TBL by
+ * flow_create_split_metadata() routine. The flow will look like,
+ * - If flow ID matches (reg_c[2]), perform Q/RSS.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] split_actions
+ * Pointer to store split actions to jump to CP_TBL.
+ * @param[in] actions
+ * Pointer to the list of original flow actions.
+ * @param[in] qrss
+ * Pointer to the Q/RSS action.
+ * @param[in] actions_n
+ * Number of original actions.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * non-zero unique flow_id on success, otherwise 0 and
+ * error/rte_error are set.
+ */
+static uint32_t
+flow_mreg_split_qrss_prep(struct rte_eth_dev *dev,
+ struct rte_flow_action *split_actions,
+ const struct rte_flow_action *actions,
+ const struct rte_flow_action *qrss,
+ int actions_n, struct rte_flow_error *error)
+{
+ struct mlx5_rte_flow_action_set_tag *set_tag;
+ struct rte_flow_action_jump *jump;
+ const int qrss_idx = qrss - actions;
+ uint32_t flow_id = 0;
+ int ret = 0;
+
+ /*
+ * Given actions will be split
+ * - Replace QUEUE/RSS action with SET_TAG to set flow ID.
+ * - Add jump to mreg CP_TBL.
+ * As a result, there will be one more action.
+ */
+ ++actions_n;
+ memcpy(split_actions, actions, sizeof(*split_actions) * actions_n);
+ set_tag = (void *)(split_actions + actions_n);
+ /*
+ * If the tag action is not set to void (meaning this is not the meter
+ * suffix flow), add the tag action, since the meter suffix flow already
+ * has the tag added.
+ */
+ if (split_actions[qrss_idx].type != RTE_FLOW_ACTION_TYPE_VOID) {
+ /*
+ * Allocate the new subflow ID. This one is unique within
+ * device and not shared with representors. Otherwise,
+ * we would have to resolve multi-thread access synch
+ * issue. Each flow on the shared device is appended
+ * with source vport identifier, so the resulting
+ * flows will be unique in the shared (by master and
+ * representors) domain even if they have coinciding
+ * IDs.
+ */
+ flow_id = flow_qrss_get_id(dev);
+ if (!flow_id)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "can't allocate id "
+ "for split Q/RSS subflow");
+ /* Internal SET_TAG action to set flow ID. */
+ *set_tag = (struct mlx5_rte_flow_action_set_tag){
+ .data = flow_id,
+ };
+ ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0, error);
+ if (ret < 0)
+ return ret;
+ set_tag->id = ret;
+ /* Construct new actions array. */
+ /* Replace QUEUE/RSS action. */
+ split_actions[qrss_idx] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_TAG,
+ .conf = set_tag,
+ };
+ }
+ /* JUMP action to jump to mreg copy table (CP_TBL). */
+ jump = (void *)(set_tag + 1);
+ *jump = (struct rte_flow_action_jump){
+ .group = MLX5_FLOW_MREG_CP_TABLE_GROUP,
+ };
+ split_actions[actions_n - 2] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ .conf = jump,
+ };
+ split_actions[actions_n - 1] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ };
+ return flow_id;
+}
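+
+/*
+ * Illustrative sketch (not part of the driver): given the original action
+ * list [MARK, RSS, END] (actions_n = 3), the split actions built above for
+ * a non-meter-suffix flow look like:
+ *
+ *   [MARK, internal TAG (reg_c[2] := flow_id), JUMP (to RX_CP_TBL), END]
+ *
+ * The Q/RSS action itself is re-created later in RX_ACT_TBL by
+ * flow_create_split_metadata(), matching on the same flow_id.
+ */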
+
+/**
+ * Extend the given action list for Tx metadata copy.
+ *
+ * Copy the given action list to the ext_actions and add flow metadata register
+ * copy action in order to copy reg_a set by WQE to reg_c[0].
+ *
+ * @param[out] ext_actions
+ * Pointer to the extended action list.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[in] actions_n
+ * Number of actions in the list.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ * @param[in] encap_idx
+ * The encap action index.
+ *
+ * @return
+ * 0 on success, negative value otherwise
+ */
+static int
+flow_mreg_tx_copy_prep(struct rte_eth_dev *dev,
+ struct rte_flow_action *ext_actions,
+ const struct rte_flow_action *actions,
+ int actions_n, struct rte_flow_error *error,
+ int encap_idx)
+{
+ struct mlx5_flow_action_copy_mreg *cp_mreg =
+ (struct mlx5_flow_action_copy_mreg *)
+ (ext_actions + actions_n + 1);
+ int ret;
+
+ ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error);
+ if (ret < 0)
+ return ret;
+ cp_mreg->dst = ret;
+ ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_TX, 0, error);
+ if (ret < 0)
+ return ret;
+ cp_mreg->src = ret;
+ if (encap_idx != 0)
+ memcpy(ext_actions, actions, sizeof(*ext_actions) * encap_idx);
+ if (encap_idx == actions_n - 1) {
+ ext_actions[actions_n - 1] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+ .conf = cp_mreg,
+ };
+ ext_actions[actions_n] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ };
+ } else {
+ ext_actions[encap_idx] = (struct rte_flow_action){
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+ .conf = cp_mreg,
+ };
+ memcpy(ext_actions + encap_idx + 1, actions + encap_idx,
+ sizeof(*ext_actions) * (actions_n - encap_idx));
+ }
+ return 0;
+}
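+
+/*
+ * Illustrative sketch (not part of the driver): for an egress rule with
+ * actions [COUNT, END] (actions_n = 2, encap_idx = 1) the extended list
+ * built above becomes:
+ *
+ *   [COUNT, COPY_MREG (reg_c[0] := reg_a), END]
+ *
+ * When an encap action is present before the end, COPY_MREG is inserted at
+ * encap_idx instead and the remaining actions are shifted by one.
+ */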
+
+/**
+ * The splitting for metadata feature.
+ *
+ * - Q/RSS action on NIC Rx should be split in order to pass by
+ * the mreg copy table (RX_CP_TBL) and then it jumps to the
+ * action table (RX_ACT_TBL) which has the split Q/RSS action.
+ *
+ * - All the actions on NIC Tx should have a mreg copy action to
+ * copy reg_a from WQE to reg_c[0].
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Parent flow structure pointer.
+ * @param[in] prefix_layers
+ * Prefix flow layer flags.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[in] flow_idx
+ * The memory pool index of the flow.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ * @return
+ * 0 on success, negative value otherwise
+ */
+static int
+flow_create_split_metadata(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ uint64_t prefix_layers,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, uint32_t flow_idx,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ const struct rte_flow_action *qrss = NULL;
+ struct rte_flow_action *ext_actions = NULL;
+ struct mlx5_flow *dev_flow = NULL;
+ uint32_t qrss_id = 0;
+ int mtr_sfx = 0;
+ size_t act_size;
+ int actions_n;
+ int encap_idx;
+ int ret;
+
+ /* Check whether extensive metadata feature is engaged. */
+ if (!config->dv_flow_en ||
+ config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY ||
+ !mlx5_flow_ext_mreg_supported(dev))
+ return flow_create_split_inner(dev, flow, NULL, prefix_layers,
+ attr, items, actions, external,
+ flow_idx, error);
+ actions_n = flow_parse_metadata_split_actions_info(actions, &qrss,
+ &encap_idx);
+ if (qrss) {
+ /* Exclude hairpin flows from splitting. */
+ if (qrss->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
+ const struct rte_flow_action_queue *queue;
+
+ queue = qrss->conf;
+ if (mlx5_rxq_get_type(dev, queue->index) ==
+ MLX5_RXQ_TYPE_HAIRPIN)
+ qrss = NULL;
+ } else if (qrss->type == RTE_FLOW_ACTION_TYPE_RSS) {
+ const struct rte_flow_action_rss *rss;
+
+ rss = qrss->conf;
+ if (mlx5_rxq_get_type(dev, rss->queue[0]) ==
+ MLX5_RXQ_TYPE_HAIRPIN)
+ qrss = NULL;
+ }
+ }
+ if (qrss) {
+ /* Check if it is in meter suffix table. */
+ mtr_sfx = attr->group == (attr->transfer ?
+ (MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) :
+ MLX5_FLOW_TABLE_LEVEL_SUFFIX);
+ /*
+ * Q/RSS action on NIC Rx should be split in order to pass by
+ * the mreg copy table (RX_CP_TBL) and then it jumps to the
+ * action table (RX_ACT_TBL) which has the split Q/RSS action.
+ */
+ act_size = sizeof(struct rte_flow_action) * (actions_n + 1) +
+ sizeof(struct rte_flow_action_set_tag) +
+ sizeof(struct rte_flow_action_jump);
+ ext_actions = rte_zmalloc(__func__, act_size, 0);
+ if (!ext_actions)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no memory to split "
+ "metadata flow");
+ /*
+ * If we are the suffix flow of meter, tag already exist.
+ * Set the tag action to void.
+ */
+ if (mtr_sfx)
+ ext_actions[qrss - actions].type =
+ RTE_FLOW_ACTION_TYPE_VOID;
+ else
+ ext_actions[qrss - actions].type =
+ (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_TAG;
+ /*
+ * Create the new actions list with removed Q/RSS action
+ * and appended set tag and jump to register copy table
+ * (RX_CP_TBL). We should preallocate unique tag ID here
+ * in advance, because it is needed for set tag action.
+ */
+ qrss_id = flow_mreg_split_qrss_prep(dev, ext_actions, actions,
+ qrss, actions_n, error);
+ if (!mtr_sfx && !qrss_id) {
+ ret = -rte_errno;
+ goto exit;
+ }
+ } else if (attr->egress && !attr->transfer) {
+ /*
+ * All the actions on NIC Tx should have a metadata register
+ * copy action to copy reg_a from WQE to reg_c[meta]
+ */
+ act_size = sizeof(struct rte_flow_action) * (actions_n + 1) +
+ sizeof(struct mlx5_flow_action_copy_mreg);
+ ext_actions = rte_zmalloc(__func__, act_size, 0);
+ if (!ext_actions)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no memory to split "
+ "metadata flow");
+ /* Create the action list appended with copy register. */
+ ret = flow_mreg_tx_copy_prep(dev, ext_actions, actions,
+ actions_n, error, encap_idx);
+ if (ret < 0)
+ goto exit;
+ }
+ /* Add the unmodified original or prefix subflow. */
+ ret = flow_create_split_inner(dev, flow, &dev_flow, prefix_layers, attr,
+ items, ext_actions ? ext_actions :
+ actions, external, flow_idx, error);
+ if (ret < 0)
+ goto exit;
+ MLX5_ASSERT(dev_flow);
+ if (qrss) {
+ const struct rte_flow_attr q_attr = {
+ .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP,
+ .ingress = 1,
+ };
+ /* Internal PMD action to set register. */
+ struct mlx5_rte_flow_item_tag q_tag_spec = {
+ .data = qrss_id,
+ .id = 0,
+ };
+ struct rte_flow_item q_items[] = {
+ {
+ .type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_TAG,
+ .spec = &q_tag_spec,
+ .last = NULL,
+ .mask = NULL,
+ },
+ {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ };
+ struct rte_flow_action q_actions[] = {
+ {
+ .type = qrss->type,
+ .conf = qrss->conf,
+ },
+ {
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ },
+ };
+ uint64_t layers = flow_get_prefix_layer_flags(dev_flow);
+
+ /*
+ * Configure the tag item only if there is no meter subflow.
+ * Since tag is already marked in the meter suffix subflow
+ * we can just use the meter suffix items as is.
+ */
+ if (qrss_id) {
+ /* Not meter subflow. */
+ MLX5_ASSERT(!mtr_sfx);
+ /*
+ * Put the unique id in the prefix flow because it is destroyed
+ * after the suffix flow; the id will be freed once there
+ * are no actual flows with this id and identifier
+ * reallocation becomes possible (for example, for
+ * other flows in other threads).
+ */
+ dev_flow->handle->split_flow_id = qrss_id;
+ ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0,
+ error);
+ if (ret < 0)
+ goto exit;
+ q_tag_spec.id = ret;
+ }
+ dev_flow = NULL;
+ /* Add suffix subflow to execute Q/RSS. */
+ ret = flow_create_split_inner(dev, flow, &dev_flow, layers,
+ &q_attr, mtr_sfx ? items :
+ q_items, q_actions,
+ external, flow_idx, error);
+ if (ret < 0)
+ goto exit;
+ /* qrss ID should be freed if failed. */
+ qrss_id = 0;
+ MLX5_ASSERT(dev_flow);
+ }
+
+exit:
+ /*
+ * We do not destroy the partially created sub_flows in case of error.
+ * These ones are included into parent flow list and will be destroyed
+ * by flow_drv_destroy.
+ */
+ flow_qrss_free_id(dev, qrss_id);
+ rte_free(ext_actions);
+ return ret;
+}
+
+/**
+ * The splitting for meter feature.
+ *
+ * - The meter flow will be split into two flows, a prefix and a
+ * suffix flow. The packets only make sense if they pass the prefix
+ * meter action.
+ *
+ * - Reg_C_5 is used for the packet to match between the prefix and
+ * suffix flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Parent flow structure pointer.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[in] flow_idx
+ * The memory pool index of the flow.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ * @return
+ * 0 on success, negative value otherwise
+ */
+static int
+flow_create_split_meter(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, uint32_t flow_idx,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_action *sfx_actions = NULL;
+ struct rte_flow_action *pre_actions = NULL;
+ struct rte_flow_item *sfx_items = NULL;
+ struct mlx5_flow *dev_flow = NULL;
+ struct rte_flow_attr sfx_attr = *attr;
+ uint32_t mtr = 0;
+ uint32_t mtr_tag_id = 0;
+ size_t act_size;
+ size_t item_size;
+ int actions_n = 0;
+ int ret;
+
+ if (priv->mtr_en)
+ actions_n = flow_check_meter_action(actions, &mtr);
+ if (mtr) {
+ /* The five prefix actions: meter, decap, encap, tag, end. */
+ act_size = sizeof(struct rte_flow_action) * (actions_n + 5) +
+ sizeof(struct mlx5_rte_flow_action_set_tag);
+ /* tag, vlan, port id, end. */
+#define METER_SUFFIX_ITEM 4
+ item_size = sizeof(struct rte_flow_item) * METER_SUFFIX_ITEM +
+ sizeof(struct mlx5_rte_flow_item_tag) * 2;
+ sfx_actions = rte_zmalloc(__func__, (act_size + item_size), 0);
+ if (!sfx_actions)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no memory to split "
+ "meter flow");
+ sfx_items = (struct rte_flow_item *)((char *)sfx_actions +
+ act_size);
+ pre_actions = sfx_actions + actions_n;
+ mtr_tag_id = flow_meter_split_prep(dev, items, sfx_items,
+ actions, sfx_actions,
+ pre_actions);
+ if (!mtr_tag_id) {
+ ret = -rte_errno;
+ goto exit;
+ }
+ /* Add the prefix subflow. */
+ ret = flow_create_split_inner(dev, flow, &dev_flow, 0, attr,
+ items, pre_actions, external,
+ flow_idx, error);
+ if (ret) {
+ ret = -rte_errno;
+ goto exit;
+ }
+ dev_flow->handle->split_flow_id = mtr_tag_id;
+ /* Setting the sfx group attr. */
+ sfx_attr.group = sfx_attr.transfer ?
+ (MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) :
+ MLX5_FLOW_TABLE_LEVEL_SUFFIX;
+ }
+ /* Add the suffix subflow, or the original flow if there is no meter. */
+ ret = flow_create_split_metadata(dev, flow, dev_flow ?
+ flow_get_prefix_layer_flags(dev_flow) :
+ 0, &sfx_attr,
+ sfx_items ? sfx_items : items,
+ sfx_actions ? sfx_actions : actions,
+ external, flow_idx, error);
+exit:
+ if (sfx_actions)
+ rte_free(sfx_actions);
+ return ret;
+}
+
+/**
+ * Split the flow to subflow set. The splitters might be linked
+ * in the chain, like this:
+ * flow_create_split_outer() calls:
+ * flow_create_split_meter() calls:
+ * flow_create_split_metadata(meter_subflow_0) calls:
+ * flow_create_split_inner(metadata_subflow_0)
+ * flow_create_split_inner(metadata_subflow_1)
+ * flow_create_split_inner(metadata_subflow_2)
+ * flow_create_split_metadata(meter_subflow_1) calls:
+ * flow_create_split_inner(metadata_subflow_0)
+ * flow_create_split_inner(metadata_subflow_1)
+ * flow_create_split_inner(metadata_subflow_2)
+ *
+ * This provides a flexible way to add new levels of flow splitting.
+ * All of the successfully created subflows are included in the
+ * parent flow dev_flow list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Parent flow structure pointer.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[in] flow_idx
+ * The memory pool index of the flow.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ * @return
+ * 0 on success, negative value otherwise
+ */
+static int
+flow_create_split_outer(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, uint32_t flow_idx,
+ struct rte_flow_error *error)
+{
+ int ret;
+
+ ret = flow_create_split_meter(dev, flow, attr, items,
+ actions, external, flow_idx, error);
+ MLX5_ASSERT(ret <= 0);
+ return ret;
+}
+
+/**
+ * Create a flow and add it to @p list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param list
+ * Pointer to a TAILQ flow list. If this parameter is NULL,
+ * no list insertion occurs, the flow is just created and
+ * it is the caller's responsibility to track the
+ * created flow.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] items
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A flow index on success, 0 otherwise and rte_errno is set.
+ */
+static uint32_t
+flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow *flow = NULL;
+ struct mlx5_flow *dev_flow;
+ const struct rte_flow_action_rss *rss;
+ union {
+ struct rte_flow_expand_rss buf;
+ uint8_t buffer[2048];
+ } expand_buffer;
+ union {
+ struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS];
+ uint8_t buffer[2048];
+ } actions_rx;
+ union {
+ struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS];
+ uint8_t buffer[2048];
+ } actions_hairpin_tx;
+ union {
+ struct rte_flow_item items[MLX5_MAX_SPLIT_ITEMS];
+ uint8_t buffer[2048];
+ } items_tx;
+ struct rte_flow_expand_rss *buf = &expand_buffer.buf;
+ struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *)
+ priv->rss_desc)[!!priv->flow_idx];
+ const struct rte_flow_action *p_actions_rx = actions;
+ uint32_t i;
+ uint32_t idx = 0;
+ int hairpin_flow;
+ uint32_t hairpin_id = 0;
+ struct rte_flow_attr attr_tx = { .priority = 0 };
+ int ret;
+
+ hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
+ ret = flow_drv_validate(dev, attr, items, p_actions_rx,
+ external, hairpin_flow, error);
+ if (ret < 0)
+ return 0;
+ if (hairpin_flow > 0) {
+ if (hairpin_flow > MLX5_MAX_SPLIT_ACTIONS) {
+ rte_errno = EINVAL;
+ return 0;
+ }
+ flow_hairpin_split(dev, actions, actions_rx.actions,
+ actions_hairpin_tx.actions, items_tx.items,
+ &hairpin_id);
+ p_actions_rx = actions_rx.actions;
+ }
+ flow = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], &idx);
+ if (!flow) {
+ rte_errno = ENOMEM;
+ goto error_before_flow;
+ }
+ flow->drv_type = flow_get_drv_type(dev, attr);
+ if (hairpin_id != 0)
+ flow->hairpin_flow_id = hairpin_id;
+ MLX5_ASSERT(flow->drv_type > MLX5_FLOW_TYPE_MIN &&
+ flow->drv_type < MLX5_FLOW_TYPE_MAX);
+ memset(rss_desc, 0, sizeof(*rss_desc));
+ rss = flow_get_rss_action(p_actions_rx);
+ if (rss) {
+ /*
+ * The following information is required by
+ * mlx5_flow_hashfields_adjust() in advance.
+ */
+ rss_desc->level = rss->level;
+ /* RSS type 0 indicates default RSS type (ETH_RSS_IP). */
+ rss_desc->types = !rss->types ? ETH_RSS_IP : rss->types;
+ }
+ flow->dev_handles = 0;
+ if (rss && rss->types) {
+ unsigned int graph_root;
+
+ graph_root = find_graph_root(items, rss->level);
+ ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer),
+ items, rss->types,
+ mlx5_support_expansion,
+ graph_root);
+ MLX5_ASSERT(ret > 0 &&
+ (unsigned int)ret < sizeof(expand_buffer.buffer));
+ } else {
+ buf->entries = 1;
+ buf->entry[0].pattern = (void *)(uintptr_t)items;
+ }
+ /*
+ * Record the start index when there is a nested call. All sub-flows
+ * need to be translated before another call is made.
+ * No need to use a ping-pong buffer to save memory here.
+ */
+ if (priv->flow_idx) {
+ MLX5_ASSERT(!priv->flow_nested_idx);
+ priv->flow_nested_idx = priv->flow_idx;
+ }
+ for (i = 0; i < buf->entries; ++i) {
+ /*
+ * The splitter may create multiple dev_flows,
+ * depending on configuration. In the simplest
+ * case it just creates unmodified original flow.
+ */
+ ret = flow_create_split_outer(dev, flow, attr,
+ buf->entry[i].pattern,
+ p_actions_rx, external, idx,
+ error);
+ if (ret < 0)
+ goto error;
+ }
+ /* Create the tx flow. */
+ if (hairpin_flow) {
+ attr_tx.group = MLX5_HAIRPIN_TX_TABLE;
+ attr_tx.ingress = 0;
+ attr_tx.egress = 1;
+ dev_flow = flow_drv_prepare(dev, flow, &attr_tx, items_tx.items,
+ actions_hairpin_tx.actions,
+ idx, error);
+ if (!dev_flow)
+ goto error;
+ dev_flow->flow = flow;
+ dev_flow->external = 0;
+ SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx,
+ dev_flow->handle, next);
+ ret = flow_drv_translate(dev, dev_flow, &attr_tx,
+ items_tx.items,
+ actions_hairpin_tx.actions, error);
+ if (ret < 0)
+ goto error;
+ }
+ /*
+	 * Update the metadata register copy table. If extensive
+	 * metadata feature is enabled and registers are supported
+	 * we might create an extra rte_flow for each unique
+	 * MARK/FLAG action ID.
+	 *
+	 * The table is updated for ingress flows only, because
+	 * the egress flows belong to a different device and the
+	 * copy table should be updated in the peer NIC Rx domain.
+ */
+ if (attr->ingress &&
+ (external || attr->group != MLX5_FLOW_MREG_CP_TABLE_GROUP)) {
+ ret = flow_mreg_update_copy_table(dev, flow, actions, error);
+ if (ret)
+ goto error;
+ }
+ /*
+	 * If the flow is external (from the application) OR the device is
+	 * started, then the flow will be applied immediately.
+ */
+ if (external || dev->data->dev_started) {
+ ret = flow_drv_apply(dev, flow, error);
+ if (ret < 0)
+ goto error;
+ }
+ if (list)
+ ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, idx,
+ flow, next);
+ flow_rxq_flags_set(dev, flow);
+ /* Nested flow creation index recovery. */
+ priv->flow_idx = priv->flow_nested_idx;
+ if (priv->flow_nested_idx)
+ priv->flow_nested_idx = 0;
+ return idx;
+error:
+ MLX5_ASSERT(flow);
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ flow_mreg_del_copy_action(dev, flow);
+ flow_drv_destroy(dev, flow);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], idx);
+ rte_errno = ret; /* Restore rte_errno. */
+error_before_flow:
+ ret = rte_errno;
+ if (hairpin_id)
+ mlx5_flow_id_release(priv->sh->flow_id_pool,
+ hairpin_id);
+ rte_errno = ret;
+ priv->flow_idx = priv->flow_nested_idx;
+ if (priv->flow_nested_idx)
+ priv->flow_nested_idx = 0;
+ return 0;
+}
+
+/**
+ * Create a dedicated flow rule on e-switch table 0 (root table), to direct all
+ * incoming packets to table 1.
+ *
+ * Other flow rules, requested for group n, will be created in
+ * e-switch table n+1.
+ * A jump action to e-switch group n will thus target e-switch table n+1.
+ *
+ * Used when working in switchdev mode, to take advantage of table 1
+ * and above.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Pointer to flow on success, NULL otherwise and rte_errno is set.
+ */
+struct rte_flow *
+mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
+{
+ const struct rte_flow_attr attr = {
+ .group = 0,
+ .priority = 0,
+ .ingress = 1,
+ .egress = 0,
+ .transfer = 1,
+ };
+ const struct rte_flow_item pattern = {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ };
+ struct rte_flow_action_jump jump = {
+ .group = 1,
+ };
+ const struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ .conf = &jump,
+ },
+ {
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ },
+ };
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_error error;
+
+ return (void *)(uintptr_t)flow_list_create(dev, &priv->ctrl_flows,
+ &attr, &pattern,
+ actions, false, &error);
+}
+
+/**
+ * Validate a flow supported by the NIC.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_validate(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ int hairpin_flow;
+
+ hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
+ return flow_drv_validate(dev, attr, items, actions,
+ true, hairpin_flow, error);
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+struct rte_flow *
+mlx5_flow_create(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ /*
+	 * If the device is not started yet, it is not allowed to create a
+	 * flow from the application. PMD default flows and traffic control
+	 * flows are not affected.
+	 */
+	if (unlikely(!dev->data->dev_started)) {
+		DRV_LOG(DEBUG, "port %u is not started when "
+			"inserting a flow", dev->data->port_id);
+		rte_flow_error_set(error, ENODEV,
+ rte_flow_error_set(error, ENODEV,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "port not started");
+ return NULL;
+ }
+ return (void *)(uintptr_t)flow_list_create(dev, &priv->flows,
+ attr, items, actions, true, error);
+}
+
+/**
+ * Destroy a flow in a list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param list
+ *   Pointer to the Indexed flow list. If this parameter is NULL,
+ *   no flow is removed from any list. Note that since a flow is
+ *   added to an indexed list, the memory the indexed list points
+ *   to may change as flows are destroyed.
+ * @param[in] flow_idx
+ * Index of flow to destroy.
+ */
+static void
+flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+ uint32_t flow_idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+ struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
+ [MLX5_IPOOL_RTE_FLOW], flow_idx);
+
+ if (!flow)
+ return;
+ /*
+	 * Update RX queue flags only if the port is started, otherwise it is
+ * already clean.
+ */
+ if (dev->data->dev_started)
+ flow_rxq_flags_trim(dev, flow);
+ if (flow->hairpin_flow_id)
+ mlx5_flow_id_release(priv->sh->flow_id_pool,
+ flow->hairpin_flow_id);
+ flow_drv_destroy(dev, flow);
+ if (list)
+ ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list,
+ flow_idx, flow, next);
+ flow_mreg_del_copy_action(dev, flow);
+ if (flow->fdir) {
+ LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) {
+ if (priv_fdir_flow->rix_flow == flow_idx)
+ break;
+ }
+ if (priv_fdir_flow) {
+ LIST_REMOVE(priv_fdir_flow, next);
+ rte_free(priv_fdir_flow->fdir);
+ rte_free(priv_fdir_flow);
+ }
+ }
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx);
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param list
+ * Pointer to the Indexed flow list.
+ * @param active
+ *   If the flush is invoked actively, e.g. before stopping the port.
+ */
+void
+mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active)
+{
+ uint32_t num_flushed = 0;
+
+ while (*list) {
+ flow_list_destroy(dev, list, *list);
+ num_flushed++;
+ }
+ if (active) {
+ DRV_LOG(INFO, "port %u: %u flows flushed before stopping",
+ dev->data->port_id, num_flushed);
+ }
+}
+
+/**
+ * Remove all flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param list
+ * Pointer to the Indexed flow list.
+ */
+void
+mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow *flow = NULL;
+ uint32_t idx;
+
+ ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx,
+ flow, next) {
+ flow_drv_remove(dev, flow);
+ flow_mreg_stop_copy_action(dev, flow);
+ }
+ flow_mreg_del_default_copy_action(dev);
+ flow_rxq_flags_clear(dev);
+}
+
+/**
+ * Add all flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param list
+ * Pointer to the Indexed flow list.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow *flow = NULL;
+ struct rte_flow_error error;
+ uint32_t idx;
+ int ret = 0;
+
+ /* Make sure default copy action (reg_c[0] -> reg_b) is created. */
+ ret = flow_mreg_add_default_copy_action(dev, &error);
+ if (ret < 0)
+ return -rte_errno;
+	/* Apply flows created by the application. */
+ ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx,
+ flow, next) {
+ ret = flow_mreg_start_copy_action(dev, flow);
+ if (ret < 0)
+ goto error;
+ ret = flow_drv_apply(dev, flow, &error);
+ if (ret < 0)
+ goto error;
+ flow_rxq_flags_set(dev, flow);
+ }
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ mlx5_flow_stop(dev, list);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Stop all default actions for flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_flow_stop_default(struct rte_eth_dev *dev)
+{
+ flow_mreg_del_default_copy_action(dev);
+ flow_rxq_flags_clear(dev);
+}
+
+/**
+ * Start all default actions for flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_start_default(struct rte_eth_dev *dev)
+{
+ struct rte_flow_error error;
+
+ /* Make sure default copy action (reg_c[0] -> reg_b) is created. */
+ return flow_mreg_add_default_copy_action(dev, &error);
+}
+
+/**
+ * Allocate intermediate resources for flow creation.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!priv->inter_flows) {
+ priv->inter_flows = rte_calloc(__func__, 1,
+ MLX5_NUM_MAX_DEV_FLOWS *
+ sizeof(struct mlx5_flow) +
+ (sizeof(struct mlx5_flow_rss_desc) +
+ sizeof(uint16_t) * UINT16_MAX) * 2, 0);
+ if (!priv->inter_flows) {
+ DRV_LOG(ERR, "can't allocate intermediate memory.");
+ return;
+ }
+ }
+ priv->rss_desc = &((struct mlx5_flow *)priv->inter_flows)
+ [MLX5_NUM_MAX_DEV_FLOWS];
+ /* Reset the index. */
+ priv->flow_idx = 0;
+ priv->flow_nested_idx = 0;
+}
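+
+/*
+ * Editorial note on the layout above: priv->inter_flows holds
+ * MLX5_NUM_MAX_DEV_FLOWS struct mlx5_flow entries followed by two
+ * (struct mlx5_flow_rss_desc + UINT16_MAX queue entries) areas;
+ * flow_list_create() selects one of the two RSS descriptors with
+ * [!!priv->flow_idx] so that nested sub-flow creation does not
+ * overwrite the outer flow's RSS description.
+ */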
+
+/**
+ * Free intermediate resources for flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_flow_free_intermediate(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ rte_free(priv->inter_flows);
+ priv->inter_flows = NULL;
+}
+
+/**
+ * Verify that the flow list is empty.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ *   The number of flows not released.
+ */
+int
+mlx5_flow_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow *flow;
+ uint32_t idx;
+ int ret = 0;
+
+ ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], priv->flows, idx,
+ flow, next) {
+ DRV_LOG(DEBUG, "port %u flow %p still referenced",
+ dev->data->port_id, (void *)flow);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Enable default hairpin egress flow.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param queue
+ * The queue index.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev,
+ uint32_t queue)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_attr attr = {
+ .egress = 1,
+ .priority = 0,
+ };
+ struct mlx5_rte_flow_item_tx_queue queue_spec = {
+ .queue = queue,
+ };
+ struct mlx5_rte_flow_item_tx_queue queue_mask = {
+ .queue = UINT32_MAX,
+ };
+ struct rte_flow_item items[] = {
+ {
+ .type = (enum rte_flow_item_type)
+ MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+ .spec = &queue_spec,
+ .last = NULL,
+ .mask = &queue_mask,
+ },
+ {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ };
+ struct rte_flow_action_jump jump = {
+ .group = MLX5_HAIRPIN_TX_TABLE,
+ };
+ struct rte_flow_action actions[2];
+ uint32_t flow_idx;
+ struct rte_flow_error error;
+
+ actions[0].type = RTE_FLOW_ACTION_TYPE_JUMP;
+ actions[0].conf = &jump;
+ actions[1].type = RTE_FLOW_ACTION_TYPE_END;
+ flow_idx = flow_list_create(dev, &priv->ctrl_flows,
+ &attr, items, actions, false, &error);
+ if (!flow_idx) {
+ DRV_LOG(DEBUG,
+ "Failed to create ctrl flow: rte_errno(%d),"
+ " type(%d), message(%s)",
+ rte_errno, error.type,
+ error.message ? error.message : " (no stated reason)");
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Enable a control flow configured from the control plane.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param eth_spec
+ * An Ethernet flow spec to apply.
+ * @param eth_mask
+ * An Ethernet flow mask to apply.
+ * @param vlan_spec
+ * A VLAN flow spec to apply.
+ * @param vlan_mask
+ * A VLAN flow mask to apply.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
+ struct rte_flow_item_eth *eth_spec,
+ struct rte_flow_item_eth *eth_mask,
+ struct rte_flow_item_vlan *vlan_spec,
+ struct rte_flow_item_vlan *vlan_mask)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_attr attr = {
+ .ingress = 1,
+ .priority = MLX5_FLOW_PRIO_RSVD,
+ };
+ struct rte_flow_item items[] = {
+ {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = eth_spec,
+ .last = NULL,
+ .mask = eth_mask,
+ },
+ {
+ .type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN :
+ RTE_FLOW_ITEM_TYPE_END,
+ .spec = vlan_spec,
+ .last = NULL,
+ .mask = vlan_mask,
+ },
+ {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ };
+ uint16_t queue[priv->reta_idx_n];
+ struct rte_flow_action_rss action_rss = {
+ .func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+ .level = 0,
+ .types = priv->rss_conf.rss_hf,
+ .key_len = priv->rss_conf.rss_key_len,
+ .queue_num = priv->reta_idx_n,
+ .key = priv->rss_conf.rss_key,
+ .queue = queue,
+ };
+ struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_RSS,
+ .conf = &action_rss,
+ },
+ {
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ },
+ };
+ uint32_t flow_idx;
+ struct rte_flow_error error;
+ unsigned int i;
+
+ if (!priv->reta_idx_n || !priv->rxqs_n) {
+ return 0;
+ }
+ if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG))
+ action_rss.types = 0;
+ for (i = 0; i != priv->reta_idx_n; ++i)
+ queue[i] = (*priv->reta_idx)[i];
+ flow_idx = flow_list_create(dev, &priv->ctrl_flows,
+ &attr, items, actions, false, &error);
+ if (!flow_idx)
+ return -rte_errno;
+ return 0;
+}
+
+/**
+ * Enable a control flow configured from the control plane.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param eth_spec
+ * An Ethernet flow spec to apply.
+ * @param eth_mask
+ * An Ethernet flow mask to apply.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_ctrl_flow(struct rte_eth_dev *dev,
+ struct rte_flow_item_eth *eth_spec,
+ struct rte_flow_item_eth *eth_mask)
+{
+ return mlx5_ctrl_flow_vlan(dev, eth_spec, eth_mask, NULL, NULL);
+}
+
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_destroy(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ struct rte_flow_error *error __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ flow_list_destroy(dev, &priv->flows, (uintptr_t)(void *)flow);
+ return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_flush(struct rte_eth_dev *dev,
+ struct rte_flow_error *error __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ mlx5_flow_list_flush(dev, &priv->flows, false);
+ return 0;
+}
+
+/**
+ * Isolated mode.
+ *
+ * @see rte_flow_isolate()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_isolate(struct rte_eth_dev *dev,
+ int enable,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (dev->data->dev_started) {
+ rte_flow_error_set(error, EBUSY,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "port must be stopped first");
+ return -rte_errno;
+ }
+ priv->isolated = !!enable;
+ if (enable)
+ dev->dev_ops = &mlx5_dev_ops_isolate;
+ else
+ dev->dev_ops = &mlx5_dev_ops;
+ return 0;
+}
+
+/**
+ * Query a flow.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+static int
+flow_drv_query(struct rte_eth_dev *dev,
+ uint32_t flow_idx,
+ const struct rte_flow_action *actions,
+ void *data,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct mlx5_flow_driver_ops *fops;
+ struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
+ [MLX5_IPOOL_RTE_FLOW],
+ flow_idx);
+ enum mlx5_flow_drv_type ftype;
+
+ if (!flow) {
+ return rte_flow_error_set(error, ENOENT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "invalid flow handle");
+ }
+ ftype = flow->drv_type;
+ MLX5_ASSERT(ftype > MLX5_FLOW_TYPE_MIN && ftype < MLX5_FLOW_TYPE_MAX);
+ fops = flow_get_drv_ops(ftype);
+
+ return fops->query(dev, flow, actions, data, error);
+}
+
+/**
+ * Query a flow.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_query(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_action *actions,
+ void *data,
+ struct rte_flow_error *error)
+{
+ int ret;
+
+ ret = flow_drv_query(dev, (uintptr_t)(void *)flow, actions, data,
+ error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Convert a flow director filter to a generic flow.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fdir_filter
+ * Flow director filter to add.
+ * @param attributes
+ * Generic flow parameters structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_fdir_filter_convert(struct rte_eth_dev *dev,
+ const struct rte_eth_fdir_filter *fdir_filter,
+ struct mlx5_fdir *attributes)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_eth_fdir_input *input = &fdir_filter->input;
+ const struct rte_eth_fdir_masks *mask =
+ &dev->data->dev_conf.fdir_conf.mask;
+
+ /* Validate queue number. */
+ if (fdir_filter->action.rx_queue >= priv->rxqs_n) {
+ DRV_LOG(ERR, "port %u invalid queue number %d",
+ dev->data->port_id, fdir_filter->action.rx_queue);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ attributes->attr.ingress = 1;
+ attributes->items[0] = (struct rte_flow_item) {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = &attributes->l2,
+ .mask = &attributes->l2_mask,
+ };
+ switch (fdir_filter->action.behavior) {
+ case RTE_ETH_FDIR_ACCEPT:
+ attributes->actions[0] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_QUEUE,
+ .conf = &attributes->queue,
+ };
+ break;
+ case RTE_ETH_FDIR_REJECT:
+ attributes->actions[0] = (struct rte_flow_action){
+ .type = RTE_FLOW_ACTION_TYPE_DROP,
+ };
+ break;
+ default:
+ DRV_LOG(ERR, "port %u invalid behavior %d",
+ dev->data->port_id,
+ fdir_filter->action.behavior);
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ attributes->queue.index = fdir_filter->action.rx_queue;
+ /* Handle L3. */
+ switch (fdir_filter->input.flow_type) {
+ case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
+ case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
+ case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
+ attributes->l3.ipv4.hdr = (struct rte_ipv4_hdr){
+ .src_addr = input->flow.ip4_flow.src_ip,
+ .dst_addr = input->flow.ip4_flow.dst_ip,
+ .time_to_live = input->flow.ip4_flow.ttl,
+ .type_of_service = input->flow.ip4_flow.tos,
+ };
+ attributes->l3_mask.ipv4.hdr = (struct rte_ipv4_hdr){
+ .src_addr = mask->ipv4_mask.src_ip,
+ .dst_addr = mask->ipv4_mask.dst_ip,
+ .time_to_live = mask->ipv4_mask.ttl,
+ .type_of_service = mask->ipv4_mask.tos,
+ .next_proto_id = mask->ipv4_mask.proto,
+ };
+ attributes->items[1] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_IPV4,
+ .spec = &attributes->l3,
+ .mask = &attributes->l3_mask,
+ };
+ break;
+ case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
+ case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
+ case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
+ attributes->l3.ipv6.hdr = (struct rte_ipv6_hdr){
+ .hop_limits = input->flow.ipv6_flow.hop_limits,
+ .proto = input->flow.ipv6_flow.proto,
+ };
+
+ memcpy(attributes->l3.ipv6.hdr.src_addr,
+ input->flow.ipv6_flow.src_ip,
+ RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+ memcpy(attributes->l3.ipv6.hdr.dst_addr,
+ input->flow.ipv6_flow.dst_ip,
+ RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+ memcpy(attributes->l3_mask.ipv6.hdr.src_addr,
+ mask->ipv6_mask.src_ip,
+ RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr));
+ memcpy(attributes->l3_mask.ipv6.hdr.dst_addr,
+ mask->ipv6_mask.dst_ip,
+ RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr));
+ attributes->items[1] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_IPV6,
+ .spec = &attributes->l3,
+ .mask = &attributes->l3_mask,
+ };
+ break;
+ default:
+		DRV_LOG(ERR, "port %u invalid flow type %d",
+ dev->data->port_id, fdir_filter->input.flow_type);
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ /* Handle L4. */
+ switch (fdir_filter->input.flow_type) {
+ case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
+ attributes->l4.udp.hdr = (struct rte_udp_hdr){
+ .src_port = input->flow.udp4_flow.src_port,
+ .dst_port = input->flow.udp4_flow.dst_port,
+ };
+ attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){
+ .src_port = mask->src_port_mask,
+ .dst_port = mask->dst_port_mask,
+ };
+ attributes->items[2] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .spec = &attributes->l4,
+ .mask = &attributes->l4_mask,
+ };
+ break;
+ case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
+ attributes->l4.tcp.hdr = (struct rte_tcp_hdr){
+ .src_port = input->flow.tcp4_flow.src_port,
+ .dst_port = input->flow.tcp4_flow.dst_port,
+ };
+ attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){
+ .src_port = mask->src_port_mask,
+ .dst_port = mask->dst_port_mask,
+ };
+ attributes->items[2] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .spec = &attributes->l4,
+ .mask = &attributes->l4_mask,
+ };
+ break;
+ case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
+ attributes->l4.udp.hdr = (struct rte_udp_hdr){
+ .src_port = input->flow.udp6_flow.src_port,
+ .dst_port = input->flow.udp6_flow.dst_port,
+ };
+ attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){
+ .src_port = mask->src_port_mask,
+ .dst_port = mask->dst_port_mask,
+ };
+ attributes->items[2] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_UDP,
+ .spec = &attributes->l4,
+ .mask = &attributes->l4_mask,
+ };
+ break;
+ case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
+ attributes->l4.tcp.hdr = (struct rte_tcp_hdr){
+ .src_port = input->flow.tcp6_flow.src_port,
+ .dst_port = input->flow.tcp6_flow.dst_port,
+ };
+ attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){
+ .src_port = mask->src_port_mask,
+ .dst_port = mask->dst_port_mask,
+ };
+ attributes->items[2] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_TCP,
+ .spec = &attributes->l4,
+ .mask = &attributes->l4_mask,
+ };
+ break;
+ case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
+ case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
+ break;
+ default:
+		DRV_LOG(ERR, "port %u invalid flow type %d",
+ dev->data->port_id, fdir_filter->input.flow_type);
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+#define FLOW_FDIR_CMP(f1, f2, fld) \
+ memcmp(&(f1)->fld, &(f2)->fld, sizeof(f1->fld))
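+
+/*
+ * For example, FLOW_FDIR_CMP(f1, f2, l3) expands to
+ * memcmp(&(f1)->l3, &(f2)->l3, sizeof(f1->l3)), i.e. a byte-wise
+ * comparison of a single field, as used repeatedly by flow_fdir_cmp()
+ * below.
+ */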
+
+/**
+ * Compare two FDIR flows. If items and actions are identical, the two flows are
+ * regarded as the same.
+ *
+ * @param f1
+ * FDIR flow to compare.
+ * @param f2
+ * FDIR flow to compare.
+ *
+ * @return
+ * Zero on match, 1 otherwise.
+ */
+static int
+flow_fdir_cmp(const struct mlx5_fdir *f1, const struct mlx5_fdir *f2)
+{
+ if (FLOW_FDIR_CMP(f1, f2, attr) ||
+ FLOW_FDIR_CMP(f1, f2, l2) ||
+ FLOW_FDIR_CMP(f1, f2, l2_mask) ||
+ FLOW_FDIR_CMP(f1, f2, l3) ||
+ FLOW_FDIR_CMP(f1, f2, l3_mask) ||
+ FLOW_FDIR_CMP(f1, f2, l4) ||
+ FLOW_FDIR_CMP(f1, f2, l4_mask) ||
+ FLOW_FDIR_CMP(f1, f2, actions[0].type))
+ return 1;
+ if (f1->actions[0].type == RTE_FLOW_ACTION_TYPE_QUEUE &&
+ FLOW_FDIR_CMP(f1, f2, queue))
+ return 1;
+ return 0;
+}
+
+/**
+ * Search the device flow list to find a matching FDIR flow.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fdir_flow
+ * FDIR flow to lookup.
+ *
+ * @return
+ * Index of flow if found, 0 otherwise.
+ */
+static uint32_t
+flow_fdir_filter_lookup(struct rte_eth_dev *dev, struct mlx5_fdir *fdir_flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t flow_idx = 0;
+ struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+
+ MLX5_ASSERT(fdir_flow);
+ LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) {
+ if (!flow_fdir_cmp(priv_fdir_flow->fdir, fdir_flow)) {
+ DRV_LOG(DEBUG, "port %u found FDIR flow %u",
+ dev->data->port_id, flow_idx);
+ flow_idx = priv_fdir_flow->rix_flow;
+ break;
+ }
+ }
+ return flow_idx;
+}
+
+/**
+ * Add a new flow director filter and store it in the list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fdir_filter
+ * Flow director filter to add.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_fdir_filter_add(struct rte_eth_dev *dev,
+ const struct rte_eth_fdir_filter *fdir_filter)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_fdir *fdir_flow;
+ struct rte_flow *flow;
+ struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+ uint32_t flow_idx;
+ int ret;
+
+ fdir_flow = rte_zmalloc(__func__, sizeof(*fdir_flow), 0);
+ if (!fdir_flow) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow);
+ if (ret)
+ goto error;
+ flow_idx = flow_fdir_filter_lookup(dev, fdir_flow);
+ if (flow_idx) {
+ rte_errno = EEXIST;
+ goto error;
+ }
+ priv_fdir_flow = rte_zmalloc(__func__, sizeof(struct mlx5_fdir_flow),
+ 0);
+ if (!priv_fdir_flow) {
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ flow_idx = flow_list_create(dev, &priv->flows, &fdir_flow->attr,
+ fdir_flow->items, fdir_flow->actions, true,
+ NULL);
+ flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx);
+ if (!flow)
+ goto error;
+ flow->fdir = 1;
+ priv_fdir_flow->fdir = fdir_flow;
+ priv_fdir_flow->rix_flow = flow_idx;
+ LIST_INSERT_HEAD(&priv->fdir_flows, priv_fdir_flow, next);
+ DRV_LOG(DEBUG, "port %u created FDIR flow %p",
+ dev->data->port_id, (void *)flow);
+ return 0;
+error:
+ rte_free(priv_fdir_flow);
+ rte_free(fdir_flow);
+ return -rte_errno;
+}
+
+/**
+ * Delete a specific filter.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fdir_filter
+ * Filter to be deleted.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_fdir_filter_delete(struct rte_eth_dev *dev,
+ const struct rte_eth_fdir_filter *fdir_filter)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t flow_idx;
+ struct mlx5_fdir fdir_flow = {
+ .attr.group = 0,
+ };
+ struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+ int ret;
+
+ ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow);
+ if (ret)
+ return -rte_errno;
+ LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) {
+		/* Find the FDIR entry in the private list. */
+ if (!flow_fdir_cmp(priv_fdir_flow->fdir, &fdir_flow))
+ break;
+ }
+ if (!priv_fdir_flow)
+ return 0;
+ LIST_REMOVE(priv_fdir_flow, next);
+ flow_idx = priv_fdir_flow->rix_flow;
+ flow_list_destroy(dev, &priv->flows, flow_idx);
+ rte_free(priv_fdir_flow->fdir);
+ rte_free(priv_fdir_flow);
+ DRV_LOG(DEBUG, "port %u deleted FDIR flow %u",
+ dev->data->port_id, flow_idx);
+ return 0;
+}
+
+/**
+ * Update a specific filter.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fdir_filter
+ * Filter to be updated.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_fdir_filter_update(struct rte_eth_dev *dev,
+ const struct rte_eth_fdir_filter *fdir_filter)
+{
+ int ret;
+
+ ret = flow_fdir_filter_delete(dev, fdir_filter);
+ if (ret)
+ return ret;
+ return flow_fdir_filter_add(dev, fdir_filter);
+}
+
+/**
+ * Flush all filters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+flow_fdir_filter_flush(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+
+ while (!LIST_EMPTY(&priv->fdir_flows)) {
+ priv_fdir_flow = LIST_FIRST(&priv->fdir_flows);
+ LIST_REMOVE(priv_fdir_flow, next);
+ flow_list_destroy(dev, &priv->flows, priv_fdir_flow->rix_flow);
+ rte_free(priv_fdir_flow->fdir);
+ rte_free(priv_fdir_flow);
+ }
+}
+
+/**
+ * Get flow director information.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] fdir_info
+ * Resulting flow director information.
+ */
+static void
+flow_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info)
+{
+ struct rte_eth_fdir_masks *mask =
+ &dev->data->dev_conf.fdir_conf.mask;
+
+ fdir_info->mode = dev->data->dev_conf.fdir_conf.mode;
+ fdir_info->guarant_spc = 0;
+ rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask));
+ fdir_info->max_flexpayload = 0;
+ fdir_info->flow_types_mask[0] = 0;
+ fdir_info->flex_payload_unit = 0;
+ fdir_info->max_flex_payload_segment_num = 0;
+ fdir_info->flex_payload_limit = 0;
+ memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf));
+}
+
+/**
+ * Deal with flow director operations.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param filter_op
+ * Operation to perform.
+ * @param arg
+ * Pointer to operation-specific structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op,
+ void *arg)
+{
+ enum rte_fdir_mode fdir_mode =
+ dev->data->dev_conf.fdir_conf.mode;
+
+ if (filter_op == RTE_ETH_FILTER_NOP)
+ return 0;
+ if (fdir_mode != RTE_FDIR_MODE_PERFECT &&
+ fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
+ DRV_LOG(ERR, "port %u flow director mode %d not supported",
+ dev->data->port_id, fdir_mode);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ switch (filter_op) {
+ case RTE_ETH_FILTER_ADD:
+ return flow_fdir_filter_add(dev, arg);
+ case RTE_ETH_FILTER_UPDATE:
+ return flow_fdir_filter_update(dev, arg);
+ case RTE_ETH_FILTER_DELETE:
+ return flow_fdir_filter_delete(dev, arg);
+ case RTE_ETH_FILTER_FLUSH:
+ flow_fdir_filter_flush(dev);
+ break;
+ case RTE_ETH_FILTER_INFO:
+ flow_fdir_info_get(dev, arg);
+ break;
+ default:
+ DRV_LOG(DEBUG, "port %u unknown operation %u",
+ dev->data->port_id, filter_op);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param filter_type
+ * Filter type.
+ * @param filter_op
+ * Operation to perform.
+ * @param arg
+ * Pointer to operation-specific structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
+ enum rte_filter_type filter_type,
+ enum rte_filter_op filter_op,
+ void *arg)
+{
+ switch (filter_type) {
+ case RTE_ETH_FILTER_GENERIC:
+ if (filter_op != RTE_ETH_FILTER_GET) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ *(const void **)arg = &mlx5_flow_ops;
+ return 0;
+ case RTE_ETH_FILTER_FDIR:
+ return flow_fdir_ctrl_func(dev, filter_op, arg);
+ default:
+ DRV_LOG(ERR, "port %u filter type (%d) not supported",
+ dev->data->port_id, filter_type);
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Create the needed meter and suffix tables.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] fm
+ * Pointer to the flow meter.
+ *
+ * @return
+ * Pointer to table set on success, NULL otherwise.
+ */
+struct mlx5_meter_domains_infos *
+mlx5_flow_create_mtr_tbls(struct rte_eth_dev *dev,
+ const struct mlx5_flow_meter *fm)
+{
+ const struct mlx5_flow_driver_ops *fops;
+
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->create_mtr_tbls(dev, fm);
+}
+
+/**
+ * Destroy the meter table set.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] tbl
+ * Pointer to the meter table set.
+ *
+ * @return
+ * 0 on success.
+ */
+int
+mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev,
+ struct mlx5_meter_domains_infos *tbls)
+{
+ const struct mlx5_flow_driver_ops *fops;
+
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->destroy_mtr_tbls(dev, tbls);
+}
+
+/**
+ * Create policer rules.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] fm
+ * Pointer to flow meter structure.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ *
+ * @return
+ * 0 on success, -1 otherwise.
+ */
+int
+mlx5_flow_create_policer_rules(struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr)
+{
+ const struct mlx5_flow_driver_ops *fops;
+
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->create_policer_rules(dev, fm, attr);
+}
+
+/**
+ * Destroy policer rules.
+ *
+ * @param[in] fm
+ * Pointer to flow meter structure.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ *
+ * @return
+ * 0 on success, -1 otherwise.
+ */
+int
+mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr)
+{
+ const struct mlx5_flow_driver_ops *fops;
+
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->destroy_policer_rules(dev, fm, attr);
+}
+
+/**
+ * Allocate a counter.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * Index to allocated counter on success, 0 otherwise.
+ */
+uint32_t
+mlx5_counter_alloc(struct rte_eth_dev *dev)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ struct rte_flow_attr attr = { .transfer = 0 };
+
+ if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) {
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->counter_alloc(dev);
+ }
+ DRV_LOG(ERR,
+ "port %u counter allocate is not supported.",
+ dev->data->port_id);
+ return 0;
+}
+
+/**
+ * Free a counter.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device structure.
+ * @param[in] cnt
+ *   Index of the counter to be freed.
+ */
+void
+mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ struct rte_flow_attr attr = { .transfer = 0 };
+
+ if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) {
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ fops->counter_free(dev, cnt);
+ return;
+ }
+ DRV_LOG(ERR,
+ "port %u counter free is not supported.",
+ dev->data->port_id);
+}
+
+/**
+ * Query counter statistics.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device structure.
+ * @param[in] cnt
+ *   Index of the counter to query.
+ * @param[in] clear
+ *   Set to clear the counter statistics.
+ * @param[out] pkts
+ *   Where to store the number of packets that hit the counter.
+ * @param[out] bytes
+ *   Where to store the number of bytes that hit the counter.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise.
+ */
+int
+mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt,
+ bool clear, uint64_t *pkts, uint64_t *bytes)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ struct rte_flow_attr attr = { .transfer = 0 };
+
+ if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) {
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->counter_query(dev, cnt, clear, pkts, bytes);
+ }
+ DRV_LOG(ERR,
+ "port %u counter query is not supported.",
+ dev->data->port_id);
+ return -ENOTSUP;
+}
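+
+/*
+ * Editorial usage sketch (hypothetical caller, not part of the driver):
+ * the three helpers above are intended to be used together, e.g.
+ *
+ *   uint32_t cnt = mlx5_counter_alloc(dev);
+ *   uint64_t pkts, bytes;
+ *
+ *   if (cnt && !mlx5_counter_query(dev, cnt, false, &pkts, &bytes))
+ *           handle_stats(pkts, bytes);  (handle_stats() is a placeholder)
+ *   mlx5_counter_free(dev, cnt);
+ */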
+
+#define MLX5_POOL_QUERY_FREQ_US 1000000
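+/*
+ * For example, mlx5_set_query_alarm() below divides this one second budget
+ * by the number of valid pools, so with 4 pools the alarm is re-armed every
+ * 250000 us and a full query round over all pools takes roughly one second.
+ */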
+
+/**
+ * Get the number of all valid pools.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ *
+ * @return
+ *   The number of all valid pools.
+ */
+static uint32_t
+mlx5_get_all_valid_pool_count(struct mlx5_ibv_shared *sh)
+{
+ int i;
+ uint32_t pools_n = 0;
+
+ for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i)
+ pools_n += rte_atomic16_read(&sh->cmng.ccont[i].n_valid);
+ return pools_n;
+}
+
+/**
+ * Set the periodic procedure for triggering asynchronous batch queries for all
+ * the counter pools.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ */
+void
+mlx5_set_query_alarm(struct mlx5_ibv_shared *sh)
+{
+ uint32_t pools_n, us;
+
+ pools_n = mlx5_get_all_valid_pool_count(sh);
+ us = MLX5_POOL_QUERY_FREQ_US / pools_n;
+ DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us);
+ if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
+ sh->cmng.query_thread_on = 0;
+ DRV_LOG(ERR, "Cannot reinitialize query alarm");
+ } else {
+ sh->cmng.query_thread_on = 1;
+ }
+}
+
+/**
+ * The periodic procedure for triggering asynchronous batch queries for all the
+ * counter pools. This function is expected to be called from the host thread.
+ *
+ * @param[in] arg
+ * The parameter for the alarm process.
+ */
+void
+mlx5_flow_query_alarm(void *arg)
+{
+ struct mlx5_ibv_shared *sh = arg;
+ struct mlx5_devx_obj *dcs;
+ uint16_t offset;
+ int ret;
+ uint8_t batch = sh->cmng.batch;
+ uint8_t age = sh->cmng.age;
+ uint16_t pool_index = sh->cmng.pool_index;
+ struct mlx5_pools_container *cont;
+ struct mlx5_flow_counter_pool *pool;
+ int cont_loop = MLX5_CCONT_TYPE_MAX;
+
+ if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
+ goto set_alarm;
+next_container:
+ cont = MLX5_CNT_CONTAINER(sh, batch, age);
+ rte_spinlock_lock(&cont->resize_sl);
+ if (!cont->pools) {
+ rte_spinlock_unlock(&cont->resize_sl);
+ /* Check if all the containers are empty. */
+ if (unlikely(--cont_loop == 0))
+ goto set_alarm;
+ batch ^= 0x1;
+ pool_index = 0;
+ if (batch == 0 && pool_index == 0) {
+ age ^= 0x1;
+ sh->cmng.batch = batch;
+ sh->cmng.age = age;
+ }
+ goto next_container;
+ }
+ pool = cont->pools[pool_index];
+ rte_spinlock_unlock(&cont->resize_sl);
+ if (pool->raw_hw)
+ /* There is a pool query in progress. */
+ goto set_alarm;
+ pool->raw_hw =
+ LIST_FIRST(&sh->cmng.free_stat_raws);
+ if (!pool->raw_hw)
+ /* No free counter statistics raw memory. */
+ goto set_alarm;
+ dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read
+ (&pool->a64_dcs);
+ offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL;
+ /*
+	 * Identify the counters released between query trigger and query
+	 * handling more efficiently. Counters released in this gap period
+	 * should wait for a new round of queries, as the newly arrived
+	 * packets will not be taken into account.
+ */
+ rte_atomic64_add(&pool->start_query_gen, 1);
+ ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL -
+ offset, NULL, NULL,
+ pool->raw_hw->mem_mng->dm->id,
+ (void *)(uintptr_t)
+ (pool->raw_hw->data + offset),
+ sh->devx_comp,
+ (uint64_t)(uintptr_t)pool);
+ if (ret) {
+ rte_atomic64_sub(&pool->start_query_gen, 1);
+ DRV_LOG(ERR, "Failed to trigger asynchronous query for dcs ID"
+ " %d", pool->min_dcs->id);
+ pool->raw_hw = NULL;
+ goto set_alarm;
+ }
+ pool->raw_hw->min_dcs_id = dcs->id;
+ LIST_REMOVE(pool->raw_hw, next);
+ sh->cmng.pending_queries++;
+ pool_index++;
+ if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
+ batch ^= 0x1;
+ pool_index = 0;
+ if (batch == 0 && pool_index == 0)
+ age ^= 0x1;
+ }
+set_alarm:
+ sh->cmng.batch = batch;
+ sh->cmng.pool_index = pool_index;
+ sh->cmng.age = age;
+ mlx5_set_query_alarm(sh);
+}
+
+/**
+ * Check for newly aged flows in the counter pool and trigger the aged-flow
+ * event callback.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_ibv_shared object.
+ * @param[in] pool
+ *   Pointer to the current counter pool.
+ */
+static void
+mlx5_flow_aging_check(struct mlx5_ibv_shared *sh,
+ struct mlx5_flow_counter_pool *pool)
+{
+ struct mlx5_priv *priv;
+ struct mlx5_flow_counter *cnt;
+ struct mlx5_age_info *age_info;
+ struct mlx5_age_param *age_param;
+ struct mlx5_counter_stats_raw *cur = pool->raw_hw;
+ struct mlx5_counter_stats_raw *prev = pool->raw;
+ uint16_t curr = rte_rdtsc() / (rte_get_tsc_hz() / 10);
+ uint32_t i;
+
+ for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+ cnt = MLX5_POOL_GET_CNT(pool, i);
+ age_param = MLX5_CNT_TO_AGE(cnt);
+ if (rte_atomic16_read(&age_param->state) != AGE_CANDIDATE)
+ continue;
+ if (cur->data[i].hits != prev->data[i].hits) {
+ age_param->expire = curr + age_param->timeout;
+ continue;
+ }
+ if ((uint16_t)(curr - age_param->expire) >= (UINT16_MAX / 2))
+ continue;
+		/*
+		 * Hold the lock first; otherwise, if the release happens
+		 * between setting the AGE_TMOUT state and the tailq
+		 * operation, the release procedure may delete a
+		 * non-existent tailq node.
+		 */
+ priv = rte_eth_devices[age_param->port_id].data->dev_private;
+ age_info = GET_PORT_AGE_INFO(priv);
+ rte_spinlock_lock(&age_info->aged_sl);
+		/* If the cmpset fails, the release has happened. */
+ if (rte_atomic16_cmpset((volatile uint16_t *)
+ &age_param->state,
+ AGE_CANDIDATE,
+ AGE_TMOUT) ==
+ AGE_CANDIDATE) {
+ TAILQ_INSERT_TAIL(&age_info->aged_counters, cnt, next);
+ MLX5_AGE_SET(age_info, MLX5_AGE_EVENT_NEW);
+ }
+ rte_spinlock_unlock(&age_info->aged_sl);
+ }
+ for (i = 0; i < sh->max_port; i++) {
+ age_info = &sh->port[i].age_info;
+ if (!MLX5_AGE_GET(age_info, MLX5_AGE_EVENT_NEW))
+ continue;
+ if (MLX5_AGE_GET(age_info, MLX5_AGE_TRIGGER))
+ _rte_eth_dev_callback_process
+ (&rte_eth_devices[sh->port[i].devx_ih_port_id],
+ RTE_ETH_EVENT_FLOW_AGED, NULL);
+ age_info->flags = 0;
+ }
+}
+
+/**
+ * Handler for the HW response with ready values from an asynchronous batch
+ * query. This function is expected to be called from the host thread.
+ *
+ * @param[in] sh
+ * The pointer to the shared IB device context.
+ * @param[in] async_id
+ * The Devx async ID.
+ * @param[in] status
+ * The status of the completion.
+ */
+void
+mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh,
+ uint64_t async_id, int status)
+{
+ struct mlx5_flow_counter_pool *pool =
+ (struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
+ struct mlx5_counter_stats_raw *raw_to_free;
+
+ if (unlikely(status)) {
+ rte_atomic64_sub(&pool->start_query_gen, 1);
+ raw_to_free = pool->raw_hw;
+ } else {
+ raw_to_free = pool->raw;
+ if (IS_AGE_POOL(pool))
+ mlx5_flow_aging_check(sh, pool);
+ rte_spinlock_lock(&pool->sl);
+ pool->raw = pool->raw_hw;
+ rte_spinlock_unlock(&pool->sl);
+ MLX5_ASSERT(rte_atomic64_read(&pool->end_query_gen) + 1 ==
+ rte_atomic64_read(&pool->start_query_gen));
+ rte_atomic64_set(&pool->end_query_gen,
+ rte_atomic64_read(&pool->start_query_gen));
+ /* Be sure the new raw counters data is updated in memory. */
+ rte_cio_wmb();
+ }
+ LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next);
+ pool->raw_hw = NULL;
+ sh->cmng.pending_queries--;
+}
+
+/**
+ * Translate the rte_flow group index to HW table value.
+ *
+ * @param[in] attributes
+ * Pointer to flow attributes
+ * @param[in] external
+ *   Set when the flow rule was created by a request external to the PMD.
+ * @param[in] group
+ * rte_flow group index value.
+ * @param[out] fdb_def_rule
+ * Whether fdb jump to table 1 is configured.
+ * @param[out] table
+ * HW table value.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, bool external,
+ uint32_t group, bool fdb_def_rule, uint32_t *table,
+ struct rte_flow_error *error)
+{
+ if (attributes->transfer && external && fdb_def_rule) {
+ if (group == UINT32_MAX)
+ return rte_flow_error_set
+ (error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+ NULL,
+ "group index not supported");
+ *table = group + 1;
+ } else {
+ *table = group;
+ }
+ return 0;
+}
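+
+/*
+ * Worked example (editorial): for a transfer rule created externally while
+ * the FDB default rule is configured, group 0 maps to HW table 1 and group
+ * N to table N + 1, with a group index of UINT32_MAX rejected; in all other
+ * cases the group index is used as the HW table value directly.
+ */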
+
+/**
+ * Discover availability of metadata reg_c's.
+ *
+ * Iteratively use test flows to check availability.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_discover_mreg_c(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ enum modify_reg idx;
+ int n = 0;
+
+ /* reg_c[0] and reg_c[1] are reserved. */
+ config->flow_mreg_c[n++] = REG_C_0;
+ config->flow_mreg_c[n++] = REG_C_1;
+ /* Discover availability of other reg_c's. */
+ for (idx = REG_C_2; idx <= REG_C_7; ++idx) {
+ struct rte_flow_attr attr = {
+ .group = MLX5_FLOW_MREG_CP_TABLE_GROUP,
+ .priority = MLX5_FLOW_PRIO_RSVD,
+ .ingress = 1,
+ };
+ struct rte_flow_item items[] = {
+ [0] = {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ };
+ struct rte_flow_action actions[] = {
+ [0] = {
+ .type = (enum rte_flow_action_type)
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+ .conf = &(struct mlx5_flow_action_copy_mreg){
+ .src = REG_C_1,
+ .dst = idx,
+ },
+ },
+ [1] = {
+ .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ .conf = &(struct rte_flow_action_jump){
+ .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP,
+ },
+ },
+ [2] = {
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ },
+ };
+ uint32_t flow_idx;
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+
+ if (!config->dv_flow_en)
+ break;
+ /* Create internal flow, validation skips copy action. */
+ flow_idx = flow_list_create(dev, NULL, &attr, items,
+ actions, false, &error);
+ flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
+ flow_idx);
+ if (!flow)
+ continue;
+ if (dev->data->dev_started || !flow_drv_apply(dev, flow, NULL))
+ config->flow_mreg_c[n++] = idx;
+ flow_list_destroy(dev, NULL, flow_idx);
+ }
+ for (; n < MLX5_MREG_C_NUM; ++n)
+ config->flow_mreg_c[n] = REG_NONE;
+ return 0;
+}
+
+/**
+ * Dump raw flow HW data to a file.
+ *
+ * @param[in] dev
+ * The pointer to Ethernet device.
+ * @param[in] file
+ * A pointer to a file for output.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL. PMDs initialize this
+ * structure in case of error only.
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int
+mlx5_flow_dev_dump(struct rte_eth_dev *dev,
+ FILE *file,
+ struct rte_flow_error *error __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ return mlx5_devx_cmd_flow_dump(sh->fdb_domain, sh->rx_domain,
+ sh->tx_domain, file);
+}
+
+/**
+ * Get aged-out flows.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] contexts
+ *   The address of an array of pointers to the aged-out flow contexts.
+ * @param[in] nb_contexts
+ *   The length of the context array.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL. Initialized in case of
+ * error only.
+ *
+ * @return
+ *   The number of contexts returned on success, a negative errno value
+ *   otherwise. If nb_contexts is 0, the total number of aged contexts is
+ *   returned; otherwise, the number of aged flows reported in the context
+ *   array is returned.
+ */
+int
+mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
+ uint32_t nb_contexts, struct rte_flow_error *error)
+{
+ const struct mlx5_flow_driver_ops *fops;
+ struct rte_flow_attr attr = { .transfer = 0 };
+
+ if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) {
+ fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV);
+ return fops->get_aged_flows(dev, contexts, nb_contexts,
+ error);
+ }
+ DRV_LOG(ERR,
+ "port %u get aged flows is not supported.",
+ dev->data->port_id);
+ return -ENOTSUP;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h
new file mode 100644
index 000000000..2c9667756
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h
@@ -0,0 +1,1034 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_FLOW_H_
+#define RTE_PMD_MLX5_FLOW_H_
+
+#include <netinet/in.h>
+#include <sys/queue.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_atomic.h>
+#include <rte_alarm.h>
+#include <rte_mtr.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5.h"
+
+/* Private rte flow items. */
+enum mlx5_rte_flow_item_type {
+ MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN,
+ MLX5_RTE_FLOW_ITEM_TYPE_TAG,
+ MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
+ MLX5_RTE_FLOW_ITEM_TYPE_VLAN,
+};
+
+/* Private (internal) rte flow actions. */
+enum mlx5_rte_flow_action_type {
+ MLX5_RTE_FLOW_ACTION_TYPE_END = INT_MIN,
+ MLX5_RTE_FLOW_ACTION_TYPE_TAG,
+ MLX5_RTE_FLOW_ACTION_TYPE_MARK,
+ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
+};
+
+/* Matches on selected register. */
+struct mlx5_rte_flow_item_tag {
+ enum modify_reg id;
+ uint32_t data;
+};
+
+/* Modify selected register. */
+struct mlx5_rte_flow_action_set_tag {
+ enum modify_reg id;
+ uint32_t data;
+};
+
+struct mlx5_flow_action_copy_mreg {
+ enum modify_reg dst;
+ enum modify_reg src;
+};
+
+/* Matches on source queue. */
+struct mlx5_rte_flow_item_tx_queue {
+ uint32_t queue;
+};
+
+/* Feature name to allocate metadata register. */
+enum mlx5_feature_name {
+ MLX5_HAIRPIN_RX,
+ MLX5_HAIRPIN_TX,
+ MLX5_METADATA_RX,
+ MLX5_METADATA_TX,
+ MLX5_METADATA_FDB,
+ MLX5_FLOW_MARK,
+ MLX5_APP_TAG,
+ MLX5_COPY_MARK,
+ MLX5_MTR_COLOR,
+ MLX5_MTR_SFX,
+};
+
+/* Pattern outer Layer bits. */
+#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0)
+#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1)
+#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2)
+#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3)
+#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4)
+#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5)
+
+/* Pattern inner Layer bits. */
+#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6)
+#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7)
+#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8)
+#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9)
+#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10)
+#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11)
+
+/* Pattern tunnel Layer bits. */
+#define MLX5_FLOW_LAYER_VXLAN (1u << 12)
+#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13)
+#define MLX5_FLOW_LAYER_GRE (1u << 14)
+#define MLX5_FLOW_LAYER_MPLS (1u << 15)
+/* List of tunnel Layer bits continued below. */
+
+/* General pattern items bits. */
+#define MLX5_FLOW_ITEM_METADATA (1u << 16)
+#define MLX5_FLOW_ITEM_PORT_ID (1u << 17)
+#define MLX5_FLOW_ITEM_TAG (1u << 18)
+#define MLX5_FLOW_ITEM_MARK (1u << 19)
+
+/* Pattern MISC bits. */
+#define MLX5_FLOW_LAYER_ICMP (1u << 20)
+#define MLX5_FLOW_LAYER_ICMP6 (1u << 21)
+#define MLX5_FLOW_LAYER_GRE_KEY (1u << 22)
+
+/* Pattern tunnel Layer bits (continued). */
+#define MLX5_FLOW_LAYER_IPIP (1u << 23)
+#define MLX5_FLOW_LAYER_IPV6_ENCAP (1u << 24)
+#define MLX5_FLOW_LAYER_NVGRE (1u << 25)
+#define MLX5_FLOW_LAYER_GENEVE (1u << 26)
+
+/* Queue items. */
+#define MLX5_FLOW_ITEM_TX_QUEUE (1u << 27)
+
+/* Pattern tunnel Layer bits (continued). */
+#define MLX5_FLOW_LAYER_GTP (1u << 28)
+
+/* Outer Masks. */
+#define MLX5_FLOW_LAYER_OUTER_L3 \
+ (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
+#define MLX5_FLOW_LAYER_OUTER_L4 \
+ (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP)
+#define MLX5_FLOW_LAYER_OUTER \
+ (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \
+ MLX5_FLOW_LAYER_OUTER_L4)
+
+/* Tunnel Masks. */
+#define MLX5_FLOW_LAYER_TUNNEL \
+ (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \
+ MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_NVGRE | MLX5_FLOW_LAYER_MPLS | \
+ MLX5_FLOW_LAYER_IPIP | MLX5_FLOW_LAYER_IPV6_ENCAP | \
+ MLX5_FLOW_LAYER_GENEVE | MLX5_FLOW_LAYER_GTP)
+
+/* Inner Masks. */
+#define MLX5_FLOW_LAYER_INNER_L3 \
+ (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6)
+#define MLX5_FLOW_LAYER_INNER_L4 \
+ (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP)
+#define MLX5_FLOW_LAYER_INNER \
+ (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \
+ MLX5_FLOW_LAYER_INNER_L4)
+
+/* Layer Masks. */
+#define MLX5_FLOW_LAYER_L2 \
+ (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_INNER_L2)
+#define MLX5_FLOW_LAYER_L3_IPV4 \
+ (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV4)
+#define MLX5_FLOW_LAYER_L3_IPV6 \
+ (MLX5_FLOW_LAYER_OUTER_L3_IPV6 | MLX5_FLOW_LAYER_INNER_L3_IPV6)
+#define MLX5_FLOW_LAYER_L3 \
+ (MLX5_FLOW_LAYER_L3_IPV4 | MLX5_FLOW_LAYER_L3_IPV6)
+#define MLX5_FLOW_LAYER_L4 \
+ (MLX5_FLOW_LAYER_OUTER_L4 | MLX5_FLOW_LAYER_INNER_L4)
+
+/* Actions */
+#define MLX5_FLOW_ACTION_DROP (1u << 0)
+#define MLX5_FLOW_ACTION_QUEUE (1u << 1)
+#define MLX5_FLOW_ACTION_RSS (1u << 2)
+#define MLX5_FLOW_ACTION_FLAG (1u << 3)
+#define MLX5_FLOW_ACTION_MARK (1u << 4)
+#define MLX5_FLOW_ACTION_COUNT (1u << 5)
+#define MLX5_FLOW_ACTION_PORT_ID (1u << 6)
+#define MLX5_FLOW_ACTION_OF_POP_VLAN (1u << 7)
+#define MLX5_FLOW_ACTION_OF_PUSH_VLAN (1u << 8)
+#define MLX5_FLOW_ACTION_OF_SET_VLAN_VID (1u << 9)
+#define MLX5_FLOW_ACTION_OF_SET_VLAN_PCP (1u << 10)
+#define MLX5_FLOW_ACTION_SET_IPV4_SRC (1u << 11)
+#define MLX5_FLOW_ACTION_SET_IPV4_DST (1u << 12)
+#define MLX5_FLOW_ACTION_SET_IPV6_SRC (1u << 13)
+#define MLX5_FLOW_ACTION_SET_IPV6_DST (1u << 14)
+#define MLX5_FLOW_ACTION_SET_TP_SRC (1u << 15)
+#define MLX5_FLOW_ACTION_SET_TP_DST (1u << 16)
+#define MLX5_FLOW_ACTION_JUMP (1u << 17)
+#define MLX5_FLOW_ACTION_SET_TTL (1u << 18)
+#define MLX5_FLOW_ACTION_DEC_TTL (1u << 19)
+#define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20)
+#define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21)
+#define MLX5_FLOW_ACTION_ENCAP (1u << 22)
+#define MLX5_FLOW_ACTION_DECAP (1u << 23)
+#define MLX5_FLOW_ACTION_INC_TCP_SEQ (1u << 24)
+#define MLX5_FLOW_ACTION_DEC_TCP_SEQ (1u << 25)
+#define MLX5_FLOW_ACTION_INC_TCP_ACK (1u << 26)
+#define MLX5_FLOW_ACTION_DEC_TCP_ACK (1u << 27)
+#define MLX5_FLOW_ACTION_SET_TAG (1ull << 28)
+#define MLX5_FLOW_ACTION_MARK_EXT (1ull << 29)
+#define MLX5_FLOW_ACTION_SET_META (1ull << 30)
+#define MLX5_FLOW_ACTION_METER (1ull << 31)
+#define MLX5_FLOW_ACTION_SET_IPV4_DSCP (1ull << 32)
+#define MLX5_FLOW_ACTION_SET_IPV6_DSCP (1ull << 33)
+#define MLX5_FLOW_ACTION_AGE (1ull << 34)
+
+#define MLX5_FLOW_FATE_ACTIONS \
+ (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \
+ MLX5_FLOW_ACTION_RSS | MLX5_FLOW_ACTION_JUMP)
+
+#define MLX5_FLOW_FATE_ESWITCH_ACTIONS \
+ (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
+ MLX5_FLOW_ACTION_JUMP)
+
+
+#define MLX5_FLOW_MODIFY_HDR_ACTIONS (MLX5_FLOW_ACTION_SET_IPV4_SRC | \
+ MLX5_FLOW_ACTION_SET_IPV4_DST | \
+ MLX5_FLOW_ACTION_SET_IPV6_SRC | \
+ MLX5_FLOW_ACTION_SET_IPV6_DST | \
+ MLX5_FLOW_ACTION_SET_TP_SRC | \
+ MLX5_FLOW_ACTION_SET_TP_DST | \
+ MLX5_FLOW_ACTION_SET_TTL | \
+ MLX5_FLOW_ACTION_DEC_TTL | \
+ MLX5_FLOW_ACTION_SET_MAC_SRC | \
+ MLX5_FLOW_ACTION_SET_MAC_DST | \
+ MLX5_FLOW_ACTION_INC_TCP_SEQ | \
+ MLX5_FLOW_ACTION_DEC_TCP_SEQ | \
+ MLX5_FLOW_ACTION_INC_TCP_ACK | \
+ MLX5_FLOW_ACTION_DEC_TCP_ACK | \
+ MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
+ MLX5_FLOW_ACTION_SET_TAG | \
+ MLX5_FLOW_ACTION_MARK_EXT | \
+ MLX5_FLOW_ACTION_SET_META | \
+ MLX5_FLOW_ACTION_SET_IPV4_DSCP | \
+ MLX5_FLOW_ACTION_SET_IPV6_DSCP)
+
+#define MLX5_FLOW_VLAN_ACTIONS (MLX5_FLOW_ACTION_OF_POP_VLAN | \
+ MLX5_FLOW_ACTION_OF_PUSH_VLAN)
+
+#define MLX5_FLOW_XCAP_ACTIONS (MLX5_FLOW_ACTION_ENCAP | MLX5_FLOW_ACTION_DECAP)
+
+#ifndef IPPROTO_MPLS
+#define IPPROTO_MPLS 137
+#endif
+
+/* UDP port number for MPLS */
+#define MLX5_UDP_PORT_MPLS 6635
+
+/* UDP port numbers for VxLAN. */
+#define MLX5_UDP_PORT_VXLAN 4789
+#define MLX5_UDP_PORT_VXLAN_GPE 4790
+
+/* UDP port numbers for GENEVE. */
+#define MLX5_UDP_PORT_GENEVE 6081
+
+/* Priority reserved for default flows. */
+#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1)
+
+/*
+ * Number of sub priorities.
+ * To have correct matching on the NIC (firmware dependent) for each kind of
+ * pattern, i.e. L2, L3 and L4, L4 must have the highest priority, followed
+ * by L3 and ending with L2.
+ */
+#define MLX5_PRIORITY_MAP_L2 2
+#define MLX5_PRIORITY_MAP_L3 1
+#define MLX5_PRIORITY_MAP_L4 0
+#define MLX5_PRIORITY_MAP_MAX 3
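+/*
+ * For example, an L4 (TCP/UDP) match gets sub-priority 0, the highest,
+ * an L3 match gets 1 and a bare L2 match gets 2, matching the ordering
+ * described above.
+ */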
+
+/* Valid layer type for IPV4 RSS. */
+#define MLX5_IPV4_LAYER_TYPES \
+ (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | \
+ ETH_RSS_NONFRAG_IPV4_TCP | ETH_RSS_NONFRAG_IPV4_UDP | \
+ ETH_RSS_NONFRAG_IPV4_OTHER)
+
+/* IBV hash source bits for IPV4. */
+#define MLX5_IPV4_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4)
+
+/* Valid layer type for IPV6 RSS. */
+#define MLX5_IPV6_LAYER_TYPES \
+ (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP | \
+ ETH_RSS_NONFRAG_IPV6_UDP | ETH_RSS_IPV6_EX | ETH_RSS_IPV6_TCP_EX | \
+ ETH_RSS_IPV6_UDP_EX | ETH_RSS_NONFRAG_IPV6_OTHER)
+
+/* IBV hash source bits for IPV6. */
+#define MLX5_IPV6_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6)
+
+/* IBV hash bits for L3 SRC. */
+#define MLX5_L3_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_SRC_IPV6)
+
+/* IBV hash bits for L3 DST. */
+#define MLX5_L3_DST_IBV_RX_HASH (IBV_RX_HASH_DST_IPV4 | IBV_RX_HASH_DST_IPV6)
+
+/* IBV hash bits for TCP. */
+#define MLX5_TCP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \
+ IBV_RX_HASH_DST_PORT_TCP)
+
+/* IBV hash bits for UDP. */
+#define MLX5_UDP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_UDP | \
+ IBV_RX_HASH_DST_PORT_UDP)
+
+/* IBV hash bits for L4 SRC. */
+#define MLX5_L4_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \
+ IBV_RX_HASH_SRC_PORT_UDP)
+
+/* IBV hash bits for L4 DST. */
+#define MLX5_L4_DST_IBV_RX_HASH (IBV_RX_HASH_DST_PORT_TCP | \
+ IBV_RX_HASH_DST_PORT_UDP)
+
+/* Geneve header, first 16 bits. */
+#define MLX5_GENEVE_VER_MASK 0x3
+#define MLX5_GENEVE_VER_SHIFT 14
+#define MLX5_GENEVE_VER_VAL(a) \
+ (((a) >> (MLX5_GENEVE_VER_SHIFT)) & (MLX5_GENEVE_VER_MASK))
+#define MLX5_GENEVE_OPTLEN_MASK 0x3F
+#define MLX5_GENEVE_OPTLEN_SHIFT 7
+#define MLX5_GENEVE_OPTLEN_VAL(a) \
+ (((a) >> (MLX5_GENEVE_OPTLEN_SHIFT)) & (MLX5_GENEVE_OPTLEN_MASK))
+#define MLX5_GENEVE_OAMF_MASK 0x1
+#define MLX5_GENEVE_OAMF_SHIFT 7
+#define MLX5_GENEVE_OAMF_VAL(a) \
+ (((a) >> (MLX5_GENEVE_OAMF_SHIFT)) & (MLX5_GENEVE_OAMF_MASK))
+#define MLX5_GENEVE_CRITO_MASK 0x1
+#define MLX5_GENEVE_CRITO_SHIFT 6
+#define MLX5_GENEVE_CRITO_VAL(a) \
+ (((a) >> (MLX5_GENEVE_CRITO_SHIFT)) & (MLX5_GENEVE_CRITO_MASK))
+#define MLX5_GENEVE_RSVD_MASK 0x3F
+#define MLX5_GENEVE_RSVD_VAL(a) ((a) & (MLX5_GENEVE_RSVD_MASK))
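+/*
+ * For example, MLX5_GENEVE_VER_VAL(a) expands to ((a) >> 14) & 0x3 and
+ * extracts the 2-bit version field from the first 16-bit word; the other
+ * *_VAL() helpers follow the same shift-and-mask pattern with their
+ * respective constants.
+ */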
+/*
+ * The length of the Geneve options fields, expressed in four byte multiples,
+ * not including the eight byte fixed tunnel.
+ */
+#define MLX5_GENEVE_OPT_LEN_0 14
+#define MLX5_GENEVE_OPT_LEN_1 63
+
+#define MLX5_ENCAPSULATION_DECISION_SIZE (sizeof(struct rte_flow_item_eth) + \
+ sizeof(struct rte_flow_item_ipv4))
+
+/* Software header modify action numbers of a flow. */
+#define MLX5_ACT_NUM_MDF_IPV4 1
+#define MLX5_ACT_NUM_MDF_IPV6 4
+#define MLX5_ACT_NUM_MDF_MAC 2
+#define MLX5_ACT_NUM_MDF_VID 1
+#define MLX5_ACT_NUM_MDF_PORT 2
+#define MLX5_ACT_NUM_MDF_TTL 1
+#define MLX5_ACT_NUM_DEC_TTL MLX5_ACT_NUM_MDF_TTL
+#define MLX5_ACT_NUM_MDF_TCPSEQ 1
+#define MLX5_ACT_NUM_MDF_TCPACK 1
+#define MLX5_ACT_NUM_SET_REG 1
+#define MLX5_ACT_NUM_SET_TAG 1
+#define MLX5_ACT_NUM_CPY_MREG MLX5_ACT_NUM_SET_TAG
+#define MLX5_ACT_NUM_SET_MARK MLX5_ACT_NUM_SET_TAG
+#define MLX5_ACT_NUM_SET_META MLX5_ACT_NUM_SET_TAG
+#define MLX5_ACT_NUM_SET_DSCP 1
+
+enum mlx5_flow_drv_type {
+ MLX5_FLOW_TYPE_MIN,
+ MLX5_FLOW_TYPE_DV,
+ MLX5_FLOW_TYPE_VERBS,
+ MLX5_FLOW_TYPE_MAX,
+};
+
+/* Fate action type. */
+enum mlx5_flow_fate_type {
+ MLX5_FLOW_FATE_NONE, /* Egress flow. */
+ MLX5_FLOW_FATE_QUEUE,
+ MLX5_FLOW_FATE_JUMP,
+ MLX5_FLOW_FATE_PORT_ID,
+ MLX5_FLOW_FATE_DROP,
+ MLX5_FLOW_FATE_MAX,
+};
+
+/* Matcher PRM representation */
+struct mlx5_flow_dv_match_params {
+ size_t size;
+ /**< Size of match value. Do NOT split size and key! */
+ uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)];
+ /**< Matcher value. This value is used as the mask or as a key. */
+};
+
+/* Matcher structure. */
+struct mlx5_flow_dv_matcher {
+ LIST_ENTRY(mlx5_flow_dv_matcher) next;
+ /**< Pointer to the next element. */
+ struct mlx5_flow_tbl_resource *tbl;
+ /**< Pointer to the table(group) the matcher associated with. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ void *matcher_object; /**< Pointer to DV matcher */
+ uint16_t crc; /**< CRC of key. */
+ uint16_t priority; /**< Priority of matcher. */
+ struct mlx5_flow_dv_match_params mask; /**< Matcher mask. */
+};
+
+#define MLX5_ENCAP_MAX_LEN 132
+
+/* Encap/decap resource structure. */
+struct mlx5_flow_dv_encap_decap_resource {
+ ILIST_ENTRY(uint32_t)next;
+ /* Pointer to next element. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ void *verbs_action;
+ /**< Verbs encap/decap action object. */
+ uint8_t buf[MLX5_ENCAP_MAX_LEN];
+ size_t size;
+ uint8_t reformat_type;
+ uint8_t ft_type;
+ uint64_t flags; /**< Flags for RDMA API. */
+};
+
+/* Tag resource structure. */
+struct mlx5_flow_dv_tag_resource {
+ struct mlx5_hlist_entry entry;
+ /**< hash list entry for tag resource, tag value as the key. */
+ void *action;
+ /**< Verbs tag action object. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ uint32_t idx; /**< Index for the index memory pool. */
+};
+
+/*
+ * Number of modification commands.
+ * The maximal number of actions supported by the FW is a fixed constant:
+ * 16 in the latest releases, and limited to 8 in some older ones.
+ * Since there is no interface to query this capacity, the maximal value is
+ * used to allow the PMD to create the flow. The validation is done in the
+ * lower driver layer or in the FW, and a failure is returned if the maximal
+ * number of supported actions on the root table is exceeded.
+ * On non-root tables there is no limitation, but 32 is enough right now.
+ */
+#define MLX5_MAX_MODIFY_NUM 32
+#define MLX5_ROOT_TBL_MODIFY_NUM 16
+
+/* Modify resource structure */
+struct mlx5_flow_dv_modify_hdr_resource {
+ LIST_ENTRY(mlx5_flow_dv_modify_hdr_resource) next;
+ /* Pointer to next element. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ struct ibv_flow_action *verbs_action;
+ /**< Verbs modify header action object. */
+ uint8_t ft_type; /**< Flow table type, Rx or Tx. */
+ uint32_t actions_num; /**< Number of modification actions. */
+ uint64_t flags; /**< Flags for RDMA API. */
+ struct mlx5_modification_cmd actions[];
+ /**< Modification actions. */
+};
+
+/* Jump action resource structure. */
+struct mlx5_flow_dv_jump_tbl_resource {
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ uint8_t ft_type; /**< Flow table type, Rx or Tx. */
+ void *action; /**< Pointer to the rdma core action. */
+};
+
+/* Port ID resource structure. */
+struct mlx5_flow_dv_port_id_action_resource {
+ ILIST_ENTRY(uint32_t)next;
+ /* Pointer to next element. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ void *action;
+ /**< Verbs tag action object. */
+ uint32_t port_id; /**< Port ID value. */
+};
+
+/* Push VLAN action resource structure */
+struct mlx5_flow_dv_push_vlan_action_resource {
+ ILIST_ENTRY(uint32_t)next;
+ /* Pointer to next element. */
+ rte_atomic32_t refcnt; /**< Reference counter. */
+ void *action; /**< Direct verbs action object. */
+ uint8_t ft_type; /**< Flow table type, Rx, Tx or FDB. */
+ rte_be32_t vlan_tag; /**< VLAN tag value. */
+};
+
+/* Metadata register copy table entry. */
+struct mlx5_flow_mreg_copy_resource {
+ /*
+ * Hash list entry for copy table.
+ * - Key is 32/64-bit MARK action ID.
+ * - MUST be the first entry.
+ */
+ struct mlx5_hlist_entry hlist_ent;
+ LIST_ENTRY(mlx5_flow_mreg_copy_resource) next;
+ /* List entry for device flows. */
+ uint32_t refcnt; /* Reference counter. */
+ uint32_t appcnt; /* Apply/Remove counter. */
+ uint32_t idx;
+ uint32_t rix_flow; /* Built flow for copy. */
+};
+
+/* Table data structure of the hash organization. */
+struct mlx5_flow_tbl_data_entry {
+ struct mlx5_hlist_entry entry;
+ /**< hash list entry, 64-bits key inside. */
+ struct mlx5_flow_tbl_resource tbl;
+ /**< flow table resource. */
+ LIST_HEAD(matchers, mlx5_flow_dv_matcher) matchers;
+ /**< matchers' header associated with the flow table. */
+ struct mlx5_flow_dv_jump_tbl_resource jump;
+ /**< jump resource, at most one for each table created. */
+ uint32_t idx; /**< index for the indexed mempool. */
+};
+
+/* Verbs specification header. */
+struct ibv_spec_header {
+ enum ibv_flow_spec_type type;
+ uint16_t size;
+};
+
+/* RSS description. */
+struct mlx5_flow_rss_desc {
+ uint32_t level;
+ uint32_t queue_num; /**< Number of entries in @p queue. */
+ uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
+ uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */
+ uint16_t queue[]; /**< Destination queues to redirect traffic to. */
+};
+
+
+/** Device flow handle structure for DV mode only. */
+struct mlx5_flow_handle_dv {
+ /* Flow DV api: */
+ struct mlx5_flow_dv_matcher *matcher; /**< Cache to matcher. */
+ struct mlx5_flow_dv_modify_hdr_resource *modify_hdr;
+ /**< Pointer to modify header resource in cache. */
+ uint32_t rix_encap_decap;
+ /**< Index to encap/decap resource in cache. */
+ uint32_t rix_push_vlan;
+ /**< Index to push VLAN action resource in cache. */
+ uint32_t rix_tag;
+ /**< Index to the tag action. */
+} __rte_packed;
+
+/** Device flow handle structure: used both for creating & destroying. */
+struct mlx5_flow_handle {
+ SILIST_ENTRY(uint32_t)next;
+ struct mlx5_vf_vlan vf_vlan; /**< Structure for VF VLAN workaround. */
+ /**< Index to next device flow handle. */
+ uint64_t layers;
+ /**< Bit-fields of present layers, see MLX5_FLOW_LAYER_*. */
+ void *ib_flow; /**< Verbs flow pointer. */
+ uint32_t split_flow_id:28; /**< Sub flow unique match flow id. */
+	uint32_t mark:1; /**< Metadata rxq mark flag. */
+ uint32_t fate_action:3; /**< Fate action type. */
+ union {
+ uint32_t rix_hrxq; /**< Hash Rx queue object index. */
+ uint32_t rix_jump; /**< Index to the jump action resource. */
+ uint32_t rix_port_id_action;
+ /**< Index to port ID action resource. */
+ uint32_t rix_fate;
+ /**< Generic value indicates the fate action. */
+ };
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ struct mlx5_flow_handle_dv dvh;
+#endif
+} __rte_packed;
+
+/*
+ * Size for Verbs device flow handle structure only. Do not use the DV only
+ * structure in Verbs. No DV flow attributes will be accessed.
+ * Macro offsetof() could also be used here.
+ */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+#define MLX5_FLOW_HANDLE_VERBS_SIZE \
+ (sizeof(struct mlx5_flow_handle) - sizeof(struct mlx5_flow_handle_dv))
+#else
+#define MLX5_FLOW_HANDLE_VERBS_SIZE (sizeof(struct mlx5_flow_handle))
+#endif
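+/*
+ * Equivalent illustrative definition using offsetof(), assuming the DV-only
+ * handle stays the last member of struct mlx5_flow_handle:
+ *   offsetof(struct mlx5_flow_handle, dvh) == MLX5_FLOW_HANDLE_VERBS_SIZE
+ */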
+
+/*
+ * Max number of actions per DV flow.
+ * See CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED
+ * in rdma-core file providers/mlx5/verbs.c.
+ */
+#define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8
+
+/** Device flow structure only for DV flow creation. */
+struct mlx5_flow_dv_workspace {
+ uint32_t group; /**< The group index. */
+ uint8_t transfer; /**< 1 if the flow is E-Switch flow. */
+ int actions_n; /**< number of actions. */
+ void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS]; /**< Action list. */
+ struct mlx5_flow_dv_encap_decap_resource *encap_decap;
+ /**< Pointer to encap/decap resource in cache. */
+ struct mlx5_flow_dv_push_vlan_action_resource *push_vlan_res;
+ /**< Pointer to push VLAN action resource in cache. */
+ struct mlx5_flow_dv_tag_resource *tag_resource;
+ /**< pointer to the tag action. */
+ struct mlx5_flow_dv_port_id_action_resource *port_id_action;
+ /**< Pointer to port ID action resource. */
+ struct mlx5_flow_dv_jump_tbl_resource *jump;
+ /**< Pointer to the jump action resource. */
+ struct mlx5_flow_dv_match_params value;
+ /**< Holds the value that the packet is compared to. */
+};
+
+/*
+ * Maximal Verbs flow specifications & actions size.
+ * Some elements are mutually exclusive, but enough space should be allocated.
+ * Tunnel cases: 1. Max 2 Ethernet + IP(v6 len > v4 len) + TCP/UDP headers.
+ * 2. One tunnel header (exception: GRE + MPLS),
+ * SPEC length: GRE == tunnel.
+ * Actions: 1. 1 Mark OR Flag.
+ * 2. 1 Drop (if any).
+ * 3. No limitation for counters, but it makes no sense to support too
+ * many counters in a single device flow.
+ */
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+#define MLX5_VERBS_MAX_SPEC_SIZE \
+ ( \
+ (2 * (sizeof(struct ibv_flow_spec_eth) + \
+ sizeof(struct ibv_flow_spec_ipv6) + \
+ sizeof(struct ibv_flow_spec_tcp_udp)) + \
+ sizeof(struct ibv_flow_spec_gre) + \
+ sizeof(struct ibv_flow_spec_mpls)) \
+ )
+#else
+#define MLX5_VERBS_MAX_SPEC_SIZE \
+ ( \
+ (2 * (sizeof(struct ibv_flow_spec_eth) + \
+ sizeof(struct ibv_flow_spec_ipv6) + \
+ sizeof(struct ibv_flow_spec_tcp_udp)) + \
+ sizeof(struct ibv_flow_spec_tunnel)) \
+ )
+#endif
+
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \
+ defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+#define MLX5_VERBS_MAX_ACT_SIZE \
+ ( \
+ sizeof(struct ibv_flow_spec_action_tag) + \
+ sizeof(struct ibv_flow_spec_action_drop) + \
+ sizeof(struct ibv_flow_spec_counter_action) * 4 \
+ )
+#else
+#define MLX5_VERBS_MAX_ACT_SIZE \
+ ( \
+ sizeof(struct ibv_flow_spec_action_tag) + \
+ sizeof(struct ibv_flow_spec_action_drop) \
+ )
+#endif
+
+#define MLX5_VERBS_MAX_SPEC_ACT_SIZE \
+ (MLX5_VERBS_MAX_SPEC_SIZE + MLX5_VERBS_MAX_ACT_SIZE)
+
+/** Device flow structure only for Verbs flow creation. */
+struct mlx5_flow_verbs_workspace {
+ unsigned int size; /**< Size of the attribute. */
+ struct ibv_flow_attr attr; /**< Verbs flow attribute buffer. */
+ uint8_t specs[MLX5_VERBS_MAX_SPEC_ACT_SIZE];
+ /**< Specifications & actions buffer of verbs flow. */
+};
+
+/** Maximal number of device sub-flows supported. */
+#define MLX5_NUM_MAX_DEV_FLOWS 32
+
+/** Device flow structure. */
+struct mlx5_flow {
+ struct rte_flow *flow; /**< Pointer to the main flow. */
+ uint32_t flow_idx; /**< The memory pool index to the main flow. */
+ uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+ uint64_t act_flags;
+ /**< Bit-fields of detected actions, see MLX5_FLOW_ACTION_*. */
+ bool external; /**< true if the flow is created external to PMD. */
+ uint8_t ingress; /**< 1 if the flow is ingress. */
+ union {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ struct mlx5_flow_dv_workspace dv;
+#endif
+ struct mlx5_flow_verbs_workspace verbs;
+ };
+ struct mlx5_flow_handle *handle;
+ uint32_t handle_idx; /* Index of the mlx5 flow handle memory. */
+};
+
+/* Flow meter state. */
+#define MLX5_FLOW_METER_DISABLE 0
+#define MLX5_FLOW_METER_ENABLE 1
+
+#define MLX5_MAN_WIDTH 8
+/* Modify this value if enum rte_mtr_color changes. */
+#define RTE_MTR_DROPPED RTE_COLORS
+
+/* Meter policer statistics */
+struct mlx5_flow_policer_stats {
+ uint32_t cnt[RTE_COLORS + 1];
+ /**< Color counter, extra for drop. */
+ uint64_t stats_mask;
+ /**< Statistics mask for the colors. */
+};
+
+/* Meter table structure. */
+struct mlx5_meter_domain_info {
+ struct mlx5_flow_tbl_resource *tbl;
+ /**< Meter table. */
+ struct mlx5_flow_tbl_resource *sfx_tbl;
+ /**< Meter suffix table. */
+ void *any_matcher;
+ /**< Meter color not match default criteria. */
+ void *color_matcher;
+ /**< Meter color match criteria. */
+ void *jump_actn;
+ /**< Meter match action. */
+ void *policer_rules[RTE_MTR_DROPPED + 1];
+ /**< Meter policer for the match. */
+};
+
+/* Meter table set for TX RX FDB. */
+struct mlx5_meter_domains_infos {
+ uint32_t ref_cnt;
+ /**< Table user count. */
+ struct mlx5_meter_domain_info egress;
+ /**< TX meter table. */
+ struct mlx5_meter_domain_info ingress;
+ /**< RX meter table. */
+ struct mlx5_meter_domain_info transfer;
+ /**< FDB meter table. */
+ void *drop_actn;
+ /**< Drop action as not matched. */
+ void *count_actns[RTE_MTR_DROPPED + 1];
+ /**< Counters for match and unmatched statistics. */
+ uint32_t fmp[MLX5_ST_SZ_DW(flow_meter_parameters)];
+ /**< Flow meter parameter. */
+ size_t fmp_size;
+ /**< Flow meter parameter size. */
+ void *meter_action;
+ /**< Flow meter action. */
+};
+
+/* Meter parameter structure. */
+struct mlx5_flow_meter {
+ TAILQ_ENTRY(mlx5_flow_meter) next;
+ /**< Pointer to the next flow meter structure. */
+ uint32_t idx; /* Index to meter object. */
+ uint32_t meter_id;
+ /**< Meter id. */
+ struct mlx5_flow_meter_profile *profile;
+ /**< Meter profile parameters. */
+
+ /** Policer actions (per meter output color). */
+ enum rte_mtr_policer_action action[RTE_COLORS];
+
+ /** Set of stats counters to be enabled.
+ * @see enum rte_mtr_stats_type
+ */
+ uint64_t stats_mask;
+
+ /**< Rule applies to ingress traffic. */
+ uint32_t ingress:1;
+
+ /**< Rule applies to egress traffic. */
+ uint32_t egress:1;
+ /**
+ * Instead of simply matching the properties of traffic as it would
+ * appear on a given DPDK port ID, enabling this attribute transfers
+ * a flow rule to the lowest possible level of any device endpoints
+ * found in the pattern.
+ *
+ * When supported, this effectively enables an application to
+ * re-route traffic not necessarily intended for it (e.g. coming
+ * from or addressed to different physical ports, VFs or
+ * applications) at the device level.
+ *
+ * It complements the behavior of some pattern items such as
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
+ *
+ * When transferring flow rules, ingress and egress attributes keep
+ * their original meaning, as if processing traffic emitted or
+ * received by the application.
+ */
+ uint32_t transfer:1;
+ struct mlx5_meter_domains_infos *mfts;
+ /**< Flow table created for this meter. */
+ struct mlx5_flow_policer_stats policer_stats;
+ /**< Meter policer statistics. */
+ uint32_t ref_cnt;
+ /**< Use count. */
+ uint32_t active_state:1;
+ /**< Meter state. */
+ uint32_t shared:1;
+ /**< Meter shared or not. */
+};
+
+/* RFC2697 parameter structure. */
+struct mlx5_flow_meter_srtcm_rfc2697_prm {
+ /* green_saturation_value = cbs_mantissa * 2^cbs_exponent */
+ uint32_t cbs_exponent:5;
+ uint32_t cbs_mantissa:8;
+ /* cir = 8G * cir_mantissa * 1/(2^cir_exponent) Bytes/Sec */
+ uint32_t cir_exponent:5;
+ uint32_t cir_mantissa:8;
+	/* yellow_saturation_value = ebs_mantissa * 2^ebs_exponent */
+ uint32_t ebs_exponent:5;
+ uint32_t ebs_mantissa:8;
+};
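+/*
+ * Illustrative encoding: cbs_mantissa = 64 with cbs_exponent = 10 describes
+ * a green (committed) burst size of 64 * 2^10 = 65536 bytes.
+ */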
+
+/* Flow meter profile structure. */
+struct mlx5_flow_meter_profile {
+ TAILQ_ENTRY(mlx5_flow_meter_profile) next;
+ /**< Pointer to the next flow meter structure. */
+ uint32_t meter_profile_id; /**< Profile id. */
+ struct rte_mtr_meter_profile profile; /**< Profile detail. */
+ union {
+ struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm_prm;
+ /**< srtcm_rfc2697 struct. */
+ };
+ uint32_t ref_cnt; /**< Use count. */
+};
+
+/* Fdir flow structure */
+struct mlx5_fdir_flow {
+ LIST_ENTRY(mlx5_fdir_flow) next; /* Pointer to the next element. */
+ struct mlx5_fdir *fdir; /* Pointer to fdir. */
+ uint32_t rix_flow; /* Index to flow. */
+};
+
+#define HAIRPIN_FLOW_ID_BITS 28
+
+/* Flow structure. */
+struct rte_flow {
+ ILIST_ENTRY(uint32_t)next; /**< Index to the next flow structure. */
+ uint32_t dev_handles;
+ /**< Device flow handles that are part of the flow. */
+ uint32_t drv_type:2; /**< Driver type. */
+ uint32_t fdir:1; /**< Identifier of associated FDIR if any. */
+ uint32_t hairpin_flow_id:HAIRPIN_FLOW_ID_BITS;
+ /**< The flow id used for hairpin. */
+	uint32_t copy_applied:1; /**< The MARK copy flow is applied. */
+ uint32_t rix_mreg_copy;
+ /**< Index to metadata register copy table resource. */
+ uint32_t counter; /**< Holds flow counter. */
+ uint16_t meter; /**< Holds flow meter id. */
+} __rte_packed;
+
+typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external,
+ int hairpin,
+ struct rte_flow_error *error);
+typedef struct mlx5_flow *(*mlx5_flow_prepare_t)
+ (struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[], struct rte_flow_error *error);
+typedef int (*mlx5_flow_translate_t)(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error);
+typedef int (*mlx5_flow_apply_t)(struct rte_eth_dev *dev, struct rte_flow *flow,
+ struct rte_flow_error *error);
+typedef void (*mlx5_flow_remove_t)(struct rte_eth_dev *dev,
+ struct rte_flow *flow);
+typedef void (*mlx5_flow_destroy_t)(struct rte_eth_dev *dev,
+ struct rte_flow *flow);
+typedef int (*mlx5_flow_query_t)(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_action *actions,
+ void *data,
+ struct rte_flow_error *error);
+typedef struct mlx5_meter_domains_infos *(*mlx5_flow_create_mtr_tbls_t)
+ (struct rte_eth_dev *dev,
+ const struct mlx5_flow_meter *fm);
+typedef int (*mlx5_flow_destroy_mtr_tbls_t)(struct rte_eth_dev *dev,
+ struct mlx5_meter_domains_infos *tbls);
+typedef int (*mlx5_flow_create_policer_rules_t)
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr);
+typedef int (*mlx5_flow_destroy_policer_rules_t)
+ (struct rte_eth_dev *dev,
+ const struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr);
+typedef uint32_t (*mlx5_flow_counter_alloc_t)
+ (struct rte_eth_dev *dev);
+typedef void (*mlx5_flow_counter_free_t)(struct rte_eth_dev *dev,
+ uint32_t cnt);
+typedef int (*mlx5_flow_counter_query_t)(struct rte_eth_dev *dev,
+ uint32_t cnt,
+ bool clear, uint64_t *pkts,
+ uint64_t *bytes);
+typedef int (*mlx5_flow_get_aged_flows_t)
+ (struct rte_eth_dev *dev,
+ void **context,
+ uint32_t nb_contexts,
+ struct rte_flow_error *error);
+struct mlx5_flow_driver_ops {
+ mlx5_flow_validate_t validate;
+ mlx5_flow_prepare_t prepare;
+ mlx5_flow_translate_t translate;
+ mlx5_flow_apply_t apply;
+ mlx5_flow_remove_t remove;
+ mlx5_flow_destroy_t destroy;
+ mlx5_flow_query_t query;
+ mlx5_flow_create_mtr_tbls_t create_mtr_tbls;
+ mlx5_flow_destroy_mtr_tbls_t destroy_mtr_tbls;
+ mlx5_flow_create_policer_rules_t create_policer_rules;
+ mlx5_flow_destroy_policer_rules_t destroy_policer_rules;
+ mlx5_flow_counter_alloc_t counter_alloc;
+ mlx5_flow_counter_free_t counter_free;
+ mlx5_flow_counter_query_t counter_query;
+ mlx5_flow_get_aged_flows_t get_aged_flows;
+};
+
+/* mlx5_flow.c */
+
+struct mlx5_flow_id_pool *mlx5_flow_id_pool_alloc(uint32_t max_id);
+void mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool);
+uint32_t mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id);
+uint32_t mlx5_flow_id_release(struct mlx5_flow_id_pool *pool,
+ uint32_t id);
+int mlx5_flow_group_to_table(const struct rte_flow_attr *attributes,
+ bool external, uint32_t group, bool fdb_def_rule,
+ uint32_t *table, struct rte_flow_error *error);
+uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc,
+ int tunnel, uint64_t layer_types,
+ uint64_t hash_fields);
+uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
+ uint32_t subpriority);
+int mlx5_flow_get_reg_id(struct rte_eth_dev *dev,
+ enum mlx5_feature_name feature,
+ uint32_t id,
+ struct rte_flow_error *error);
+const struct rte_flow_action *mlx5_flow_find_action
+ (const struct rte_flow_action *actions,
+ enum rte_flow_action_type action);
+int mlx5_flow_validate_action_count(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_action_drop(uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_action_flag(uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_action_mark(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_action_queue(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ uint64_t item_flags,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attributes,
+ struct rte_flow_error *error);
+int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
+ const uint8_t *mask,
+ const uint8_t *nic_mask,
+ unsigned int size,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_gre(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ const struct rte_flow_item *gre_item,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint64_t last_item,
+ uint16_t ether_type,
+ const struct rte_flow_item_ipv4 *acc_mask,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint64_t last_item,
+ uint16_t ether_type,
+ const struct rte_flow_item_ipv6 *acc_mask,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint64_t prev_layer,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_tcp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ const struct rte_flow_item_tcp *flow_mask,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_icmp(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ uint8_t target_protocol,
+ struct rte_flow_error *error);
+int mlx5_flow_validate_item_geneve(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error);
+struct mlx5_meter_domains_infos *mlx5_flow_create_mtr_tbls
+ (struct rte_eth_dev *dev,
+ const struct mlx5_flow_meter *fm);
+int mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev,
+ struct mlx5_meter_domains_infos *tbl);
+int mlx5_flow_create_policer_rules(struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr);
+int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr);
+int mlx5_flow_meter_flush(struct rte_eth_dev *dev,
+ struct rte_mtr_error *error);
+#endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c
new file mode 100644
index 000000000..e48183195
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c
@@ -0,0 +1,9666 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <sys/queue.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_ip.h>
+#include <rte_gre.h>
+#include <rte_vxlan.h>
+#include <rte_gtp.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_flow.h"
+#include "mlx5_rxtx.h"
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+
+#ifndef HAVE_IBV_FLOW_DEVX_COUNTERS
+#define MLX5DV_FLOW_ACTION_COUNTERS_DEVX 0
+#endif
+
+#ifndef HAVE_MLX5DV_DR_ESWITCH
+#ifndef MLX5DV_FLOW_TABLE_TYPE_FDB
+#define MLX5DV_FLOW_TABLE_TYPE_FDB 0
+#endif
+#endif
+
+#ifndef HAVE_MLX5DV_DR
+#define MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL 1
+#endif
+
+/* VLAN header definitions */
+#define MLX5DV_FLOW_VLAN_PCP_SHIFT 13
+#define MLX5DV_FLOW_VLAN_PCP_MASK (0x7 << MLX5DV_FLOW_VLAN_PCP_SHIFT)
+#define MLX5DV_FLOW_VLAN_VID_MASK 0x0fff
+#define MLX5DV_FLOW_VLAN_PCP_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK)
+#define MLX5DV_FLOW_VLAN_VID_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_VID_MASK)
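+/* Illustrative TCI layout: PCP 5 and VID 100 give (5 << 13) | 100 = 0xa064. */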
+
+union flow_dv_attr {
+ struct {
+ uint32_t valid:1;
+ uint32_t ipv4:1;
+ uint32_t ipv6:1;
+ uint32_t tcp:1;
+ uint32_t udp:1;
+ uint32_t reserved:27;
+ };
+ uint32_t attr;
+};
+
+static int
+flow_dv_tbl_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_tbl_resource *tbl);
+
+/**
+ * Initialize flow attributes structure according to flow items' types.
+ *
+ * flow_dv_validate() avoids multiple L3/L4 layers cases other than tunnel
+ * mode. For tunnel mode, the items to be modified are the outermost ones.
+ *
+ * @param[in] item
+ * Pointer to item specification.
+ * @param[out] attr
+ * Pointer to flow attributes structure.
+ * @param[in] dev_flow
+ * Pointer to the sub flow.
+ * @param[in] tunnel_decap
+ * Whether action is after tunnel decapsulation.
+ */
+static void
+flow_dv_attr_init(const struct rte_flow_item *item, union flow_dv_attr *attr,
+ struct mlx5_flow *dev_flow, bool tunnel_decap)
+{
+ uint64_t layers = dev_flow->handle->layers;
+
+ /*
+ * If layers is already initialized, it means this dev_flow is the
+	 * suffix flow and the layer flags were set by the prefix flow. Use
+	 * the layer flags from the prefix flow, as the suffix flow may not
+	 * carry the user defined items once the flow is split.
+ */
+ if (layers) {
+ if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4)
+ attr->ipv4 = 1;
+ else if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6)
+ attr->ipv6 = 1;
+ if (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP)
+ attr->tcp = 1;
+ else if (layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)
+ attr->udp = 1;
+ attr->valid = 1;
+ return;
+ }
+ for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) {
+ uint8_t next_protocol = 0xff;
+ switch (item->type) {
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ case RTE_FLOW_ITEM_TYPE_NVGRE:
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ case RTE_FLOW_ITEM_TYPE_GENEVE:
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ if (tunnel_decap)
+ attr->attr = 0;
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ if (!attr->ipv6)
+ attr->ipv4 = 1;
+ if (item->mask != NULL &&
+ ((const struct rte_flow_item_ipv4 *)
+ item->mask)->hdr.next_proto_id)
+ next_protocol =
+ ((const struct rte_flow_item_ipv4 *)
+ (item->spec))->hdr.next_proto_id &
+ ((const struct rte_flow_item_ipv4 *)
+ (item->mask))->hdr.next_proto_id;
+ if ((next_protocol == IPPROTO_IPIP ||
+ next_protocol == IPPROTO_IPV6) && tunnel_decap)
+ attr->attr = 0;
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ if (!attr->ipv4)
+ attr->ipv6 = 1;
+ if (item->mask != NULL &&
+ ((const struct rte_flow_item_ipv6 *)
+ item->mask)->hdr.proto)
+ next_protocol =
+ ((const struct rte_flow_item_ipv6 *)
+ (item->spec))->hdr.proto &
+ ((const struct rte_flow_item_ipv6 *)
+ (item->mask))->hdr.proto;
+ if ((next_protocol == IPPROTO_IPIP ||
+ next_protocol == IPPROTO_IPV6) && tunnel_decap)
+ attr->attr = 0;
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ if (!attr->tcp)
+ attr->udp = 1;
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ if (!attr->udp)
+ attr->tcp = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ attr->valid = 1;
+}
+
+/**
+ * Convert rte_mtr_color to mlx5 color.
+ *
+ * @param[in] rcol
+ * rte_mtr_color.
+ *
+ * @return
+ * mlx5 color.
+ */
+static int
+rte_col_2_mlx5_col(enum rte_color rcol)
+{
+ switch (rcol) {
+ case RTE_COLOR_GREEN:
+ return MLX5_FLOW_COLOR_GREEN;
+ case RTE_COLOR_YELLOW:
+ return MLX5_FLOW_COLOR_YELLOW;
+ case RTE_COLOR_RED:
+ return MLX5_FLOW_COLOR_RED;
+ default:
+ break;
+ }
+ return MLX5_FLOW_COLOR_UNDEFINED;
+}
+
+struct field_modify_info {
+ uint32_t size; /* Size of field in protocol header, in bytes. */
+ uint32_t offset; /* Offset of field in protocol header, in bytes. */
+ enum mlx5_modification_field id;
+};
+
+struct field_modify_info modify_eth[] = {
+ {4, 0, MLX5_MODI_OUT_DMAC_47_16},
+ {2, 4, MLX5_MODI_OUT_DMAC_15_0},
+ {4, 6, MLX5_MODI_OUT_SMAC_47_16},
+ {2, 10, MLX5_MODI_OUT_SMAC_15_0},
+ {0, 0, 0},
+};
+
+struct field_modify_info modify_vlan_out_first_vid[] = {
+ /* Size in bits !!! */
+ {12, 0, MLX5_MODI_OUT_FIRST_VID},
+ {0, 0, 0},
+};
+
+struct field_modify_info modify_ipv4[] = {
+ {1, 1, MLX5_MODI_OUT_IP_DSCP},
+ {1, 8, MLX5_MODI_OUT_IPV4_TTL},
+ {4, 12, MLX5_MODI_OUT_SIPV4},
+ {4, 16, MLX5_MODI_OUT_DIPV4},
+ {0, 0, 0},
+};
+
+struct field_modify_info modify_ipv6[] = {
+ {1, 0, MLX5_MODI_OUT_IP_DSCP},
+ {1, 7, MLX5_MODI_OUT_IPV6_HOPLIMIT},
+ {4, 8, MLX5_MODI_OUT_SIPV6_127_96},
+ {4, 12, MLX5_MODI_OUT_SIPV6_95_64},
+ {4, 16, MLX5_MODI_OUT_SIPV6_63_32},
+ {4, 20, MLX5_MODI_OUT_SIPV6_31_0},
+ {4, 24, MLX5_MODI_OUT_DIPV6_127_96},
+ {4, 28, MLX5_MODI_OUT_DIPV6_95_64},
+ {4, 32, MLX5_MODI_OUT_DIPV6_63_32},
+ {4, 36, MLX5_MODI_OUT_DIPV6_31_0},
+ {0, 0, 0},
+};
+
+struct field_modify_info modify_udp[] = {
+ {2, 0, MLX5_MODI_OUT_UDP_SPORT},
+ {2, 2, MLX5_MODI_OUT_UDP_DPORT},
+ {0, 0, 0},
+};
+
+struct field_modify_info modify_tcp[] = {
+ {2, 0, MLX5_MODI_OUT_TCP_SPORT},
+ {2, 2, MLX5_MODI_OUT_TCP_DPORT},
+ {4, 4, MLX5_MODI_OUT_TCP_SEQ_NUM},
+ {4, 8, MLX5_MODI_OUT_TCP_ACK_NUM},
+ {0, 0, 0},
+};
+
+static void
+mlx5_flow_tunnel_ip_check(const struct rte_flow_item *item __rte_unused,
+ uint8_t next_protocol, uint64_t *item_flags,
+ int *tunnel)
+{
+ MLX5_ASSERT(item->type == RTE_FLOW_ITEM_TYPE_IPV4 ||
+ item->type == RTE_FLOW_ITEM_TYPE_IPV6);
+ if (next_protocol == IPPROTO_IPIP) {
+ *item_flags |= MLX5_FLOW_LAYER_IPIP;
+ *tunnel = 1;
+ }
+ if (next_protocol == IPPROTO_IPV6) {
+ *item_flags |= MLX5_FLOW_LAYER_IPV6_ENCAP;
+ *tunnel = 1;
+ }
+}
+
+/**
+ * Acquire the synchronizing object to protect multithreaded access
+ * to shared dv context. Lock occurs only if context is actually
+ * shared, i.e. we have multiport IB device and representors are
+ * created.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ */
+static void
+flow_dv_shared_lock(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ if (sh->dv_refcnt > 1) {
+ int ret;
+
+ ret = pthread_mutex_lock(&sh->dv_mutex);
+ MLX5_ASSERT(!ret);
+ (void)ret;
+ }
+}
+
+static void
+flow_dv_shared_unlock(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+
+ if (sh->dv_refcnt > 1) {
+ int ret;
+
+ ret = pthread_mutex_unlock(&sh->dv_mutex);
+ MLX5_ASSERT(!ret);
+ (void)ret;
+ }
+}
+
+/**
+ * Update VLAN's VID/PCP based on input rte_flow_action.
+ *
+ * @param[in] action
+ * Pointer to struct rte_flow_action.
+ * @param[out] vlan
+ * Pointer to struct rte_vlan_hdr.
+ */
+static void
+mlx5_update_vlan_vid_pcp(const struct rte_flow_action *action,
+ struct rte_vlan_hdr *vlan)
+{
+ uint16_t vlan_tci;
+ if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
+ vlan_tci =
+ ((const struct rte_flow_action_of_set_vlan_pcp *)
+ action->conf)->vlan_pcp;
+ vlan_tci = vlan_tci << MLX5DV_FLOW_VLAN_PCP_SHIFT;
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK;
+ vlan->vlan_tci |= vlan_tci;
+ } else if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK;
+ vlan->vlan_tci |= rte_be_to_cpu_16
+ (((const struct rte_flow_action_of_set_vlan_vid *)
+ action->conf)->vlan_vid);
+ }
+}
+
+/**
+ * Fetch 1, 2, 3 or 4 byte field from the byte array
+ * and return as unsigned integer in host-endian format.
+ *
+ * @param[in] data
+ * Pointer to data array.
+ * @param[in] size
+ * Size of field to extract.
+ *
+ * @return
+ * converted field in host endian format.
+ */
+static inline uint32_t
+flow_dv_fetch_field(const uint8_t *data, uint32_t size)
+{
+ uint32_t ret;
+
+ switch (size) {
+ case 1:
+ ret = *data;
+ break;
+ case 2:
+ ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data);
+ break;
+ case 3:
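+		/* e.g. data = {0x12, 0x34, 0x56} yields 0x123456. */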
+ ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data);
+ ret = (ret << 8) | *(data + sizeof(uint16_t));
+ break;
+ case 4:
+ ret = rte_be_to_cpu_32(*(const unaligned_uint32_t *)data);
+ break;
+ default:
+ MLX5_ASSERT(false);
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * Convert modify-header action to DV specification.
+ *
+ * Data length of each action is determined by provided field description
+ * and the item mask. Data bit offset and width of each action is determined
+ * by provided item mask.
+ *
+ * @param[in] item
+ * Pointer to item specification.
+ * @param[in] field
+ * Pointer to field modification information.
+ * For MLX5_MODIFICATION_TYPE_SET specifies destination field.
+ * For MLX5_MODIFICATION_TYPE_ADD specifies destination field.
+ * For MLX5_MODIFICATION_TYPE_COPY specifies source field.
+ * @param[in] dcopy
+ * Destination field info for MLX5_MODIFICATION_TYPE_COPY in @type.
+ * Negative offset value sets the same offset as source offset.
+ * size field is ignored, value is taken from source field.
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] type
+ * Type of modification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_modify_action(struct rte_flow_item *item,
+ struct field_modify_info *field,
+ struct field_modify_info *dcopy,
+ struct mlx5_flow_dv_modify_hdr_resource *resource,
+ uint32_t type, struct rte_flow_error *error)
+{
+ uint32_t i = resource->actions_num;
+ struct mlx5_modification_cmd *actions = resource->actions;
+
+ /*
+ * The item and mask are provided in big-endian format.
+	 * The fields should be presented in big-endian format as well.
+ * Mask must be always present, it defines the actual field width.
+ */
+ MLX5_ASSERT(item->mask);
+ MLX5_ASSERT(field->size);
+ do {
+ unsigned int size_b;
+ unsigned int off_b;
+ uint32_t mask;
+ uint32_t data;
+
+ if (i >= MLX5_MAX_MODIFY_NUM)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "too many items to modify");
+ /* Fetch variable byte size mask from the array. */
+ mask = flow_dv_fetch_field((const uint8_t *)item->mask +
+ field->offset, field->size);
+ if (!mask) {
+ ++field;
+ continue;
+ }
+ /* Deduce actual data width in bits from mask value. */
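+		/* e.g. mask = 0x00ffff00 gives off_b = 8 and size_b = 16. */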
+ off_b = rte_bsf32(mask);
+ size_b = sizeof(uint32_t) * CHAR_BIT -
+ off_b - __builtin_clz(mask);
+ MLX5_ASSERT(size_b);
+ size_b = size_b == sizeof(uint32_t) * CHAR_BIT ? 0 : size_b;
+ actions[i] = (struct mlx5_modification_cmd) {
+ .action_type = type,
+ .field = field->id,
+ .offset = off_b,
+ .length = size_b,
+ };
+ /* Convert entire record to expected big-endian format. */
+ actions[i].data0 = rte_cpu_to_be_32(actions[i].data0);
+ if (type == MLX5_MODIFICATION_TYPE_COPY) {
+ MLX5_ASSERT(dcopy);
+ actions[i].dst_field = dcopy->id;
+ actions[i].dst_offset =
+ (int)dcopy->offset < 0 ? off_b : dcopy->offset;
+ /* Convert entire record to big-endian format. */
+ actions[i].data1 = rte_cpu_to_be_32(actions[i].data1);
+ } else {
+ MLX5_ASSERT(item->spec);
+ data = flow_dv_fetch_field((const uint8_t *)item->spec +
+ field->offset, field->size);
+ /* Shift out the trailing masked bits from data. */
+ data = (data & mask) >> off_b;
+ actions[i].data1 = rte_cpu_to_be_32(data);
+ }
+ ++i;
+ ++field;
+ } while (field->size);
+ if (resource->actions_num == i)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "invalid modification flow item");
+ resource->actions_num = i;
+ return 0;
+}
+
+/**
+ * Convert modify-header set IPv4 address action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_ipv4
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_ipv4 *conf =
+ (const struct rte_flow_action_set_ipv4 *)(action->conf);
+ struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 };
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv4 ipv4_mask;
+
+ memset(&ipv4, 0, sizeof(ipv4));
+ memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC) {
+ ipv4.hdr.src_addr = conf->ipv4_addr;
+ ipv4_mask.hdr.src_addr = rte_flow_item_ipv4_mask.hdr.src_addr;
+ } else {
+ ipv4.hdr.dst_addr = conf->ipv4_addr;
+ ipv4_mask.hdr.dst_addr = rte_flow_item_ipv4_mask.hdr.dst_addr;
+ }
+ item.spec = &ipv4;
+ item.mask = &ipv4_mask;
+ return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set IPv6 address action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_ipv6
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_ipv6 *conf =
+ (const struct rte_flow_action_set_ipv6 *)(action->conf);
+ struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 };
+ struct rte_flow_item_ipv6 ipv6;
+ struct rte_flow_item_ipv6 ipv6_mask;
+
+ memset(&ipv6, 0, sizeof(ipv6));
+ memset(&ipv6_mask, 0, sizeof(ipv6_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC) {
+ memcpy(&ipv6.hdr.src_addr, &conf->ipv6_addr,
+ sizeof(ipv6.hdr.src_addr));
+ memcpy(&ipv6_mask.hdr.src_addr,
+ &rte_flow_item_ipv6_mask.hdr.src_addr,
+ sizeof(ipv6.hdr.src_addr));
+ } else {
+ memcpy(&ipv6.hdr.dst_addr, &conf->ipv6_addr,
+ sizeof(ipv6.hdr.dst_addr));
+ memcpy(&ipv6_mask.hdr.dst_addr,
+ &rte_flow_item_ipv6_mask.hdr.dst_addr,
+ sizeof(ipv6.hdr.dst_addr));
+ }
+ item.spec = &ipv6;
+ item.mask = &ipv6_mask;
+ return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set MAC address action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_mac
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_mac *conf =
+ (const struct rte_flow_action_set_mac *)(action->conf);
+ struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_ETH };
+ struct rte_flow_item_eth eth;
+ struct rte_flow_item_eth eth_mask;
+
+ memset(&eth, 0, sizeof(eth));
+ memset(&eth_mask, 0, sizeof(eth_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC) {
+ memcpy(&eth.src.addr_bytes, &conf->mac_addr,
+ sizeof(eth.src.addr_bytes));
+ memcpy(&eth_mask.src.addr_bytes,
+ &rte_flow_item_eth_mask.src.addr_bytes,
+ sizeof(eth_mask.src.addr_bytes));
+ } else {
+ memcpy(&eth.dst.addr_bytes, &conf->mac_addr,
+ sizeof(eth.dst.addr_bytes));
+ memcpy(&eth_mask.dst.addr_bytes,
+ &rte_flow_item_eth_mask.dst.addr_bytes,
+ sizeof(eth_mask.dst.addr_bytes));
+ }
+ item.spec = &eth;
+ item.mask = &eth_mask;
+ return flow_dv_convert_modify_action(&item, modify_eth, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set VLAN VID action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_vlan_vid
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_of_set_vlan_vid *conf =
+ (const struct rte_flow_action_of_set_vlan_vid *)(action->conf);
+ int i = resource->actions_num;
+ struct mlx5_modification_cmd *actions = resource->actions;
+ struct field_modify_info *field = modify_vlan_out_first_vid;
+
+ if (i >= MLX5_MAX_MODIFY_NUM)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "too many items to modify");
+ actions[i] = (struct mlx5_modification_cmd) {
+ .action_type = MLX5_MODIFICATION_TYPE_SET,
+ .field = field->id,
+ .length = field->size,
+ .offset = field->offset,
+ };
+ actions[i].data0 = rte_cpu_to_be_32(actions[i].data0);
+ actions[i].data1 = conf->vlan_vid;
+ actions[i].data1 = actions[i].data1 << 16;
+ resource->actions_num = ++i;
+ return 0;
+}
+
+/**
+ * Convert modify-header set TP action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[in] items
+ * Pointer to rte_flow_item objects list.
+ * @param[in] attr
+ * Pointer to flow attributes structure.
+ * @param[in] dev_flow
+ * Pointer to the sub flow.
+ * @param[in] tunnel_decap
+ * Whether action is after tunnel decapsulation.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_tp
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ const struct rte_flow_item *items,
+ union flow_dv_attr *attr, struct mlx5_flow *dev_flow,
+ bool tunnel_decap, struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_tp *conf =
+ (const struct rte_flow_action_set_tp *)(action->conf);
+ struct rte_flow_item item;
+ struct rte_flow_item_udp udp;
+ struct rte_flow_item_udp udp_mask;
+ struct rte_flow_item_tcp tcp;
+ struct rte_flow_item_tcp tcp_mask;
+ struct field_modify_info *field;
+
+ if (!attr->valid)
+ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap);
+ if (attr->udp) {
+ memset(&udp, 0, sizeof(udp));
+ memset(&udp_mask, 0, sizeof(udp_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) {
+ udp.hdr.src_port = conf->port;
+ udp_mask.hdr.src_port =
+ rte_flow_item_udp_mask.hdr.src_port;
+ } else {
+ udp.hdr.dst_port = conf->port;
+ udp_mask.hdr.dst_port =
+ rte_flow_item_udp_mask.hdr.dst_port;
+ }
+ item.type = RTE_FLOW_ITEM_TYPE_UDP;
+ item.spec = &udp;
+ item.mask = &udp_mask;
+ field = modify_udp;
+ } else {
+ MLX5_ASSERT(attr->tcp);
+ memset(&tcp, 0, sizeof(tcp));
+ memset(&tcp_mask, 0, sizeof(tcp_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) {
+ tcp.hdr.src_port = conf->port;
+ tcp_mask.hdr.src_port =
+ rte_flow_item_tcp_mask.hdr.src_port;
+ } else {
+ tcp.hdr.dst_port = conf->port;
+ tcp_mask.hdr.dst_port =
+ rte_flow_item_tcp_mask.hdr.dst_port;
+ }
+ item.type = RTE_FLOW_ITEM_TYPE_TCP;
+ item.spec = &tcp;
+ item.mask = &tcp_mask;
+ field = modify_tcp;
+ }
+ return flow_dv_convert_modify_action(&item, field, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set TTL action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[in] items
+ * Pointer to rte_flow_item objects list.
+ * @param[in] attr
+ * Pointer to flow attributes structure.
+ * @param[in] dev_flow
+ * Pointer to the sub flow.
+ * @param[in] tunnel_decap
+ * Whether action is after tunnel decapsulation.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_ttl
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ const struct rte_flow_item *items,
+ union flow_dv_attr *attr, struct mlx5_flow *dev_flow,
+ bool tunnel_decap, struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_ttl *conf =
+ (const struct rte_flow_action_set_ttl *)(action->conf);
+ struct rte_flow_item item;
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv4 ipv4_mask;
+ struct rte_flow_item_ipv6 ipv6;
+ struct rte_flow_item_ipv6 ipv6_mask;
+ struct field_modify_info *field;
+
+ if (!attr->valid)
+ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap);
+ if (attr->ipv4) {
+ memset(&ipv4, 0, sizeof(ipv4));
+ memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+ ipv4.hdr.time_to_live = conf->ttl_value;
+ ipv4_mask.hdr.time_to_live = 0xFF;
+ item.type = RTE_FLOW_ITEM_TYPE_IPV4;
+ item.spec = &ipv4;
+ item.mask = &ipv4_mask;
+ field = modify_ipv4;
+ } else {
+ MLX5_ASSERT(attr->ipv6);
+ memset(&ipv6, 0, sizeof(ipv6));
+ memset(&ipv6_mask, 0, sizeof(ipv6_mask));
+ ipv6.hdr.hop_limits = conf->ttl_value;
+ ipv6_mask.hdr.hop_limits = 0xFF;
+ item.type = RTE_FLOW_ITEM_TYPE_IPV6;
+ item.spec = &ipv6;
+ item.mask = &ipv6_mask;
+ field = modify_ipv6;
+ }
+ return flow_dv_convert_modify_action(&item, field, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header decrement TTL action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[in] items
+ * Pointer to rte_flow_item objects list.
+ * @param[in] attr
+ * Pointer to flow attributes structure.
+ * @param[in] dev_flow
+ * Pointer to the sub flow.
+ * @param[in] tunnel_decap
+ * Whether action is after tunnel decapsulation.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_dec_ttl
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_item *items,
+ union flow_dv_attr *attr, struct mlx5_flow *dev_flow,
+ bool tunnel_decap, struct rte_flow_error *error)
+{
+ struct rte_flow_item item;
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv4 ipv4_mask;
+ struct rte_flow_item_ipv6 ipv6;
+ struct rte_flow_item_ipv6 ipv6_mask;
+ struct field_modify_info *field;
+
+ if (!attr->valid)
+ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap);
+ if (attr->ipv4) {
+ memset(&ipv4, 0, sizeof(ipv4));
+ memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+ ipv4.hdr.time_to_live = 0xFF;
+ ipv4_mask.hdr.time_to_live = 0xFF;
+ item.type = RTE_FLOW_ITEM_TYPE_IPV4;
+ item.spec = &ipv4;
+ item.mask = &ipv4_mask;
+ field = modify_ipv4;
+ } else {
+ MLX5_ASSERT(attr->ipv6);
+ memset(&ipv6, 0, sizeof(ipv6));
+ memset(&ipv6_mask, 0, sizeof(ipv6_mask));
+ ipv6.hdr.hop_limits = 0xFF;
+ ipv6_mask.hdr.hop_limits = 0xFF;
+ item.type = RTE_FLOW_ITEM_TYPE_IPV6;
+ item.spec = &ipv6;
+ item.mask = &ipv6_mask;
+ field = modify_ipv6;
+ }
+ return flow_dv_convert_modify_action(&item, field, NULL, resource,
+ MLX5_MODIFICATION_TYPE_ADD, error);
+}
+
+/**
+ * Convert modify-header increment/decrement TCP Sequence number
+ * to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_tcp_seq
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const rte_be32_t *conf = (const rte_be32_t *)(action->conf);
+ uint64_t value = rte_be_to_cpu_32(*conf);
+ struct rte_flow_item item;
+ struct rte_flow_item_tcp tcp;
+ struct rte_flow_item_tcp tcp_mask;
+
+ memset(&tcp, 0, sizeof(tcp));
+ memset(&tcp_mask, 0, sizeof(tcp_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ)
+ /*
+ * The HW has no decrement operation, only increment operation.
+ * To simulate decrement X from Y using increment operation
+ * we need to add UINT32_MAX X times to Y.
+ * Each adding of UINT32_MAX decrements Y by 1.
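+		 * E.g. decrementing by 3: Y + 3 * UINT32_MAX == Y - 3 (mod 2^32).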
+ */
+ value *= UINT32_MAX;
+ tcp.hdr.sent_seq = rte_cpu_to_be_32((uint32_t)value);
+ tcp_mask.hdr.sent_seq = RTE_BE32(UINT32_MAX);
+ item.type = RTE_FLOW_ITEM_TYPE_TCP;
+ item.spec = &tcp;
+ item.mask = &tcp_mask;
+ return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource,
+ MLX5_MODIFICATION_TYPE_ADD, error);
+}
+
+/**
+ * Convert modify-header increment/decrement TCP Acknowledgment number
+ * to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_tcp_ack
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const rte_be32_t *conf = (const rte_be32_t *)(action->conf);
+ uint64_t value = rte_be_to_cpu_32(*conf);
+ struct rte_flow_item item;
+ struct rte_flow_item_tcp tcp;
+ struct rte_flow_item_tcp tcp_mask;
+
+ memset(&tcp, 0, sizeof(tcp));
+ memset(&tcp_mask, 0, sizeof(tcp_mask));
+ if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK)
+ /*
+ * The HW has no decrement operation, only increment operation.
+ * To simulate decrement X from Y using increment operation
+ * we need to add UINT32_MAX X times to Y.
+ * Each adding of UINT32_MAX decrements Y by 1.
+ */
+ value *= UINT32_MAX;
+ tcp.hdr.recv_ack = rte_cpu_to_be_32((uint32_t)value);
+ tcp_mask.hdr.recv_ack = RTE_BE32(UINT32_MAX);
+ item.type = RTE_FLOW_ITEM_TYPE_TCP;
+ item.spec = &tcp;
+ item.mask = &tcp_mask;
+ return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource,
+ MLX5_MODIFICATION_TYPE_ADD, error);
+}
+
+static enum mlx5_modification_field reg_to_field[] = {
+ [REG_NONE] = MLX5_MODI_OUT_NONE,
+ [REG_A] = MLX5_MODI_META_DATA_REG_A,
+ [REG_B] = MLX5_MODI_META_DATA_REG_B,
+ [REG_C_0] = MLX5_MODI_META_REG_C_0,
+ [REG_C_1] = MLX5_MODI_META_REG_C_1,
+ [REG_C_2] = MLX5_MODI_META_REG_C_2,
+ [REG_C_3] = MLX5_MODI_META_REG_C_3,
+ [REG_C_4] = MLX5_MODI_META_REG_C_4,
+ [REG_C_5] = MLX5_MODI_META_REG_C_5,
+ [REG_C_6] = MLX5_MODI_META_REG_C_6,
+ [REG_C_7] = MLX5_MODI_META_REG_C_7,
+};
+
+/**
+ * Convert register set to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_set_reg
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_rte_flow_action_set_tag *conf = action->conf;
+ struct mlx5_modification_cmd *actions = resource->actions;
+ uint32_t i = resource->actions_num;
+
+ if (i >= MLX5_MAX_MODIFY_NUM)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "too many items to modify");
+ MLX5_ASSERT(conf->id != REG_NONE);
+ MLX5_ASSERT(conf->id < RTE_DIM(reg_to_field));
+ actions[i] = (struct mlx5_modification_cmd) {
+ .action_type = MLX5_MODIFICATION_TYPE_SET,
+ .field = reg_to_field[conf->id],
+ };
+ actions[i].data0 = rte_cpu_to_be_32(actions[i].data0);
+ actions[i].data1 = rte_cpu_to_be_32(conf->data);
+ ++i;
+ resource->actions_num = i;
+ return 0;
+}
+
+/**
+ * Convert SET_TAG action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] conf
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_set_tag
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action_set_tag *conf,
+ struct rte_flow_error *error)
+{
+ rte_be32_t data = rte_cpu_to_be_32(conf->data);
+ rte_be32_t mask = rte_cpu_to_be_32(conf->mask);
+ struct rte_flow_item item = {
+ .spec = &data,
+ .mask = &mask,
+ };
+ struct field_modify_info reg_c_x[] = {
+ [1] = {0, 0, 0},
+ };
+ enum mlx5_modification_field reg_type;
+ int ret;
+
+ ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error);
+ if (ret < 0)
+ return ret;
+ MLX5_ASSERT(ret != REG_NONE);
+ MLX5_ASSERT((unsigned int)ret < RTE_DIM(reg_to_field));
+ reg_type = reg_to_field[ret];
+ MLX5_ASSERT(reg_type > 0);
+ reg_c_x[0] = (struct field_modify_info){4, 0, reg_type};
+ return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert internal COPY_REG action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in,out] res
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_copy_mreg(struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_modify_hdr_resource *res,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_flow_action_copy_mreg *conf = action->conf;
+ rte_be32_t mask = RTE_BE32(UINT32_MAX);
+ struct rte_flow_item item = {
+ .spec = NULL,
+ .mask = &mask,
+ };
+ struct field_modify_info reg_src[] = {
+ {4, 0, reg_to_field[conf->src]},
+ {0, 0, 0},
+ };
+ struct field_modify_info reg_dst = {
+ .offset = 0,
+ .id = reg_to_field[conf->dst],
+ };
+ /* Adjust reg_c[0] usage according to reported mask. */
+ if (conf->dst == REG_C_0 || conf->src == REG_C_0) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t reg_c0 = priv->sh->dv_regc0_mask;
+
+ MLX5_ASSERT(reg_c0);
+ MLX5_ASSERT(priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY);
+ if (conf->dst == REG_C_0) {
+ /* Copy to reg_c[0], within mask only. */
+ reg_dst.offset = rte_bsf32(reg_c0);
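+			/* e.g. reg_c0 = 0x00ff0000 gives reg_dst.offset = 16. */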
+ /*
+		 * The mask ignores the endianness because
+		 * there is no conversion in the datapath.
+ */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+ /* Copy from destination lower bits to reg_c[0]. */
+ mask = reg_c0 >> reg_dst.offset;
+#else
+ /* Copy from destination upper bits to reg_c[0]. */
+ mask = reg_c0 << (sizeof(reg_c0) * CHAR_BIT -
+ rte_fls_u32(reg_c0));
+#endif
+ } else {
+ mask = rte_cpu_to_be_32(reg_c0);
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+ /* Copy from reg_c[0] to destination lower bits. */
+ reg_dst.offset = 0;
+#else
+ /* Copy from reg_c[0] to destination upper bits. */
+ reg_dst.offset = sizeof(reg_c0) * CHAR_BIT -
+ (rte_fls_u32(reg_c0) -
+ rte_bsf32(reg_c0));
+#endif
+ }
+ }
+ return flow_dv_convert_modify_action(&item,
+ reg_src, &reg_dst, res,
+ MLX5_MODIFICATION_TYPE_COPY,
+ error);
+}
+
+/**
+ * Convert MARK action to DV specification. This routine is used
+ * only in extended metadata mode and requires the MARK value to be
+ * handled via a metadata register. In legacy mode the hardware tag
+ * resource is used instead.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] conf
+ * Pointer to MARK action specification.
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_mark(struct rte_eth_dev *dev,
+ const struct rte_flow_action_mark *conf,
+ struct mlx5_flow_dv_modify_hdr_resource *resource,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ rte_be32_t mask = rte_cpu_to_be_32(MLX5_FLOW_MARK_MASK &
+ priv->sh->dv_mark_mask);
+ rte_be32_t data = rte_cpu_to_be_32(conf->id) & mask;
+ struct rte_flow_item item = {
+ .spec = &data,
+ .mask = &mask,
+ };
+ struct field_modify_info reg_c_x[] = {
+ {4, 0, 0}, /* dynamic instead of MLX5_MODI_META_REG_C_1. */
+ {0, 0, 0},
+ };
+ int reg;
+
+ if (!mask)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "zero mark action mask");
+ reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error);
+ if (reg < 0)
+ return reg;
+ MLX5_ASSERT(reg > 0);
+ if (reg == REG_C_0) {
+ uint32_t msk_c0 = priv->sh->dv_regc0_mask;
+ uint32_t shl_c0 = rte_bsf32(msk_c0);
+
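+		/*
+		 * Shift the mark value and mask into the reg_c[0] bits
+		 * reported by dv_regc0_mask; rte_cpu_to_be_32() is applied
+		 * twice to shift in CPU order and store back big-endian.
+		 */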
+ data = rte_cpu_to_be_32(rte_cpu_to_be_32(data) << shl_c0);
+ mask = rte_cpu_to_be_32(mask) & msk_c0;
+ mask = rte_cpu_to_be_32(mask << shl_c0);
+ }
+ reg_c_x[0].id = reg_to_field[reg];
+ return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Get metadata register index for specified steering domain.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] attr
+ * Attributes of flow to determine steering domain.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * positive index on success, a negative errno value otherwise
+ * and rte_errno is set.
+ */
+static enum modify_reg
+flow_dv_get_metadata_reg(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ int reg =
+ mlx5_flow_get_reg_id(dev, attr->transfer ?
+ MLX5_METADATA_FDB :
+ attr->egress ?
+ MLX5_METADATA_TX :
+ MLX5_METADATA_RX, 0, error);
+ if (reg < 0)
+ return rte_flow_error_set(error,
+ ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "unavailable "
+ "metadata register");
+ return reg;
+}
+
+/**
+ * Convert SET_META action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[in] conf
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_set_meta
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_action_set_meta *conf,
+ struct rte_flow_error *error)
+{
+ uint32_t data = conf->data;
+ uint32_t mask = conf->mask;
+ struct rte_flow_item item = {
+ .spec = &data,
+ .mask = &mask,
+ };
+ struct field_modify_info reg_c_x[] = {
+ [1] = {0, 0, 0},
+ };
+ int reg = flow_dv_get_metadata_reg(dev, attr, error);
+
+ if (reg < 0)
+ return reg;
+ /*
+	 * In the datapath code there are no endianness
+	 * conversions for performance reasons; all
+	 * pattern conversions are done in rte_flow.
+ */
+ if (reg == REG_C_0) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t msk_c0 = priv->sh->dv_regc0_mask;
+ uint32_t shl_c0;
+
+ MLX5_ASSERT(msk_c0);
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+ shl_c0 = rte_bsf32(msk_c0);
+#else
+ shl_c0 = sizeof(msk_c0) * CHAR_BIT - rte_fls_u32(msk_c0);
+#endif
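+		/* Shift the value and mask into the usable reg_c[0] bits. */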
+ mask <<= shl_c0;
+ data <<= shl_c0;
+ MLX5_ASSERT(!(~msk_c0 & rte_cpu_to_be_32(mask)));
+ }
+ reg_c_x[0] = (struct field_modify_info){4, 0, reg_to_field[reg]};
+ /* The routine expects parameters in memory as big-endian ones. */
+ return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set IPv4 DSCP action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_ipv4_dscp
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_dscp *conf =
+ (const struct rte_flow_action_set_dscp *)(action->conf);
+ struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 };
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv4 ipv4_mask;
+
+ memset(&ipv4, 0, sizeof(ipv4));
+ memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+ ipv4.hdr.type_of_service = conf->dscp;
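+	/*
+	 * RTE_IPV4_HDR_DSCP_MASK covers the six DSCP bits of the TOS
+	 * byte; shifting it right by 2 aligns the mask with the DSCP
+	 * value placed in bits 0-5 above.
+	 */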
+ ipv4_mask.hdr.type_of_service = RTE_IPV4_HDR_DSCP_MASK >> 2;
+ item.spec = &ipv4;
+ item.mask = &ipv4_mask;
+ return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Convert modify-header set IPv6 DSCP action to DV specification.
+ *
+ * @param[in,out] resource
+ * Pointer to the modify-header resource.
+ * @param[in] action
+ * Pointer to action specification.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_action_modify_ipv6_dscp
+ (struct mlx5_flow_dv_modify_hdr_resource *resource,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_dscp *conf =
+ (const struct rte_flow_action_set_dscp *)(action->conf);
+ struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 };
+ struct rte_flow_item_ipv6 ipv6;
+ struct rte_flow_item_ipv6 ipv6_mask;
+
+ memset(&ipv6, 0, sizeof(ipv6));
+ memset(&ipv6_mask, 0, sizeof(ipv6_mask));
+ /*
+	 * Even though the DSCP bits of IPv6 are not byte aligned,
+	 * rdma-core only accepts the DSCP value byte aligned in bits
+	 * 0 to 5, to be compatible with IPv4. There is no need to shift
+	 * the bits for IPv6 since rdma-core requires the byte-aligned value.
+ */
+ ipv6.hdr.vtc_flow = conf->dscp;
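+	/*
+	 * RTE_IPV6_HDR_DSCP_MASK selects the DSCP bits within vtc_flow;
+	 * shifting it right by 22 moves the mask down to bits 0-5 to
+	 * match the byte-aligned value expected by rdma-core.
+	 */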
+ ipv6_mask.hdr.vtc_flow = RTE_IPV6_HDR_DSCP_MASK >> 22;
+ item.spec = &ipv6;
+ item.mask = &ipv6_mask;
+ return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource,
+ MLX5_MODIFICATION_TYPE_SET, error);
+}
+
+/**
+ * Validate MARK item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_mark(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ const struct rte_flow_attr *attr __rte_unused,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ const struct rte_flow_item_mark *spec = item->spec;
+ const struct rte_flow_item_mark *mask = item->mask;
+ const struct rte_flow_item_mark nic_mask = {
+ .id = priv->sh->dv_mark_mask,
+ };
+ int ret;
+
+ if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "extended metadata feature"
+ " isn't enabled");
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "extended metadata register"
+ " isn't supported");
+ if (!nic_mask.id)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "extended metadata register"
+ " isn't available");
+ ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error);
+ if (ret < 0)
+ return ret;
+ if (!spec)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+ item->spec,
+ "data cannot be empty");
+ if (spec->id >= (MLX5_FLOW_MARK_MAX & nic_mask.id))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &spec->id,
+ "mark id exceeds the limit");
+ if (!mask)
+ mask = &nic_mask;
+ if (!mask->id)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL,
+ "mask cannot be zero");
+
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_mark),
+ error);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * Validate META item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_meta(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ const struct rte_flow_item_meta *spec = item->spec;
+ const struct rte_flow_item_meta *mask = item->mask;
+ struct rte_flow_item_meta nic_mask = {
+ .data = UINT32_MAX
+ };
+ int reg;
+ int ret;
+
+ if (!spec)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+ item->spec,
+ "data cannot be empty");
+ if (config->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "extended metadata register"
+ " isn't supported");
+ reg = flow_dv_get_metadata_reg(dev, attr, error);
+ if (reg < 0)
+ return reg;
+ if (reg == REG_B)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "match on reg_b "
+ "isn't supported");
+ if (reg != REG_A)
+ nic_mask.data = priv->sh->dv_meta_mask;
+ }
+ if (!mask)
+ mask = &rte_flow_item_meta_mask;
+ if (!mask->data)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL,
+ "mask cannot be zero");
+
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_meta),
+ error);
+ return ret;
+}
+
+/**
+ * Validate TAG item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_tag(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ const struct rte_flow_attr *attr __rte_unused,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_tag *spec = item->spec;
+ const struct rte_flow_item_tag *mask = item->mask;
+ const struct rte_flow_item_tag nic_mask = {
+ .data = RTE_BE32(UINT32_MAX),
+ .index = 0xff,
+ };
+ int ret;
+
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "extensive metadata register"
+ " isn't supported");
+ if (!spec)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+ item->spec,
+ "data cannot be empty");
+ if (!mask)
+ mask = &rte_flow_item_tag_mask;
+ if (!mask->data)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL,
+ "mask cannot be zero");
+
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_tag),
+ error);
+ if (ret < 0)
+ return ret;
+ if (mask->index != 0xff)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL,
+ "partial mask for tag index"
+ " is not supported");
+ ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, spec->index, error);
+ if (ret < 0)
+ return ret;
+ MLX5_ASSERT(ret != REG_NONE);
+ return 0;
+}
+
+/**
+ * Validate vport item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_port_id(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ const struct rte_flow_attr *attr,
+ uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_port_id *spec = item->spec;
+ const struct rte_flow_item_port_id *mask = item->mask;
+ const struct rte_flow_item_port_id switch_mask = {
+ .id = 0xffffffff,
+ };
+ struct mlx5_priv *esw_priv;
+ struct mlx5_priv *dev_priv;
+ int ret;
+
+ if (!attr->transfer)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL,
+ "match on port id is valid only"
+ " when transfer flag is enabled");
+ if (item_flags & MLX5_FLOW_ITEM_PORT_ID)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple source ports are not"
+ " supported");
+ if (!mask)
+ mask = &switch_mask;
+ if (mask->id != 0xffffffff)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+ mask,
+ "no support for partial mask on"
+ " \"id\" field");
+ ret = mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&rte_flow_item_port_id_mask,
+ sizeof(struct rte_flow_item_port_id),
+ error);
+ if (ret)
+ return ret;
+ if (!spec)
+ return 0;
+ esw_priv = mlx5_port_to_eswitch_info(spec->id, false);
+ if (!esw_priv)
+ return rte_flow_error_set(error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec,
+ "failed to obtain E-Switch info for"
+ " port");
+ dev_priv = mlx5_dev_to_eswitch_info(dev);
+ if (!dev_priv)
+ return rte_flow_error_set(error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "failed to obtain E-Switch info");
+ if (esw_priv->domain_id != dev_priv->domain_id)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec,
+ "cannot match on a port from a"
+ " different E-Switch");
+ return 0;
+}
+
+/*
+ * GTP flags are contained in 1 byte of the format:
+ * -------------------------------------------
+ * | bit | 0 - 2 | 3 | 4 | 5 | 6 | 7 |
+ * |-----------------------------------------|
+ * | value | Version | PT | Res | E | S | PN |
+ * -------------------------------------------
+ *
+ * Matching is supported only for GTP flags E, S, PN.
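+ * Bit 0 in the table above is the most significant bit of the byte,
+ * so the 0x07 mask below covers exactly the E, S and PN flags.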
+ */
+#define MLX5_GTP_FLAGS_MASK 0x07
+
+/**
+ * Validate VLAN item.
+ *
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ * Ethernet device flow is being created on.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_vlan(const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item_vlan *mask = item->mask;
+ const struct rte_flow_item_vlan nic_mask = {
+ .tci = RTE_BE16(UINT16_MAX),
+ .inner_type = RTE_BE16(UINT16_MAX),
+ };
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ int ret;
+ const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 |
+ MLX5_FLOW_LAYER_INNER_L4) :
+ (MLX5_FLOW_LAYER_OUTER_L3 |
+ MLX5_FLOW_LAYER_OUTER_L4);
+ const uint64_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
+ MLX5_FLOW_LAYER_OUTER_VLAN;
+
+ if (item_flags & vlanm)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple VLAN layers not supported");
+ else if ((item_flags & l34m) != 0)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "VLAN cannot follow L3/L4 layer");
+ if (!mask)
+ mask = &rte_flow_item_vlan_mask;
+ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_vlan),
+ error);
+ if (ret)
+ return ret;
+ if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->vmwa_context) {
+ /*
+			 * A non-NULL context means we run in a virtual machine
+			 * with SR-IOV enabled and have to create a VLAN
+			 * interface so the hypervisor sets up the E-Switch
+			 * vport context correctly. We avoid creating multiple
+			 * VLAN interfaces, so we cannot support a VLAN tag mask.
+ */
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "VLAN tag mask is not"
+ " supported in virtual"
+ " environment");
+ }
+ }
+ return 0;
+}
+
+/**
+ * Validate GTP item.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_gtp(struct rte_eth_dev *dev,
+ const struct rte_flow_item *item,
+ uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_item_gtp *spec = item->spec;
+ const struct rte_flow_item_gtp *mask = item->mask;
+ const struct rte_flow_item_gtp nic_mask = {
+ .v_pt_rsv_flags = MLX5_GTP_FLAGS_MASK,
+ .msg_type = 0xff,
+ .teid = RTE_BE32(0xffffffff),
+ };
+
+ if (!priv->config.hca_attr.tunnel_stateless_gtp)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "GTP support is not enabled");
+ if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "multiple tunnel layers not"
+ " supported");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "no outer UDP layer found");
+ if (!mask)
+ mask = &rte_flow_item_gtp_mask;
+ if (spec && spec->v_pt_rsv_flags & ~MLX5_GTP_FLAGS_MASK)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "Match is supported for GTP"
+ " flags only");
+ return mlx5_flow_item_acceptable
+ (item, (const uint8_t *)mask,
+ (const uint8_t *)&nic_mask,
+ sizeof(struct rte_flow_item_gtp),
+ error);
+}
+
+/**
+ * Validate the pop VLAN action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the pop vlan action.
+ * @param[in] item_flags
+ * The items found in this flow rule.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ uint64_t item_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ (void)action;
+ (void)attr;
+ if (!priv->sh->pop_vlan_action)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "pop vlan action is not supported");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+ NULL,
+ "pop vlan action not supported for "
+ "egress");
+ if (action_flags & MLX5_FLOW_VLAN_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "no support for multiple VLAN "
+ "actions");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot pop vlan without a "
+ "match on (outer) vlan in the flow");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after pop VLAN action");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "pop vlan action for VF representor "
+ "not supported on NIC table");
+ return 0;
+}
+
+/**
+ * Get VLAN default info from vlan match info.
+ *
+ * @param[in] items
+ *   The list of item specifications.
+ * @param[out] vlan
+ *   Pointer to the VLAN info to fill.
+ */
+static void
+flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items,
+ struct rte_vlan_hdr *vlan)
+{
+ const struct rte_flow_item_vlan nic_mask = {
+ .tci = RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK |
+ MLX5DV_FLOW_VLAN_VID_MASK),
+ .inner_type = RTE_BE16(0xffff),
+ };
+
+ if (items == NULL)
+ return;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int type = items->type;
+
+ if (type == RTE_FLOW_ITEM_TYPE_VLAN ||
+ type == MLX5_RTE_FLOW_ITEM_TYPE_VLAN)
+ break;
+ }
+ if (items->type != RTE_FLOW_ITEM_TYPE_END) {
+ const struct rte_flow_item_vlan *vlan_m = items->mask;
+ const struct rte_flow_item_vlan *vlan_v = items->spec;
+
+ /* If VLAN item in pattern doesn't contain data, return here. */
+ if (!vlan_v)
+ return;
+ if (!vlan_m)
+ vlan_m = &nic_mask;
+ /* Only full match values are accepted */
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE);
+ }
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_VID_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_VID_MASK_BE);
+ }
+ if (vlan_m->inner_type == nic_mask.inner_type)
+ vlan->eth_proto = rte_be_to_cpu_16(vlan_v->inner_type &
+ vlan_m->inner_type);
+ }
+}
+
+/**
+ * Validate the push VLAN action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] vlan_m
+ *   Pointer to the matched VLAN item mask, if any.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_push_vlan(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_item_vlan *vlan_m,
+ const struct rte_flow_action *action,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_of_push_vlan *push_vlan = action->conf;
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!attr->transfer && attr->ingress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+ NULL,
+ "push VLAN action not supported for "
+ "ingress");
+ if (push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_VLAN) &&
+ push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_QINQ))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "invalid vlan ethertype");
+ if (action_flags & MLX5_FLOW_VLAN_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "no support for multiple VLAN "
+ "actions");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after push VLAN");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "push vlan action for VF representor "
+ "not supported on NIC table");
+ if (vlan_m &&
+ (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) &&
+ (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) !=
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE &&
+ !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) &&
+ !(mlx5_flow_find_action
+ (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "not full match mask on VLAN PCP and "
+ "there is no of_set_vlan_pcp action, "
+ "push VLAN action cannot figure out "
+ "PCP value");
+ if (vlan_m &&
+ (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) &&
+ (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) !=
+ MLX5DV_FLOW_VLAN_VID_MASK_BE &&
+ !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) &&
+ !(mlx5_flow_find_action
+ (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "not full match mask on VLAN VID and "
+ "there is no of_set_vlan_vid action, "
+ "push VLAN action cannot figure out "
+ "VID value");
+ (void)attr;
+ return 0;
+}
+
+/**
+ * Validate the set VLAN PCP.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] actions
+ * Pointer to the list of actions remaining in the flow rule.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_set_vlan_pcp(uint64_t action_flags,
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action *action = actions;
+ const struct rte_flow_action_of_set_vlan_pcp *conf = action->conf;
+
+ if (conf->vlan_pcp > 7)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "VLAN PCP value is too big");
+ if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "set VLAN PCP action must follow "
+ "the push VLAN action");
+ if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "Multiple VLAN PCP modification are "
+ "not supported");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after set VLAN PCP");
+ return 0;
+}
+
+/**
+ * Validate the set VLAN VID.
+ *
+ * @param[in] item_flags
+ * Holds the items detected in this rule.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] actions
+ * Pointer to the list of actions remaining in the flow rule.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_set_vlan_vid(uint64_t item_flags,
+ uint64_t action_flags,
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action *action = actions;
+ const struct rte_flow_action_of_set_vlan_vid *conf = action->conf;
+
+ if (rte_be_to_cpu_16(conf->vlan_vid) > 0xFFE)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "VLAN VID value is too big");
+ if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
+ !(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "set VLAN VID action must follow push"
+ " VLAN action or match on VLAN item");
+ if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "Multiple VLAN VID modifications are "
+ "not supported");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after set VLAN VID");
+ return 0;
+}
+
+/**
+ * Validate the FLAG action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_flag(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ int ret;
+
+ /* Fall back if no extended metadata register support. */
+ if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY)
+ return mlx5_flow_validate_action_flag(action_flags, attr,
+ error);
+ /* Extensive metadata mode requires registers. */
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "no metadata registers "
+ "to support flag action");
+ if (!(priv->sh->dv_mark_mask & MLX5_FLOW_MARK_DEFAULT))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "extended metadata register"
+ " isn't available");
+ ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error);
+ if (ret < 0)
+ return ret;
+ MLX5_ASSERT(ret > 0);
+ if (action_flags & MLX5_FLOW_ACTION_MARK)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't mark and flag in same flow");
+ if (action_flags & MLX5_FLOW_ACTION_FLAG)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 flag"
+ " actions in same flow");
+ return 0;
+}
+
+/**
+ * Validate MARK action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to action.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_mark(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ const struct rte_flow_action_mark *mark = action->conf;
+ int ret;
+
+ /* Fall back if no extended metadata register support. */
+ if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY)
+ return mlx5_flow_validate_action_mark(action, action_flags,
+ attr, error);
+ /* Extensive metadata mode requires registers. */
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "no metadata registers "
+ "to support mark action");
+ if (!priv->sh->dv_mark_mask)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "extended metadata register"
+ " isn't available");
+ ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error);
+ if (ret < 0)
+ return ret;
+ MLX5_ASSERT(ret > 0);
+ if (!mark)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "configuration cannot be null");
+ if (mark->id >= (MLX5_FLOW_MARK_MAX & priv->sh->dv_mark_mask))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ &mark->id,
+ "mark id exceeds the limit");
+ if (action_flags & MLX5_FLOW_ACTION_FLAG)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't flag and mark in same flow");
+ if (action_flags & MLX5_FLOW_ACTION_MARK)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 mark actions in same"
+ " flow");
+ return 0;
+}
+
+/**
+ * Validate SET_META action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_set_meta(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ uint64_t action_flags __rte_unused,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_meta *conf;
+ uint32_t nic_mask = UINT32_MAX;
+ int reg;
+
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "extended metadata register"
+ " isn't supported");
+ reg = flow_dv_get_metadata_reg(dev, attr, error);
+ if (reg < 0)
+ return reg;
+ if (reg != REG_A && reg != REG_B) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ nic_mask = priv->sh->dv_meta_mask;
+ }
+ if (!(action->conf))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "configuration cannot be null");
+ conf = (const struct rte_flow_action_set_meta *)action->conf;
+ if (!conf->mask)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "zero mask doesn't have any effect");
+ if (conf->mask & ~nic_mask)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "meta data must be within reg C0");
+ return 0;
+}
+
+/**
+ * Validate SET_TAG action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_set_tag(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_set_tag *conf;
+ const uint64_t terminal_action_flags =
+ MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE |
+ MLX5_FLOW_ACTION_RSS;
+ int ret;
+
+ if (!mlx5_flow_ext_mreg_supported(dev))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "extensive metadata register"
+ " isn't supported");
+ if (!(action->conf))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "configuration cannot be null");
+ conf = (const struct rte_flow_action_set_tag *)action->conf;
+ if (!conf->mask)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "zero mask doesn't have any effect");
+ ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error);
+ if (ret < 0)
+ return ret;
+ if (!attr->transfer && attr->ingress &&
+ (action_flags & terminal_action_flags))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "set_tag has no effect"
+ " with terminal actions");
+ return 0;
+}
+
+/**
+ * Validate count action.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_count(struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!priv->config.devx)
+ goto notsup_err;
+#ifdef HAVE_IBV_FLOW_DEVX_COUNTERS
+ return 0;
+#endif
+notsup_err:
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "count action not supported");
+}
+
+/**
+ * Validate the L2 encap action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_l2_encap(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!(action->conf))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "configuration cannot be null");
+ if (action_flags & MLX5_FLOW_ACTION_ENCAP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can only have a single encap action "
+ "in a flow");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "encap action for VF representor "
+ "not supported on NIC table");
+ return 0;
+}
+
+/**
+ * Validate a decap action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_decap(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (action_flags & MLX5_FLOW_XCAP_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ action_flags &
+ MLX5_FLOW_ACTION_DECAP ? "can only "
+ "have a single decap action" : "decap "
+ "after encap is not supported");
+ if (action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have decap action after"
+ " modify action");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+ NULL,
+ "decap action not supported for "
+ "egress");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "decap action for VF representor "
+ "not supported on NIC table");
+ return 0;
+}
+
+const struct rte_flow_action_raw_decap empty_decap = {.data = NULL, .size = 0,};
+
+/**
+ * Validate the raw encap and decap actions.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] decap
+ * Pointer to the decap action.
+ * @param[in] encap
+ * Pointer to the encap action.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[in, out] action_flags
+ * Holds the actions detected until now.
+ * @param[out] actions_n
+ * pointer to the number of actions counter.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_raw_encap_decap
+ (struct rte_eth_dev *dev,
+ const struct rte_flow_action_raw_decap *decap,
+ const struct rte_flow_action_raw_encap *encap,
+ const struct rte_flow_attr *attr, uint64_t *action_flags,
+ int *actions_n, struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (encap && (!encap->size || !encap->data))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "raw encap data cannot be empty");
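+	/*
+	 * When both raw decap and raw encap are present, their sizes
+	 * relative to MLX5_ENCAPSULATION_DECISION_SIZE decide whether
+	 * this is a single L3 encap, a single L3 decap, or a pair of
+	 * plain L2 decap and encap actions.
+	 */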
+ if (decap && encap) {
+ if (decap->size <= MLX5_ENCAPSULATION_DECISION_SIZE &&
+ encap->size > MLX5_ENCAPSULATION_DECISION_SIZE)
+ /* L3 encap. */
+ decap = NULL;
+ else if (encap->size <=
+ MLX5_ENCAPSULATION_DECISION_SIZE &&
+ decap->size >
+ MLX5_ENCAPSULATION_DECISION_SIZE)
+ /* L3 decap. */
+ encap = NULL;
+ else if (encap->size >
+ MLX5_ENCAPSULATION_DECISION_SIZE &&
+ decap->size >
+ MLX5_ENCAPSULATION_DECISION_SIZE)
+ /* 2 L2 actions: encap and decap. */
+ ;
+ else
+ return rte_flow_error_set(error,
+ ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "unsupported too small "
+ "raw decap and too small raw "
+ "encap combination");
+ }
+ if (decap) {
+ ret = flow_dv_validate_action_decap(dev, *action_flags, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ *action_flags |= MLX5_FLOW_ACTION_DECAP;
+ ++(*actions_n);
+ }
+ if (encap) {
+ if (encap->size <= MLX5_ENCAPSULATION_DECISION_SIZE)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "small raw encap size");
+ if (*action_flags & MLX5_FLOW_ACTION_ENCAP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "more than one encap action");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "encap action for VF representor "
+ "not supported on NIC table");
+ *action_flags |= MLX5_FLOW_ACTION_ENCAP;
+ ++(*actions_n);
+ }
+ return 0;
+}
+
+/**
+ * Find existing encap/decap resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ * Pointer to encap/decap resource.
+ * @param[in, out] dev_flow
+ *   Pointer to the dev_flow.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_encap_decap_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_encap_decap_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_encap_decap_resource *cache_resource;
+ struct mlx5dv_dr_domain *domain;
+ uint32_t idx = 0;
+
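+	/*
+	 * Actions used on the root table (group 0) are created with a
+	 * non-zero flags value, other tables pass 0.
+	 */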
+ resource->flags = dev_flow->dv.group ? 0 : 1;
+ if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB)
+ domain = sh->fdb_domain;
+ else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX)
+ domain = sh->rx_domain;
+ else
+ domain = sh->tx_domain;
+ /* Lookup a matching resource from cache. */
+ ILIST_FOREACH(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], sh->encaps_decaps, idx,
+ cache_resource, next) {
+ if (resource->reformat_type == cache_resource->reformat_type &&
+ resource->ft_type == cache_resource->ft_type &&
+ resource->flags == cache_resource->flags &&
+ resource->size == cache_resource->size &&
+ !memcmp((const void *)resource->buf,
+ (const void *)cache_resource->buf,
+ resource->size)) {
+ DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->dvh.rix_encap_decap = idx;
+ dev_flow->dv.encap_decap = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new encap/decap resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_DECAP_ENCAP],
+ &dev_flow->handle->dvh.rix_encap_decap);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ cache_resource->verbs_action =
+ mlx5_glue->dv_create_flow_action_packet_reformat
+ (sh->ctx, cache_resource->reformat_type,
+ cache_resource->ft_type, domain, cache_resource->flags,
+ cache_resource->size,
+ (cache_resource->size ? cache_resource->buf : NULL));
+ if (!cache_resource->verbs_action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ ILIST_INSERT(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], &sh->encaps_decaps,
+ dev_flow->handle->dvh.rix_encap_decap, cache_resource,
+ next);
+ dev_flow->dv.encap_decap = cache_resource;
+ DRV_LOG(DEBUG, "new encap/decap resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Find existing table jump resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] tbl
+ * Pointer to flow table resource.
+ * @param[in, out] dev_flow
+ *   Pointer to the dev_flow.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_jump_tbl_resource_register
+ (struct rte_eth_dev *dev __rte_unused,
+ struct mlx5_flow_tbl_resource *tbl,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow_tbl_data_entry *tbl_data =
+ container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl);
+ int cnt;
+
+ MLX5_ASSERT(tbl);
+ cnt = rte_atomic32_read(&tbl_data->jump.refcnt);
+ if (!cnt) {
+ tbl_data->jump.action =
+ mlx5_glue->dr_create_flow_action_dest_flow_tbl
+ (tbl->obj);
+ if (!tbl_data->jump.action)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create jump action");
+ DRV_LOG(DEBUG, "new jump table resource %p: refcnt %d++",
+ (void *)&tbl_data->jump, cnt);
+ } else {
+		/*
+		 * The jump action already holds a table reference,
+		 * release the extra one taken by the table lookup.
+		 */
+ flow_dv_tbl_resource_release(dev, &tbl_data->tbl);
+ MLX5_ASSERT(tbl_data->jump.action);
+		DRV_LOG(DEBUG, "existing jump table resource %p: refcnt %d++",
+ (void *)&tbl_data->jump, cnt);
+ }
+ rte_atomic32_inc(&tbl_data->jump.refcnt);
+ dev_flow->handle->rix_jump = tbl_data->idx;
+ dev_flow->dv.jump = &tbl_data->jump;
+ return 0;
+}
+
+/**
+ * Find existing table port ID resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ * Pointer to port ID action resource.
+ * @param[in, out] dev_flow
+ *   Pointer to the dev_flow.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_port_id_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_port_id_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_port_id_action_resource *cache_resource;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache. */
+ ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PORT_ID], sh->port_id_action_list,
+ idx, cache_resource, next) {
+ if (resource->port_id == cache_resource->port_id) {
+			DRV_LOG(DEBUG, "port id action resource %p: "
+ "refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->rix_port_id_action = idx;
+ dev_flow->dv.port_id_action = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new port id action resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PORT_ID],
+ &dev_flow->handle->rix_port_id_action);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ /*
+ * Depending on rdma_core version the glue routine calls
+ * either mlx5dv_dr_action_create_dest_ib_port(domain, ibv_port)
+ * or mlx5dv_dr_action_create_dest_vport(domain, vport_id).
+ */
+ cache_resource->action =
+ mlx5_glue->dr_create_flow_action_dest_port
+ (priv->sh->fdb_domain, resource->port_id);
+ if (!cache_resource->action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ ILIST_INSERT(sh->ipool[MLX5_IPOOL_PORT_ID], &sh->port_id_action_list,
+ dev_flow->handle->rix_port_id_action, cache_resource,
+ next);
+ dev_flow->dv.port_id_action = cache_resource;
+ DRV_LOG(DEBUG, "new port id action resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Find existing push vlan resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ *   Pointer to push VLAN action resource.
+ * @param[in, out] dev_flow
+ *   Pointer to the dev_flow.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_push_vlan_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_push_vlan_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_push_vlan_action_resource *cache_resource;
+ struct mlx5dv_dr_domain *domain;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache. */
+ ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PUSH_VLAN],
+ sh->push_vlan_action_list, idx, cache_resource, next) {
+ if (resource->vlan_tag == cache_resource->vlan_tag &&
+ resource->ft_type == cache_resource->ft_type) {
+			DRV_LOG(DEBUG, "push-VLAN action resource %p: "
+ "refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->dvh.rix_push_vlan = idx;
+ dev_flow->dv.push_vlan_res = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new push_vlan action resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PUSH_VLAN],
+ &dev_flow->handle->dvh.rix_push_vlan);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB)
+ domain = sh->fdb_domain;
+ else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX)
+ domain = sh->rx_domain;
+ else
+ domain = sh->tx_domain;
+ cache_resource->action =
+ mlx5_glue->dr_create_flow_action_push_vlan(domain,
+ resource->vlan_tag);
+ if (!cache_resource->action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ ILIST_INSERT(sh->ipool[MLX5_IPOOL_PUSH_VLAN],
+ &sh->push_vlan_action_list,
+ dev_flow->handle->dvh.rix_push_vlan,
+ cache_resource, next);
+ dev_flow->dv.push_vlan_res = cache_resource;
+ DRV_LOG(DEBUG, "new push vlan action resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Get the size of a specific rte_flow_item_type.
+ *
+ * @param[in] item_type
+ *   Tested rte_flow_item_type.
+ *
+ * @return
+ *   The size of the matching spec structure, 0 if void or irrelevant.
+ */
+static size_t
+flow_dv_get_item_len(const enum rte_flow_item_type item_type)
+{
+ size_t retval;
+
+ switch (item_type) {
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ retval = sizeof(struct rte_flow_item_eth);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ retval = sizeof(struct rte_flow_item_vlan);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ retval = sizeof(struct rte_flow_item_ipv4);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ retval = sizeof(struct rte_flow_item_ipv6);
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ retval = sizeof(struct rte_flow_item_udp);
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ retval = sizeof(struct rte_flow_item_tcp);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ retval = sizeof(struct rte_flow_item_vxlan);
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ retval = sizeof(struct rte_flow_item_gre);
+ break;
+ case RTE_FLOW_ITEM_TYPE_NVGRE:
+ retval = sizeof(struct rte_flow_item_nvgre);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ retval = sizeof(struct rte_flow_item_vxlan_gpe);
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ retval = sizeof(struct rte_flow_item_mpls);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VOID: /* Fall through. */
+ default:
+ retval = 0;
+ break;
+ }
+ return retval;
+}
+
+#define MLX5_ENCAP_IPV4_VERSION 0x40
+#define MLX5_ENCAP_IPV4_IHL_MIN 0x05
+#define MLX5_ENCAP_IPV4_TTL_DEF 0x40
+#define MLX5_ENCAP_IPV6_VTC_FLOW 0x60000000
+#define MLX5_ENCAP_IPV6_HOP_LIMIT 0xff
+#define MLX5_ENCAP_VXLAN_FLAGS 0x08000000
+#define MLX5_ENCAP_VXLAN_GPE_FLAGS 0x04
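+/*
+ * Defaults used to complete header fields the application left at
+ * zero: IPv4 version and minimal IHL, a TTL of 64, the IPv6 version
+ * nibble and default hop limit, and the VXLAN/VXLAN-GPE flag bytes.
+ */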
+
+/**
+ * Convert the encap action data from list of rte_flow_item to raw buffer
+ *
+ * @param[in] items
+ * Pointer to rte_flow_item objects list.
+ * @param[out] buf
+ * Pointer to the output buffer.
+ * @param[out] size
+ * Pointer to the output buffer size.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_convert_encap_data(const struct rte_flow_item *items, uint8_t *buf,
+ size_t *size, struct rte_flow_error *error)
+{
+ struct rte_ether_hdr *eth = NULL;
+ struct rte_vlan_hdr *vlan = NULL;
+ struct rte_ipv4_hdr *ipv4 = NULL;
+ struct rte_ipv6_hdr *ipv6 = NULL;
+ struct rte_udp_hdr *udp = NULL;
+ struct rte_vxlan_hdr *vxlan = NULL;
+ struct rte_vxlan_gpe_hdr *vxlan_gpe = NULL;
+ struct rte_gre_hdr *gre = NULL;
+ size_t len;
+ size_t temp_size = 0;
+
+ if (!items)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "invalid empty data");
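+	/*
+	 * Copy each item spec into the raw buffer and fill the mandatory
+	 * fields left at zero (EtherType, IP version/TTL, next protocol,
+	 * UDP destination port, VXLAN flags) with default values.
+	 */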
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ len = flow_dv_get_item_len(items->type);
+ if (len + temp_size > MLX5_ENCAP_MAX_LEN)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "items total size is too big"
+ " for encap action");
+ rte_memcpy((void *)&buf[temp_size], items->spec, len);
+ switch (items->type) {
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ eth = (struct rte_ether_hdr *)&buf[temp_size];
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ vlan = (struct rte_vlan_hdr *)&buf[temp_size];
+ if (!eth)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "eth header not found");
+ if (!eth->ether_type)
+ eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ ipv4 = (struct rte_ipv4_hdr *)&buf[temp_size];
+ if (!vlan && !eth)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "neither eth nor vlan"
+ " header found");
+ if (vlan && !vlan->eth_proto)
+ vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ else if (eth && !eth->ether_type)
+ eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ if (!ipv4->version_ihl)
+ ipv4->version_ihl = MLX5_ENCAP_IPV4_VERSION |
+ MLX5_ENCAP_IPV4_IHL_MIN;
+ if (!ipv4->time_to_live)
+ ipv4->time_to_live = MLX5_ENCAP_IPV4_TTL_DEF;
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ ipv6 = (struct rte_ipv6_hdr *)&buf[temp_size];
+ if (!vlan && !eth)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "neither eth nor vlan"
+ " header found");
+ if (vlan && !vlan->eth_proto)
+ vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ else if (eth && !eth->ether_type)
+ eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ if (!ipv6->vtc_flow)
+ ipv6->vtc_flow =
+ RTE_BE32(MLX5_ENCAP_IPV6_VTC_FLOW);
+ if (!ipv6->hop_limits)
+ ipv6->hop_limits = MLX5_ENCAP_IPV6_HOP_LIMIT;
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ udp = (struct rte_udp_hdr *)&buf[temp_size];
+ if (!ipv4 && !ipv6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "ip header not found");
+ if (ipv4 && !ipv4->next_proto_id)
+ ipv4->next_proto_id = IPPROTO_UDP;
+ else if (ipv6 && !ipv6->proto)
+ ipv6->proto = IPPROTO_UDP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ vxlan = (struct rte_vxlan_hdr *)&buf[temp_size];
+ if (!udp)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "udp header not found");
+ if (!udp->dst_port)
+ udp->dst_port = RTE_BE16(MLX5_UDP_PORT_VXLAN);
+ if (!vxlan->vx_flags)
+ vxlan->vx_flags =
+ RTE_BE32(MLX5_ENCAP_VXLAN_FLAGS);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ vxlan_gpe = (struct rte_vxlan_gpe_hdr *)&buf[temp_size];
+ if (!udp)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "udp header not found");
+ if (!vxlan_gpe->proto)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "next protocol not found");
+ if (!udp->dst_port)
+ udp->dst_port =
+ RTE_BE16(MLX5_UDP_PORT_VXLAN_GPE);
+ if (!vxlan_gpe->vx_flags)
+ vxlan_gpe->vx_flags =
+ MLX5_ENCAP_VXLAN_GPE_FLAGS;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ case RTE_FLOW_ITEM_TYPE_NVGRE:
+ gre = (struct rte_gre_hdr *)&buf[temp_size];
+ if (!gre->proto)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "next protocol not found");
+ if (!ipv4 && !ipv6)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "ip header not found");
+ if (ipv4 && !ipv4->next_proto_id)
+ ipv4->next_proto_id = IPPROTO_GRE;
+ else if (ipv6 && !ipv6->proto)
+ ipv6->proto = IPPROTO_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VOID:
+ break;
+ default:
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ (void *)items->type,
+ "unsupported item type");
+ break;
+ }
+ temp_size += len;
+ }
+ *size = temp_size;
+ return 0;
+}
+
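+/**
+ * Zero the UDP checksum in the raw encapsulation buffer.
+ *
+ * IPv4 buffers are left as is (HW calculates the IPv4 checksum) and non
+ * IPv4/IPv6 buffers are rejected; for IPv6/UDP the checksum field is reset
+ * to 0.
+ *
+ * @param[in, out] data
+ *   Pointer to the raw encapsulation data, starting at the Ethernet header.
+ * @param[out] error
+ *   Pointer to the error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */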
+static int
+flow_dv_zero_encap_udp_csum(void *data, struct rte_flow_error *error)
+{
+ struct rte_ether_hdr *eth = NULL;
+ struct rte_vlan_hdr *vlan = NULL;
+ struct rte_ipv6_hdr *ipv6 = NULL;
+ struct rte_udp_hdr *udp = NULL;
+ char *next_hdr;
+ uint16_t proto;
+
+ eth = (struct rte_ether_hdr *)data;
+ next_hdr = (char *)(eth + 1);
+ proto = RTE_BE16(eth->ether_type);
+
+ /* VLAN skipping */
+ while (proto == RTE_ETHER_TYPE_VLAN || proto == RTE_ETHER_TYPE_QINQ) {
+ vlan = (struct rte_vlan_hdr *)next_hdr;
+ proto = RTE_BE16(vlan->eth_proto);
+ next_hdr += sizeof(struct rte_vlan_hdr);
+ }
+
+	/* HW calculates the IPv4 checksum, no need to proceed. */
+ if (proto == RTE_ETHER_TYPE_IPV4)
+ return 0;
+
+	/* Non-IPv4/IPv6 header, not supported. */
+ if (proto != RTE_ETHER_TYPE_IPV6) {
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "Cannot offload non IPv4/IPv6");
+ }
+
+ ipv6 = (struct rte_ipv6_hdr *)next_hdr;
+
+	/* Ignore non-UDP. */
+ if (ipv6->proto != IPPROTO_UDP)
+ return 0;
+
+ udp = (struct rte_udp_hdr *)(ipv6 + 1);
+ udp->dgram_cksum = 0;
+
+ return 0;
+}
+
+/**
+ * Convert L2 encap action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to action structure.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[in] transfer
+ * Mark if the flow is E-Switch flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_create_action_l2_encap(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ struct mlx5_flow *dev_flow,
+ uint8_t transfer,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_item *encap_data;
+ const struct rte_flow_action_raw_encap *raw_encap_data;
+ struct mlx5_flow_dv_encap_decap_resource res = {
+ .reformat_type =
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL,
+ .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB :
+ MLX5DV_FLOW_TABLE_TYPE_NIC_TX,
+ };
+
+ if (action->type == RTE_FLOW_ACTION_TYPE_RAW_ENCAP) {
+ raw_encap_data =
+ (const struct rte_flow_action_raw_encap *)action->conf;
+ res.size = raw_encap_data->size;
+ memcpy(res.buf, raw_encap_data->data, res.size);
+ } else {
+ if (action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP)
+ encap_data =
+ ((const struct rte_flow_action_vxlan_encap *)
+ action->conf)->definition;
+ else
+ encap_data =
+ ((const struct rte_flow_action_nvgre_encap *)
+ action->conf)->definition;
+ if (flow_dv_convert_encap_data(encap_data, res.buf,
+ &res.size, error))
+ return -rte_errno;
+ }
+ if (flow_dv_zero_encap_udp_csum(res.buf, error))
+ return -rte_errno;
+ if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "can't create L2 encap action");
+ return 0;
+}
+
+/**
+ * Convert L2 decap action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[in] transfer
+ * Mark if the flow is E-Switch flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_create_action_l2_decap(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ uint8_t transfer,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow_dv_encap_decap_resource res = {
+ .size = 0,
+ .reformat_type =
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2,
+ .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB :
+ MLX5DV_FLOW_TABLE_TYPE_NIC_RX,
+ };
+
+ if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "can't create L2 decap action");
+ return 0;
+}
+
+/**
+ * Convert raw decap/encap (L3 tunnel) action to DV specification.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to action structure.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_create_action_raw_encap(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_raw_encap *encap_data;
+ struct mlx5_flow_dv_encap_decap_resource res;
+
+ memset(&res, 0, sizeof(res));
+ encap_data = (const struct rte_flow_action_raw_encap *)action->conf;
+ res.size = encap_data->size;
+ memcpy(res.buf, encap_data->data, res.size);
+ res.reformat_type = res.size < MLX5_ENCAPSULATION_DECISION_SIZE ?
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 :
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+ if (attr->transfer)
+ res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB;
+ else
+ res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX :
+ MLX5DV_FLOW_TABLE_TYPE_NIC_RX;
+ if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "can't create encap action");
+ return 0;
+}
+
+/**
+ * Create action push VLAN.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] vlan
+ * Pointer to the vlan to push to the Ethernet header.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_create_action_push_vlan(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_vlan_hdr *vlan,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow_dv_push_vlan_action_resource res;
+
+ memset(&res, 0, sizeof(res));
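+	/*
+	 * Build the 32-bit tag word: eth_proto in the upper 16 bits,
+	 * the VLAN TCI in the lower 16 bits.
+	 */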
+ res.vlan_tag =
+ rte_cpu_to_be_32(((uint32_t)vlan->eth_proto) << 16 |
+ vlan->vlan_tci);
+ if (attr->transfer)
+ res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB;
+ else
+ res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX :
+ MLX5DV_FLOW_TABLE_TYPE_NIC_RX;
+ return flow_dv_push_vlan_action_resource_register
+ (dev, &res, dev_flow, error);
+}
+
+/**
+ * Validate the modify-header actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_hdr(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ struct rte_flow_error *error)
+{
+ if (action->type != RTE_FLOW_ACTION_TYPE_DEC_TTL && !action->conf)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "action configuration not set");
+ if (action_flags & MLX5_FLOW_ACTION_ENCAP)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have encap action before"
+ " modify action");
+ return 0;
+}
+
+/**
+ * Validate the modify-header MAC address actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_mac(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ if (!(item_flags & MLX5_FLOW_LAYER_L2))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no L2 item in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header IPv4 address actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_ipv4(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no ipv4 item in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header IPv6 address actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_ipv6(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no ipv6 item in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header TP actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_tp(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L4 :
+ MLX5_FLOW_LAYER_OUTER_L4;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no transport layer "
+ "in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header actions of increment/decrement
+ * TCP Sequence-number.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_tcp_seq(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no TCP item in"
+ " pattern");
+ if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ &&
+ (action_flags & MLX5_FLOW_ACTION_DEC_TCP_SEQ)) ||
+ (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ &&
+ (action_flags & MLX5_FLOW_ACTION_INC_TCP_SEQ)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "cannot decrease and increase"
+ " TCP sequence number"
+ " at the same time");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header actions of increment/decrement
+ * TCP Acknowledgment number.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_tcp_ack(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "no TCP item in"
+ " pattern");
+ if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_ACK &&
+ (action_flags & MLX5_FLOW_ACTION_DEC_TCP_ACK)) ||
+ (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK &&
+ (action_flags & MLX5_FLOW_ACTION_INC_TCP_ACK)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "cannot decrease and increase"
+ " TCP acknowledgment number"
+ " at the same time");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header TTL actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_ttl(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+ uint64_t layer;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ?
+ MLX5_FLOW_LAYER_INNER_L3 :
+ MLX5_FLOW_LAYER_OUTER_L3;
+ if (!(item_flags & layer))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no IP protocol in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate jump action.
+ *
+ * @param[in] action
+ * Pointer to the jump action.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] attributes
+ * Pointer to flow attributes
+ * @param[in] external
+ * Action belongs to flow rule created by request external to PMD.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_jump(const struct rte_flow_action *action,
+ uint64_t action_flags,
+ const struct rte_flow_attr *attributes,
+ bool external, struct rte_flow_error *error)
+{
+ uint32_t target_group, table;
+ int ret = 0;
+
+ if (action_flags & (MLX5_FLOW_FATE_ACTIONS |
+ MLX5_FLOW_FATE_ESWITCH_ACTIONS))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "can't have 2 fate actions in"
+ " same flow");
+ if (action_flags & MLX5_FLOW_ACTION_METER)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					  "jump with meter not supported");
+ if (!action->conf)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL, "action configuration not set");
+ target_group =
+ ((const struct rte_flow_action_jump *)action->conf)->group;
+ ret = mlx5_flow_group_to_table(attributes, external, target_group,
+ true, &table, error);
+ if (ret)
+ return ret;
+ if (attributes->group == target_group)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "target group must be other than"
+ " the current flow group");
+ return 0;
+}
+
+/**
+ * Validate the port_id action.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] action
+ * Port_id RTE action structure.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_port_id(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_port_id *port_id;
+ struct mlx5_priv *act_priv;
+ struct mlx5_priv *dev_priv;
+ uint16_t port;
+
+ if (!attr->transfer)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "port id action is valid in transfer"
+ " mode only");
+ if (!action || !action->conf)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ NULL,
+ "port id action parameters must be"
+ " specified");
+ if (action_flags & (MLX5_FLOW_FATE_ACTIONS |
+ MLX5_FLOW_FATE_ESWITCH_ACTIONS))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					  "can have only one fate action in"
+ " a flow");
+ dev_priv = mlx5_dev_to_eswitch_info(dev);
+ if (!dev_priv)
+ return rte_flow_error_set(error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "failed to obtain E-Switch info");
+ port_id = action->conf;
+ port = port_id->original ? dev->data->port_id : port_id->id;
+ act_priv = mlx5_port_to_eswitch_info(port, false);
+ if (!act_priv)
+ return rte_flow_error_set
+ (error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION_CONF, port_id,
+ "failed to obtain E-Switch port id for port");
+ if (act_priv->domain_id != dev_priv->domain_id)
+ return rte_flow_error_set
+ (error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "port does not belong to"
+ " E-Switch being configured");
+ return 0;
+}
+
+/**
+ * Get the maximum number of modify header actions.
+ *
+ * @param dev
+ * Pointer to rte_eth_dev structure.
+ * @param flags
+ * Flags bits to check if root level.
+ *
+ * @return
+ * Max number of modify header actions device can support.
+ */
+static inline unsigned int
+flow_dv_modify_hdr_action_max(struct rte_eth_dev *dev __rte_unused,
+ uint64_t flags)
+{
+ /*
+ * There's no way to directly query the max capacity from FW.
+ * The maximal value on root table should be assumed to be supported.
+ */
+ if (!(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL))
+ return MLX5_MAX_MODIFY_NUM;
+ else
+ return MLX5_ROOT_TBL_MODIFY_NUM;
+}
+
+/**
+ * Validate the meter action.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] action_flags
+ * Bit-fields that holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the meter action.
+ * @param[in] attr
+ * Attributes of flow that includes this action.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_validate_action_meter(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_action_meter *am = action->conf;
+ struct mlx5_flow_meter *fm;
+
+ if (!am)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "meter action conf is NULL");
+
+ if (action_flags & MLX5_FLOW_ACTION_METER)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					  "meter chaining not supported");
+ if (action_flags & MLX5_FLOW_ACTION_JUMP)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					  "meter with jump not supported");
+ if (!priv->mtr_en)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "meter action not supported");
+ fm = mlx5_flow_meter_find(priv, am->mtr_id);
+ if (!fm)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "Meter not found");
+ if (fm->ref_cnt && (!(fm->transfer == attr->transfer ||
+ (!fm->ingress && !attr->ingress && attr->egress) ||
+ (!fm->egress && !attr->egress && attr->ingress))))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "Flow attributes are either invalid "
+ "or have a conflict with current "
+ "meter attributes");
+ return 0;
+}
+
+/**
+ * Validate the age action.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the age action.
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_age(uint64_t action_flags,
+ const struct rte_flow_action *action,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_action_age *age = action->conf;
+
+ if (!priv->config.devx || priv->counter_fallback)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "age action not supported");
+ if (!(action->conf))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "configuration cannot be null");
+ if (age->timeout >= UINT16_MAX / 2 / 10)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "Max age time: 3275 seconds");
+ if (action_flags & MLX5_FLOW_ACTION_AGE)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					  "Duplicate age actions set");
+ return 0;
+}
+
+/**
+ * Validate the modify-header IPv4 DSCP actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_ipv4_dscp(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV4))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no ipv4 item in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Validate the modify-header IPv6 DSCP actions.
+ *
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] action
+ * Pointer to the modify action.
+ * @param[in] item_flags
+ * Holds the items detected.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_action_modify_ipv6_dscp(const uint64_t action_flags,
+ const struct rte_flow_action *action,
+ const uint64_t item_flags,
+ struct rte_flow_error *error)
+{
+ int ret = 0;
+
+ ret = flow_dv_validate_action_modify_hdr(action_flags, action, error);
+ if (!ret) {
+ if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV6))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "no ipv6 item in pattern");
+ }
+ return ret;
+}
+
+/**
+ * Find existing modify-header resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ * Pointer to modify-header resource.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, otherwise a negative errno value and rte_errno is set.
+ */
+static int
+flow_dv_modify_hdr_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_modify_hdr_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_modify_hdr_resource *cache_resource;
+ struct mlx5dv_dr_domain *ns;
+ uint32_t actions_len;
+
+ resource->flags = dev_flow->dv.group ? 0 :
+ MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL;
+ if (resource->actions_num > flow_dv_modify_hdr_action_max(dev,
+ resource->flags))
+ return rte_flow_error_set(error, EOVERFLOW,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "too many modify header items");
+ if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB)
+ ns = sh->fdb_domain;
+ else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_TX)
+ ns = sh->tx_domain;
+ else
+ ns = sh->rx_domain;
+ /* Lookup a matching resource from cache. */
+ actions_len = resource->actions_num * sizeof(resource->actions[0]);
+ LIST_FOREACH(cache_resource, &sh->modify_cmds, next) {
+ if (resource->ft_type == cache_resource->ft_type &&
+ resource->actions_num == cache_resource->actions_num &&
+ resource->flags == cache_resource->flags &&
+ !memcmp((const void *)resource->actions,
+ (const void *)cache_resource->actions,
+ actions_len)) {
+ DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->dvh.modify_hdr = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new modify-header resource. */
+ cache_resource = rte_calloc(__func__, 1,
+ sizeof(*cache_resource) + actions_len, 0);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ rte_memcpy(cache_resource->actions, resource->actions, actions_len);
+ cache_resource->verbs_action =
+ mlx5_glue->dv_create_flow_action_modify_header
+ (sh->ctx, cache_resource->ft_type, ns,
+ cache_resource->flags, actions_len,
+ (uint64_t *)cache_resource->actions);
+ if (!cache_resource->verbs_action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ LIST_INSERT_HEAD(&sh->modify_cmds, cache_resource, next);
+ dev_flow->handle->dvh.modify_hdr = cache_resource;
+ DRV_LOG(DEBUG, "new modify-header resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Get DV flow counter by index.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] idx
+ * mlx5 flow counter index in the container.
+ * @param[out] ppool
+ * mlx5 flow counter pool in the container,
+ *
+ * @return
+ * Pointer to the counter, NULL otherwise.
+ */
+static struct mlx5_flow_counter *
+flow_dv_counter_get_by_idx(struct rte_eth_dev *dev,
+ uint32_t idx,
+ struct mlx5_flow_counter_pool **ppool)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_pools_container *cont;
+ struct mlx5_flow_counter_pool *pool;
+ uint32_t batch = 0, age = 0;
+
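+	/*
+	 * Decode the 1-based counter index: strip the age and batch offsets
+	 * to select the container, the remainder addresses the pool and the
+	 * counter slot inside it.
+	 */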
+ idx--;
+ age = MLX_CNT_IS_AGE(idx);
+ idx = age ? idx - MLX5_CNT_AGE_OFFSET : idx;
+ if (idx >= MLX5_CNT_BATCH_OFFSET) {
+ idx -= MLX5_CNT_BATCH_OFFSET;
+ batch = 1;
+ }
+ cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+ MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n);
+ pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+ MLX5_ASSERT(pool);
+ if (ppool)
+ *ppool = pool;
+ return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL);
+}
+
+/**
+ * Get a pool by devx counter ID.
+ *
+ * @param[in] cont
+ * Pointer to the counter container.
+ * @param[in] id
+ * The counter devx ID.
+ *
+ * @return
+ *   The counter pool pointer if it exists, NULL otherwise.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id)
+{
+ uint32_t i;
+ uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
+
+ for (i = 0; i < n_valid; i++) {
+ struct mlx5_flow_counter_pool *pool = cont->pools[i];
+ int base = (pool->min_dcs->id / MLX5_COUNTERS_PER_POOL) *
+ MLX5_COUNTERS_PER_POOL;
+
+ if (id >= base && id < base + MLX5_COUNTERS_PER_POOL) {
+ /*
+			 * Move the pool to the head, as counter allocation
+			 * always takes the first pool in the container.
+ */
+ if (pool != TAILQ_FIRST(&cont->pool_list)) {
+ TAILQ_REMOVE(&cont->pool_list, pool, next);
+ TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+ }
+ return pool;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Allocate a new memory for the counter values wrapped by all the needed
+ * management.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] raws_n
+ * The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters.
+ *
+ * @return
+ * The new memory management pointer on success, otherwise NULL and rte_errno
+ * is set.
+ */
+static struct mlx5_counter_stats_mem_mng *
+flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_devx_mkey_attr mkey_attr;
+ struct mlx5_counter_stats_mem_mng *mem_mng;
+ volatile struct flow_counter_stats *raw_data;
+ int size = (sizeof(struct flow_counter_stats) *
+ MLX5_COUNTERS_PER_POOL +
+ sizeof(struct mlx5_counter_stats_raw)) * raws_n +
+ sizeof(struct mlx5_counter_stats_mem_mng);
+ uint8_t *mem = rte_calloc(__func__, 1, size, sysconf(_SC_PAGESIZE));
+ int i;
+
+ if (!mem) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
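+	/* The management struct is placed at the end of the allocation. */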
+ mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
+ size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
+ mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!mem_mng->umem) {
+ rte_errno = errno;
+ rte_free(mem);
+ return NULL;
+ }
+ mkey_attr.addr = (uintptr_t)mem;
+ mkey_attr.size = size;
+ mkey_attr.umem_id = mem_mng->umem->umem_id;
+ mkey_attr.pd = sh->pdn;
+ mkey_attr.log_entity_size = 0;
+ mkey_attr.pg_access = 0;
+ mkey_attr.klm_array = NULL;
+ mkey_attr.klm_num = 0;
+ if (priv->config.hca_attr.relaxed_ordering_write &&
+ priv->config.hca_attr.relaxed_ordering_read &&
+ !haswell_broadwell_cpu)
+ mkey_attr.relaxed_ordering = 1;
+ mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
+ if (!mem_mng->dm) {
+ mlx5_glue->devx_umem_dereg(mem_mng->umem);
+ rte_errno = errno;
+ rte_free(mem);
+ return NULL;
+ }
+ mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
+ raw_data = (volatile struct flow_counter_stats *)mem;
+ for (i = 0; i < raws_n; ++i) {
+ mem_mng->raws[i].mem_mng = mem_mng;
+ mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
+ }
+ LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
+ return mem_mng;
+}
+
+/**
+ * Resize a counter container.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] batch
+ * Whether the pool is for counter that was allocated by batch command.
+ * @param[in] age
+ * Whether the pool is for Aging counter.
+ *
+ * @return
+ * 0 on success, otherwise negative errno value and rte_errno is set.
+ */
+static int
+flow_dv_container_resize(struct rte_eth_dev *dev,
+ uint32_t batch, uint32_t age)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
+ age);
+ struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
+ void *old_pools = cont->pools;
+ uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
+ uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
+ void *pools = rte_calloc(__func__, 1, mem_size, 0);
+
+ if (!pools) {
+ rte_errno = ENOMEM;
+ return -ENOMEM;
+ }
+ if (old_pools)
+ memcpy(pools, old_pools, cont->n *
+ sizeof(struct mlx5_flow_counter_pool *));
+ /*
+	 * Fallback mode queries the counter directly, so no background query
+	 * resources are needed.
+ */
+ if (!priv->counter_fallback) {
+ int i;
+
+ mem_mng = flow_dv_create_counter_stat_mem_mng(dev,
+ MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES);
+ if (!mem_mng) {
+ rte_free(pools);
+ return -ENOMEM;
+ }
+ for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
+ LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws,
+ mem_mng->raws +
+ MLX5_CNT_CONTAINER_RESIZE +
+ i, next);
+ }
+ rte_spinlock_lock(&cont->resize_sl);
+ cont->n = resize;
+ cont->mem_mng = mem_mng;
+ cont->pools = pools;
+ rte_spinlock_unlock(&cont->resize_sl);
+ if (old_pools)
+ rte_free(old_pools);
+ return 0;
+}
+
+/**
+ * Query a devx flow counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index to the flow counter.
+ * @param[out] pkts
+ * The statistics value of packets.
+ * @param[out] bytes
+ * The statistics value of bytes.
+ *
+ * @return
+ * 0 on success, otherwise a negative errno value and rte_errno is set.
+ */
+static inline int
+_flow_dv_query_count(struct rte_eth_dev *dev, uint32_t counter, uint64_t *pkts,
+ uint64_t *bytes)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_counter_pool *pool = NULL;
+ struct mlx5_flow_counter *cnt;
+ struct mlx5_flow_counter_ext *cnt_ext = NULL;
+ int offset;
+
+ cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
+ MLX5_ASSERT(pool);
+ if (counter < MLX5_CNT_BATCH_OFFSET) {
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
+ if (priv->counter_fallback)
+ return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
+ 0, pkts, bytes, 0, NULL, NULL, 0);
+ }
+
+ rte_spinlock_lock(&pool->sl);
+ /*
+	 * Single counter allocation may produce an ID smaller than the ones
+	 * already allocated, in parallel with the host reading. In this case
+	 * the new counter values must be reported as 0.
+ */
+ if (unlikely(cnt_ext && cnt_ext->dcs->id < pool->raw->min_dcs_id)) {
+ *pkts = 0;
+ *bytes = 0;
+ } else {
+ offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
+ *pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+ *bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+ }
+ rte_spinlock_unlock(&pool->sl);
+ return 0;
+}
+
+/**
+ * Create and initialize a new counter pool.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[out] dcs
+ * The devX counter handle.
+ * @param[in] batch
+ * Whether the pool is for counter that was allocated by batch command.
+ * @param[in] age
+ * Whether the pool is for counter that was allocated for aging.
+ *
+ * @return
+ * The pool container pointer on success, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs,
+ uint32_t batch, uint32_t age)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_counter_pool *pool;
+ struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
+ age);
+ int16_t n_valid = rte_atomic16_read(&cont->n_valid);
+ uint32_t size = sizeof(*pool);
+
+ if (cont->n == n_valid && flow_dv_container_resize(dev, batch, age))
+ return NULL;
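+	/*
+	 * Reserve room for the counters plus the optional extended counters
+	 * (single allocation) and age parameters.
+	 */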
+ size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
+ size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
+ size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
+ pool = rte_calloc(__func__, 1, size, 0);
+ if (!pool) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ pool->min_dcs = dcs;
+ if (!priv->counter_fallback)
+ pool->raw = cont->mem_mng->raws + n_valid %
+ MLX5_CNT_CONTAINER_RESIZE;
+ pool->raw_hw = NULL;
+ pool->type = 0;
+ pool->type |= (batch ? 0 : CNT_POOL_TYPE_EXT);
+ pool->type |= (!age ? 0 : CNT_POOL_TYPE_AGE);
+ rte_spinlock_init(&pool->sl);
+ /*
+	 * The newly allocated counters in this pool have generation 0; setting
+	 * the pool generation to 2 makes all of them valid for allocation.
+	 * The start and end query generations ensure that counters released in
+	 * the gap between a query and its update are not reallocated before
+	 * the last query has finished and the statistics have been written
+	 * back to memory.
+ */
+ rte_atomic64_set(&pool->start_query_gen, 0x2);
+ /*
+ * There's no background query thread for fallback mode, set the
+ * end_query_gen to the maximum value since no need to wait for
+ * statistics update.
+ */
+ rte_atomic64_set(&pool->end_query_gen, priv->counter_fallback ?
+ INT64_MAX : 0x2);
+ TAILQ_INIT(&pool->counters);
+ TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+ pool->index = n_valid;
+ cont->pools[n_valid] = pool;
+ /* Pool initialization must be updated before host thread access. */
+ rte_cio_wmb();
+ rte_atomic16_add(&cont->n_valid, 1);
+ return pool;
+}
+
+/**
+ * Update the minimum dcs-id for aged or no-aged counter pool.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] pool
+ * Current counter pool.
+ * @param[in] batch
+ * Whether the pool is for counter that was allocated by batch command.
+ * @param[in] age
+ * Whether the counter is for aging.
+ */
+static void
+flow_dv_counter_update_min_dcs(struct rte_eth_dev *dev,
+ struct mlx5_flow_counter_pool *pool,
+ uint32_t batch, uint32_t age)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_counter_pool *other;
+ struct mlx5_pools_container *cont;
+
+ cont = MLX5_CNT_CONTAINER(priv->sh, batch, (age ^ 0x1));
+ other = flow_dv_find_pool_by_id(cont, pool->min_dcs->id);
+ if (!other)
+ return;
+ if (pool->min_dcs->id < other->min_dcs->id) {
+ rte_atomic64_set(&other->a64_dcs,
+ rte_atomic64_read(&pool->a64_dcs));
+ } else {
+ rte_atomic64_set(&pool->a64_dcs,
+ rte_atomic64_read(&other->a64_dcs));
+ }
+}
+
+/**
+ * Prepare a new counter and/or a new counter pool.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[out] cnt_free
+ * Where to put the pointer of a new counter.
+ * @param[in] batch
+ * Whether the pool is for counter that was allocated by batch command.
+ * @param[in] age
+ * Whether the pool is for counter that was allocated for aging.
+ *
+ * @return
+ * The counter pool pointer and @p cnt_free is set on success,
+ * NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_counter_pool *
+flow_dv_counter_pool_prepare(struct rte_eth_dev *dev,
+ struct mlx5_flow_counter **cnt_free,
+ uint32_t batch, uint32_t age)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_pools_container *cont;
+ struct mlx5_flow_counter_pool *pool;
+ struct mlx5_devx_obj *dcs = NULL;
+ struct mlx5_flow_counter *cnt;
+ uint32_t i;
+
+ cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+ if (!batch) {
+ /* bulk_bitmap must be 0 for single counter allocation. */
+ dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0);
+ if (!dcs)
+ return NULL;
+ pool = flow_dv_find_pool_by_id(cont, dcs->id);
+ if (!pool) {
+ pool = flow_dv_pool_create(dev, dcs, batch, age);
+ if (!pool) {
+ mlx5_devx_cmd_destroy(dcs);
+ return NULL;
+ }
+ } else if (dcs->id < pool->min_dcs->id) {
+ rte_atomic64_set(&pool->a64_dcs,
+ (int64_t)(uintptr_t)dcs);
+ }
+ flow_dv_counter_update_min_dcs(dev,
+ pool, batch, age);
+ i = dcs->id % MLX5_COUNTERS_PER_POOL;
+ cnt = MLX5_POOL_GET_CNT(pool, i);
+ TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+ MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs;
+ *cnt_free = cnt;
+ return pool;
+ }
+ /* bulk_bitmap is in 128 counters units. */
+ if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4)
+ dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4);
+ if (!dcs) {
+ rte_errno = ENODATA;
+ return NULL;
+ }
+ pool = flow_dv_pool_create(dev, dcs, batch, age);
+ if (!pool) {
+ mlx5_devx_cmd_destroy(dcs);
+ return NULL;
+ }
+ for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+ cnt = MLX5_POOL_GET_CNT(pool, i);
+ TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+ }
+ *cnt_free = MLX5_POOL_GET_CNT(pool, 0);
+ return pool;
+}
+
+/**
+ * Search for an existing shared counter.
+ *
+ * @param[in] cont
+ * Pointer to the relevant counter pool container.
+ * @param[in] id
+ * The shared counter ID to search.
+ * @param[out] ppool
+ *   mlx5 flow counter pool in the container.
+ *
+ * @return
+ *   NULL if it does not exist, otherwise a pointer to the shared extended
+ *   counter.
+ */
+static struct mlx5_flow_counter_ext *
+flow_dv_counter_shared_search(struct mlx5_pools_container *cont, uint32_t id,
+ struct mlx5_flow_counter_pool **ppool)
+{
+ struct mlx5_flow_counter_ext *cnt;
+ struct mlx5_flow_counter_pool *pool;
+ uint32_t i, j;
+ uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
+
+ for (i = 0; i < n_valid; i++) {
+ pool = cont->pools[i];
+ for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
+ cnt = MLX5_GET_POOL_CNT_EXT(pool, j);
+ if (cnt->ref_cnt && cnt->shared && cnt->id == id) {
+ if (ppool)
+ *ppool = cont->pools[i];
+ return cnt;
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Allocate a flow counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] shared
+ * Indicate if this counter is shared with other flows.
+ * @param[in] id
+ * Counter identifier.
+ * @param[in] group
+ * Counter flow group.
+ * @param[in] age
+ * Whether the counter was allocated for aging.
+ *
+ * @return
+ * Index to flow counter on success, 0 otherwise and rte_errno is set.
+ */
+static uint32_t
+flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id,
+ uint16_t group, uint32_t age)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_counter_pool *pool = NULL;
+ struct mlx5_flow_counter *cnt_free = NULL;
+ struct mlx5_flow_counter_ext *cnt_ext = NULL;
+ /*
+	 * Currently a group 0 flow counter cannot be assigned to a flow if it
+	 * is not the first one in the batch counter allocation, so it is
+	 * better to allocate counters one by one for these flows in a
+	 * separate container.
+	 * A counter can be shared between different groups, so shared
+	 * counters must be taken from the single-counter container.
+ */
+ uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0;
+ struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
+ age);
+ uint32_t cnt_idx;
+
+ if (!priv->config.devx) {
+ rte_errno = ENOTSUP;
+ return 0;
+ }
+ if (shared) {
+ cnt_ext = flow_dv_counter_shared_search(cont, id, &pool);
+ if (cnt_ext) {
+ if (cnt_ext->ref_cnt + 1 == 0) {
+ rte_errno = E2BIG;
+ return 0;
+ }
+ cnt_ext->ref_cnt++;
+ cnt_idx = pool->index * MLX5_COUNTERS_PER_POOL +
+ (cnt_ext->dcs->id % MLX5_COUNTERS_PER_POOL)
+ + 1;
+ return cnt_idx;
+ }
+ }
+	/* Pools that have free counters are at the start of the list. */
+ TAILQ_FOREACH(pool, &cont->pool_list, next) {
+ /*
+		 * The free counter reset values must be updated between the
+		 * counter release and the counter allocation, so at least one
+		 * query must be done in this period. Ensure it by saving the
+		 * query generation at release time.
+		 * The free list is sorted by generation, so if the first
+		 * counter is not updated, none of the others are either.
+ */
+ cnt_free = TAILQ_FIRST(&pool->counters);
+ if (cnt_free && cnt_free->query_gen <
+ rte_atomic64_read(&pool->end_query_gen))
+ break;
+ cnt_free = NULL;
+ }
+ if (!cnt_free) {
+ pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch, age);
+ if (!pool)
+ return 0;
+ }
+ if (!batch)
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free);
+ /* Create a DV counter action only in the first time usage. */
+ if (!cnt_free->action) {
+ uint16_t offset;
+ struct mlx5_devx_obj *dcs;
+
+ if (batch) {
+ offset = MLX5_CNT_ARRAY_IDX(pool, cnt_free);
+ dcs = pool->min_dcs;
+ } else {
+ offset = 0;
+ dcs = cnt_ext->dcs;
+ }
+ cnt_free->action = mlx5_glue->dv_create_flow_action_counter
+ (dcs->obj, offset);
+ if (!cnt_free->action) {
+ rte_errno = errno;
+ return 0;
+ }
+ }
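+	/* Encode the batch and age container types into the returned index. */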
+ cnt_idx = MLX5_MAKE_CNT_IDX(pool->index,
+ MLX5_CNT_ARRAY_IDX(pool, cnt_free));
+ cnt_idx += batch * MLX5_CNT_BATCH_OFFSET;
+ cnt_idx += age * MLX5_CNT_AGE_OFFSET;
+ /* Update the counter reset values. */
+ if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
+ &cnt_free->bytes))
+ return 0;
+ if (cnt_ext) {
+ cnt_ext->shared = shared;
+ cnt_ext->ref_cnt = 1;
+ cnt_ext->id = id;
+ }
+ if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on)
+ /* Start the asynchronous batch query by the host thread. */
+ mlx5_set_query_alarm(priv->sh);
+ TAILQ_REMOVE(&pool->counters, cnt_free, next);
+ if (TAILQ_EMPTY(&pool->counters)) {
+ /* Move the pool to the end of the container pool list. */
+ TAILQ_REMOVE(&cont->pool_list, pool, next);
+ TAILQ_INSERT_TAIL(&cont->pool_list, pool, next);
+ }
+ return cnt_idx;
+}
+
+/**
+ * Get age param from counter index.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index to the counter handler.
+ *
+ * @return
+ * The aging parameter specified for the counter index.
+ */
+static struct mlx5_age_param*
+flow_dv_counter_idx_get_age(struct rte_eth_dev *dev,
+ uint32_t counter)
+{
+ struct mlx5_flow_counter *cnt;
+ struct mlx5_flow_counter_pool *pool = NULL;
+
+ flow_dv_counter_get_by_idx(dev, counter, &pool);
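+	/* Only the pool is needed here, the in-pool slot is derived below. */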
+ counter = (counter - 1) % MLX5_COUNTERS_PER_POOL;
+ cnt = MLX5_POOL_GET_CNT(pool, counter);
+ return MLX5_CNT_TO_AGE(cnt);
+}
+
+/**
+ * Remove a flow counter from aged counter list.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index to the counter handler.
+ * @param[in] cnt
+ * Pointer to the counter handler.
+ */
+static void
+flow_dv_counter_remove_from_age(struct rte_eth_dev *dev,
+ uint32_t counter, struct mlx5_flow_counter *cnt)
+{
+ struct mlx5_age_info *age_info;
+ struct mlx5_age_param *age_param;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ age_info = GET_PORT_AGE_INFO(priv);
+ age_param = flow_dv_counter_idx_get_age(dev, counter);
+ if (rte_atomic16_cmpset((volatile uint16_t *)
+ &age_param->state,
+ AGE_CANDIDATE, AGE_FREE)
+ != AGE_CANDIDATE) {
+		/*
+		 * The lock is needed even on age timeout,
+		 * since the counter may still be in process.
+		 */
+ rte_spinlock_lock(&age_info->aged_sl);
+ TAILQ_REMOVE(&age_info->aged_counters, cnt, next);
+ rte_spinlock_unlock(&age_info->aged_sl);
+ }
+ rte_atomic16_set(&age_param->state, AGE_FREE);
+}
+
+/**
+ * Release a flow counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index to the counter handler.
+ */
+static void
+flow_dv_counter_release(struct rte_eth_dev *dev, uint32_t counter)
+{
+ struct mlx5_flow_counter_pool *pool = NULL;
+ struct mlx5_flow_counter *cnt;
+ struct mlx5_flow_counter_ext *cnt_ext = NULL;
+
+ if (!counter)
+ return;
+ cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
+ MLX5_ASSERT(pool);
+ if (counter < MLX5_CNT_BATCH_OFFSET) {
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
+ if (cnt_ext && --cnt_ext->ref_cnt)
+ return;
+ }
+ if (IS_AGE_POOL(pool))
+ flow_dv_counter_remove_from_age(dev, counter, cnt);
+ /* Put the counter in the end - the last updated one. */
+ TAILQ_INSERT_TAIL(&pool->counters, cnt, next);
+ /*
+	 * Counters released between the query trigger and the query handler
+	 * need to wait for the next round of queries, since packets arriving
+	 * in the gap period are not accounted to the old counter.
+ */
+ cnt->query_gen = rte_atomic64_read(&pool->start_query_gen);
+}
+
+/**
+ * Verify the @p attributes will be correctly understood by the NIC and store
+ * them in the @p flow if everything is correct.
+ *
+ * @param[in] dev
+ * Pointer to dev struct.
+ * @param[in] attributes
+ * Pointer to flow attributes
+ * @param[in] external
+ * This flow rule is created by request external to PMD.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ *   - 0 on success and non-root table.
+ * - 1 on success and root table.
+ * - a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_attributes(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attributes,
+ bool external __rte_unused,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t priority_max = priv->config.flow_prio - 1;
+ int ret = 0;
+
+#ifndef HAVE_MLX5DV_DR
+ if (attributes->group)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+ NULL,
+ "groups are not supported");
+#else
+ uint32_t table = 0;
+
+ ret = mlx5_flow_group_to_table(attributes, external,
+ attributes->group, !!priv->fdb_def_rule,
+ &table, error);
+ if (ret)
+ return ret;
+ if (!table)
+ ret = MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL;
+#endif
+ if (attributes->priority != MLX5_FLOW_PRIO_RSVD &&
+ attributes->priority >= priority_max)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+ NULL,
+ "priority out of range");
+ if (attributes->transfer) {
+ if (!priv->config.dv_esw_en)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "E-Switch dr is not supported");
+ if (!(priv->representor || priv->master))
+ return rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "E-Switch configuration can only be"
+ " done by a master or a representor device");
+ if (attributes->egress)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, attributes,
+ "egress is not supported");
+ }
+ if (!(attributes->egress ^ attributes->ingress))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR, NULL,
+ "must specify exactly one of "
+ "ingress or egress");
+ return ret;
+}
+
+/**
+ * Internal validation function. For validating both actions and items.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[in] external
+ * This flow rule is created by request external to PMD.
+ * @param[in] hairpin
+ * Number of hairpin TX actions, 0 means classic flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external, int hairpin, struct rte_flow_error *error)
+{
+ int ret;
+ uint64_t action_flags = 0;
+ uint64_t item_flags = 0;
+ uint64_t last_item = 0;
+ uint8_t next_protocol = 0xff;
+ uint16_t ether_type = 0;
+ int actions_n = 0;
+ uint8_t item_ipv6_proto = 0;
+ const struct rte_flow_item *gre_item = NULL;
+ const struct rte_flow_action_raw_decap *decap;
+ const struct rte_flow_action_raw_encap *encap;
+ const struct rte_flow_action_rss *rss;
+ const struct rte_flow_item_tcp nic_tcp_mask = {
+ .hdr = {
+ .tcp_flags = 0xFF,
+ .src_port = RTE_BE16(UINT16_MAX),
+ .dst_port = RTE_BE16(UINT16_MAX),
+ }
+ };
+ const struct rte_flow_item_ipv4 nic_ipv4_mask = {
+ .hdr = {
+ .src_addr = RTE_BE32(0xffffffff),
+ .dst_addr = RTE_BE32(0xffffffff),
+ .type_of_service = 0xff,
+ .next_proto_id = 0xff,
+ .time_to_live = 0xff,
+ },
+ };
+ const struct rte_flow_item_ipv6 nic_ipv6_mask = {
+ .hdr = {
+ .src_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .dst_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .vtc_flow = RTE_BE32(0xffffffff),
+ .proto = 0xff,
+ .hop_limits = 0xff,
+ },
+ };
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *dev_conf = &priv->config;
+ uint16_t queue_index = 0xFFFF;
+ const struct rte_flow_item_vlan *vlan_m = NULL;
+ int16_t rw_act_num = 0;
+ uint64_t is_root;
+
+ if (items == NULL)
+ return -1;
+ ret = flow_dv_validate_attributes(dev, attr, external, error);
+ if (ret < 0)
+ return ret;
+ is_root = (uint64_t)ret;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ int type = items->type;
+
+ switch (type) {
+ case RTE_FLOW_ITEM_TYPE_VOID:
+ break;
+ case RTE_FLOW_ITEM_TYPE_PORT_ID:
+ ret = flow_dv_validate_item_port_id
+ (dev, items, attr, item_flags, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_ITEM_PORT_ID;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ ret = mlx5_flow_validate_item_eth(items, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+ if (items->mask != NULL && items->spec != NULL) {
+ ether_type =
+ ((const struct rte_flow_item_eth *)
+ items->spec)->type;
+ ether_type &=
+ ((const struct rte_flow_item_eth *)
+ items->mask)->type;
+ ether_type = rte_be_to_cpu_16(ether_type);
+ } else {
+ ether_type = 0;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ ret = flow_dv_validate_item_vlan(items, item_flags,
+ dev, error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
+ MLX5_FLOW_LAYER_OUTER_VLAN;
+ if (items->mask != NULL && items->spec != NULL) {
+ ether_type =
+ ((const struct rte_flow_item_vlan *)
+ items->spec)->inner_type;
+ ether_type &=
+ ((const struct rte_flow_item_vlan *)
+ items->mask)->inner_type;
+ ether_type = rte_be_to_cpu_16(ether_type);
+ } else {
+ ether_type = 0;
+ }
+ /* Store outer VLAN mask for of_push_vlan action. */
+ if (!tunnel)
+ vlan_m = items->mask;
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ mlx5_flow_tunnel_ip_check(items, next_protocol,
+ &item_flags, &tunnel);
+ ret = mlx5_flow_validate_item_ipv4(items, item_flags,
+ last_item,
+ ether_type,
+ &nic_ipv4_mask,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv4 *)
+ items->mask)->hdr.next_proto_id) {
+ next_protocol =
+ ((const struct rte_flow_item_ipv4 *)
+ (items->spec))->hdr.next_proto_id;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv4 *)
+ (items->mask))->hdr.next_proto_id;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ mlx5_flow_tunnel_ip_check(items, next_protocol,
+ &item_flags, &tunnel);
+ ret = mlx5_flow_validate_item_ipv6(items, item_flags,
+ last_item,
+ ether_type,
+ &nic_ipv6_mask,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto) {
+ item_ipv6_proto =
+ ((const struct rte_flow_item_ipv6 *)
+ items->spec)->hdr.proto;
+ next_protocol =
+ ((const struct rte_flow_item_ipv6 *)
+ items->spec)->hdr.proto;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ ret = mlx5_flow_validate_item_tcp
+ (items, item_flags,
+ next_protocol,
+ &nic_tcp_mask,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ ret = mlx5_flow_validate_item_udp(items, item_flags,
+ next_protocol,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP :
+ MLX5_FLOW_LAYER_OUTER_L4_UDP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ ret = mlx5_flow_validate_item_gre(items, item_flags,
+ next_protocol, error);
+ if (ret < 0)
+ return ret;
+ gre_item = items;
+ last_item = MLX5_FLOW_LAYER_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_NVGRE:
+ ret = mlx5_flow_validate_item_nvgre(items, item_flags,
+ next_protocol,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_NVGRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE_KEY:
+ ret = mlx5_flow_validate_item_gre_key
+ (items, item_flags, gre_item, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_GRE_KEY;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ ret = mlx5_flow_validate_item_vxlan(items, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_VXLAN;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ ret = mlx5_flow_validate_item_vxlan_gpe(items,
+ item_flags, dev,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_VXLAN_GPE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GENEVE:
+ ret = mlx5_flow_validate_item_geneve(items,
+ item_flags, dev,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_GENEVE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ ret = mlx5_flow_validate_item_mpls(dev, items,
+ item_flags,
+ last_item, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_MPLS;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MARK:
+ ret = flow_dv_validate_item_mark(dev, items, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_ITEM_MARK;
+ break;
+ case RTE_FLOW_ITEM_TYPE_META:
+ ret = flow_dv_validate_item_meta(dev, items, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_ITEM_METADATA;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ICMP:
+ ret = mlx5_flow_validate_item_icmp(items, item_flags,
+ next_protocol,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_ICMP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ICMP6:
+ ret = mlx5_flow_validate_item_icmp6(items, item_flags,
+ next_protocol,
+ error);
+ if (ret < 0)
+ return ret;
+ item_ipv6_proto = IPPROTO_ICMPV6;
+ last_item = MLX5_FLOW_LAYER_ICMP6;
+ break;
+ case RTE_FLOW_ITEM_TYPE_TAG:
+ ret = flow_dv_validate_item_tag(dev, items,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_ITEM_TAG;
+ break;
+ case MLX5_RTE_FLOW_ITEM_TYPE_TAG:
+ case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE:
+ break;
+ case RTE_FLOW_ITEM_TYPE_GTP:
+ ret = flow_dv_validate_item_gtp(dev, items, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_GTP;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "item not supported");
+ }
+ item_flags |= last_item;
+ }
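+	/*
+	 * Validate the actions: accumulate action_flags, count the total
+	 * number of actions and the modify-header sub-actions (rw_act_num)
+	 * which share a single modify-header resource.
+	 */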
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		int type = actions->type;
+
+ if (actions_n == MLX5_DV_MAX_NUMBER_OF_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions, "too many actions");
+ switch (type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ ret = flow_dv_validate_action_port_id(dev,
+ action_flags,
+ actions,
+ attr,
+ error);
+ if (ret)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_PORT_ID;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ ret = flow_dv_validate_action_flag(dev, action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ /* Count all modify-header actions as one. */
+ if (!(action_flags &
+ MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_FLAG |
+ MLX5_FLOW_ACTION_MARK_EXT;
+ } else {
+ action_flags |= MLX5_FLOW_ACTION_FLAG;
+ ++actions_n;
+ }
+ rw_act_num += MLX5_ACT_NUM_SET_MARK;
+ break;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ ret = flow_dv_validate_action_mark(dev, actions,
+ action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ /* Count all modify-header actions as one. */
+ if (!(action_flags &
+ MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_MARK |
+ MLX5_FLOW_ACTION_MARK_EXT;
+ } else {
+ action_flags |= MLX5_FLOW_ACTION_MARK;
+ ++actions_n;
+ }
+ rw_act_num += MLX5_ACT_NUM_SET_MARK;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ ret = flow_dv_validate_action_set_meta(dev, actions,
+ action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_SET_META;
+ rw_act_num += MLX5_ACT_NUM_SET_META;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ ret = flow_dv_validate_action_set_tag(dev, actions,
+ action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_SET_TAG;
+ rw_act_num += MLX5_ACT_NUM_SET_TAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ ret = mlx5_flow_validate_action_drop(action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_DROP;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ ret = mlx5_flow_validate_action_queue(actions,
+ action_flags, dev,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ queue_index = ((const struct rte_flow_action_queue *)
+ (actions->conf))->index;
+ action_flags |= MLX5_FLOW_ACTION_QUEUE;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ rss = actions->conf;
+ ret = mlx5_flow_validate_action_rss(actions,
+ action_flags, dev,
+ attr, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ if (rss != NULL && rss->queue_num)
+ queue_index = rss->queue[0];
+ action_flags |= MLX5_FLOW_ACTION_RSS;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ ret = flow_dv_validate_action_count(dev, error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_COUNT;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
+ if (flow_dv_validate_action_pop_vlan(dev,
+ action_flags,
+ actions,
+ item_flags, attr,
+ error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+ ret = flow_dv_validate_action_push_vlan(dev,
+ action_flags,
+ vlan_m,
+ actions, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
+ ret = flow_dv_validate_action_set_vlan_pcp
+ (action_flags, actions, error);
+ if (ret < 0)
+ return ret;
+ /* Count PCP with push_vlan command. */
+ action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+ ret = flow_dv_validate_action_set_vlan_vid
+ (item_flags, action_flags,
+ actions, error);
+ if (ret < 0)
+ return ret;
+ /* Count VID with push_vlan command. */
+ action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
+ rw_act_num += MLX5_ACT_NUM_MDF_VID;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ ret = flow_dv_validate_action_l2_encap(dev,
+ action_flags,
+ actions, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_ENCAP;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP:
+ ret = flow_dv_validate_action_decap(dev, action_flags,
+ attr, error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_DECAP;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ ret = flow_dv_validate_action_raw_encap_decap
+ (dev, NULL, actions->conf, attr, &action_flags,
+ &actions_n, error);
+ if (ret < 0)
+ return ret;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
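+			/*
+			 * Look ahead, skipping VOID actions: a RAW_DECAP
+			 * followed by RAW_ENCAP is validated as one
+			 * decap/encap pair, otherwise rewind and validate
+			 * the decap alone.
+			 */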
+ decap = actions->conf;
+ while ((++actions)->type == RTE_FLOW_ACTION_TYPE_VOID)
+ ;
+ if (actions->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) {
+ encap = NULL;
+ actions--;
+ } else {
+ encap = actions->conf;
+ }
+ ret = flow_dv_validate_action_raw_encap_decap
+ (dev,
+ decap ? decap : &empty_decap, encap,
+ attr, &action_flags, &actions_n,
+ error);
+ if (ret < 0)
+ return ret;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ ret = flow_dv_validate_action_modify_mac(action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
+ MLX5_FLOW_ACTION_SET_MAC_SRC :
+ MLX5_FLOW_ACTION_SET_MAC_DST;
+ /*
+			 * Even if the source and destination MAC addresses
+			 * overlap in the header with 4B alignment, the convert
+			 * function handles them separately and 4 SW actions
+			 * will be created. Moreover, 2 actions are added each
+			 * time, no matter how many bytes of the address are
+			 * actually set.
+ */
+ rw_act_num += MLX5_ACT_NUM_MDF_MAC;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ ret = flow_dv_validate_action_modify_ipv4(action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
+ MLX5_FLOW_ACTION_SET_IPV4_SRC :
+ MLX5_FLOW_ACTION_SET_IPV4_DST;
+ rw_act_num += MLX5_ACT_NUM_MDF_IPV4;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ ret = flow_dv_validate_action_modify_ipv6(action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ if (item_ipv6_proto == IPPROTO_ICMPV6)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "Can't change header "
+ "with ICMPv6 proto");
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
+ MLX5_FLOW_ACTION_SET_IPV6_SRC :
+ MLX5_FLOW_ACTION_SET_IPV6_DST;
+ rw_act_num += MLX5_ACT_NUM_MDF_IPV6;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ ret = flow_dv_validate_action_modify_tp(action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
+ MLX5_FLOW_ACTION_SET_TP_SRC :
+ MLX5_FLOW_ACTION_SET_TP_DST;
+ rw_act_num += MLX5_ACT_NUM_MDF_PORT;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ ret = flow_dv_validate_action_modify_ttl(action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_TTL ?
+ MLX5_FLOW_ACTION_SET_TTL :
+ MLX5_FLOW_ACTION_DEC_TTL;
+ rw_act_num += MLX5_ACT_NUM_MDF_TTL;
+ break;
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ ret = flow_dv_validate_action_jump(actions,
+ action_flags,
+ attr, external,
+ error);
+ if (ret)
+ return ret;
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_JUMP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ ret = flow_dv_validate_action_modify_tcp_seq
+ (action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ?
+ MLX5_FLOW_ACTION_INC_TCP_SEQ :
+ MLX5_FLOW_ACTION_DEC_TCP_SEQ;
+ rw_act_num += MLX5_ACT_NUM_MDF_TCPSEQ;
+ break;
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ ret = flow_dv_validate_action_modify_tcp_ack
+ (action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ?
+ MLX5_FLOW_ACTION_INC_TCP_ACK :
+ MLX5_FLOW_ACTION_DEC_TCP_ACK;
+ rw_act_num += MLX5_ACT_NUM_MDF_TCPACK;
+ break;
+ case MLX5_RTE_FLOW_ACTION_TYPE_MARK:
+ break;
+ case MLX5_RTE_FLOW_ACTION_TYPE_TAG:
+ case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG:
+ rw_act_num += MLX5_ACT_NUM_SET_TAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ ret = mlx5_flow_validate_action_meter(dev,
+ action_flags,
+ actions, attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_METER;
+ ++actions_n;
+ /* Meter action will add one more TAG action. */
+ rw_act_num += MLX5_ACT_NUM_SET_TAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ ret = flow_dv_validate_action_age(action_flags,
+ actions, dev,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_AGE;
+ ++actions_n;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ ret = flow_dv_validate_action_modify_ipv4_dscp
+ (action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP;
+ rw_act_num += MLX5_ACT_NUM_SET_DSCP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ ret = flow_dv_validate_action_modify_ipv6_dscp
+ (action_flags,
+ actions,
+ item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ /* Count all modify-header actions as one action. */
+ if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS))
+ ++actions_n;
+ action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP;
+ rw_act_num += MLX5_ACT_NUM_SET_DSCP;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "action not supported");
+ }
+ }
+ /*
+ * Validate the drop action mutual exclusion with other actions.
+ * Drop action is mutually-exclusive with any other action, except for
+ * Count action.
+ */
+ if ((action_flags & MLX5_FLOW_ACTION_DROP) &&
+ (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "Drop action is mutually-exclusive "
+ "with any other action, except for "
+ "Count action");
+	/* E-Switch has a few restrictions on using items and actions. */
+ if (attr->transfer) {
+ if (!mlx5_flow_ext_mreg_supported(dev) &&
+ action_flags & MLX5_FLOW_ACTION_FLAG)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "unsupported action FLAG");
+ if (!mlx5_flow_ext_mreg_supported(dev) &&
+ action_flags & MLX5_FLOW_ACTION_MARK)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "unsupported action MARK");
+ if (action_flags & MLX5_FLOW_ACTION_QUEUE)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "unsupported action QUEUE");
+ if (action_flags & MLX5_FLOW_ACTION_RSS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "unsupported action RSS");
+ if (!(action_flags & MLX5_FLOW_FATE_ESWITCH_ACTIONS))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "no fate action is found");
+ } else {
+ if (!(action_flags & MLX5_FLOW_FATE_ACTIONS) && attr->ingress)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "no fate action is found");
+ }
+	/* Continue validation for Xcap actions. */
+ if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) && (queue_index == 0xFFFF ||
+ mlx5_rxq_get_type(dev, queue_index) != MLX5_RXQ_TYPE_HAIRPIN)) {
+ if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) ==
+ MLX5_FLOW_XCAP_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "encap and decap "
+ "combination aren't supported");
+ if (!attr->transfer && attr->ingress && (action_flags &
+ MLX5_FLOW_ACTION_ENCAP))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "encap is not supported"
+ " for ingress traffic");
+ }
+ /* Hairpin flow will add one more TAG action. */
+ if (hairpin > 0)
+ rw_act_num += MLX5_ACT_NUM_SET_TAG;
+	/* Extra metadata enabled: one more TAG action will be added. */
+ if (dev_conf->dv_flow_en &&
+ dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ mlx5_flow_ext_mreg_supported(dev))
+ rw_act_num += MLX5_ACT_NUM_SET_TAG;
+ if ((uint32_t)rw_act_num >
+ flow_dv_modify_hdr_action_max(dev, is_root)) {
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL, "too many header modify"
+ " actions to support");
+ }
+ return 0;
+}
+
+/**
+ * Internal preparation function. Allocates the DV flow structure,
+ * whose size is constant.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * Pointer to mlx5_flow object on success,
+ * otherwise NULL and rte_errno is set.
+ */
+static struct mlx5_flow *
+flow_dv_prepare(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr __rte_unused,
+ const struct rte_flow_item items[] __rte_unused,
+ const struct rte_flow_action actions[] __rte_unused,
+ struct rte_flow_error *error)
+{
+ uint32_t handle_idx = 0;
+ struct mlx5_flow *dev_flow;
+ struct mlx5_flow_handle *dev_handle;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+	/* Sanity check to avoid overflowing the temporary device flow array. */
+ if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) {
+ rte_flow_error_set(error, ENOSPC,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "not free temporary device flow");
+ return NULL;
+ }
+ dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ &handle_idx);
+ if (!dev_handle) {
+ rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "not enough memory to create flow handle");
+ return NULL;
+ }
+	/* No multi-thread support for now. */
+ dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++];
+ dev_flow->handle = dev_handle;
+ dev_flow->handle_idx = handle_idx;
+ dev_flow->dv.value.size = MLX5_ST_SZ_BYTES(fte_match_param);
+ /*
+ * The matching value needs to be cleared to 0 before using. In the
+	 * past it was cleared automatically by the rte_*alloc API. The time
+	 * consumption is almost the same as before.
+ */
+ memset(dev_flow->dv.value.buf, 0, MLX5_ST_SZ_BYTES(fte_match_param));
+ dev_flow->ingress = attr->ingress;
+ dev_flow->dv.transfer = attr->transfer;
+ return dev_flow;
+}
+
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+/**
+ * Sanity check for match mask and value. Similar to check_valid_spec() in
+ * kernel driver. If an unmasked bit is set in the value, it returns failure.
+ *
+ * @param match_mask
+ * pointer to match mask buffer.
+ * @param match_value
+ * pointer to match value buffer.
+ *
+ * @return
+ * 0 if valid, -EINVAL otherwise.
+ */
+static int
+flow_dv_check_valid_spec(void *match_mask, void *match_value)
+{
+ uint8_t *m = match_mask;
+ uint8_t *v = match_value;
+ unsigned int i;
+
+ for (i = 0; i < MLX5_ST_SZ_BYTES(fte_match_param); ++i) {
+ if (v[i] & ~m[i]) {
+ DRV_LOG(ERR,
+ "match_value differs from match_criteria"
+ " %p[%u] != %p[%u]",
+ match_value, i, match_mask, i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+#endif
+
+/**
+ * Add match of ip_version.
+ *
+ * @param[in] group
+ * Flow group.
+ * @param[in] headers_v
+ * Values header pointer.
+ * @param[in] headers_m
+ * Masks header pointer.
+ * @param[in] ip_version
+ * The IP version to set.
+ */
+static inline void
+flow_dv_set_match_ip_version(uint32_t group,
+ void *headers_v,
+ void *headers_m,
+ uint8_t ip_version)
+{
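+	/*
+	 * The root table (group 0) is matched with a full ip_version nibble
+	 * mask; the ethertype match is cleared since ip_version supersedes
+	 * it.
+	 */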
+ if (group == 0)
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0xf);
+ else
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version,
+ ip_version);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, ip_version);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, 0);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, 0);
+}
+
+/**
+ * Add Ethernet item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_eth(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner,
+ uint32_t group)
+{
+ const struct rte_flow_item_eth *eth_m = item->mask;
+ const struct rte_flow_item_eth *eth_v = item->spec;
+ const struct rte_flow_item_eth nic_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .type = RTE_BE16(0xffff),
+ };
+ void *headers_m;
+ void *headers_v;
+ char *l24_v;
+ unsigned int i;
+
+ if (!eth_v)
+ return;
+ if (!eth_m)
+ eth_m = &nic_mask;
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, dmac_47_16),
+ &eth_m->dst, sizeof(eth_m->dst));
+ /* The value must be in the range of the mask. */
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, dmac_47_16);
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->dst.addr_bytes[i] & eth_v->dst.addr_bytes[i];
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, smac_47_16),
+ &eth_m->src, sizeof(eth_m->src));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, smac_47_16);
+ /* The value must be in the range of the mask. */
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->src.addr_bytes[i] & eth_v->src.addr_bytes[i];
+ if (eth_v->type) {
+ /* When ethertype is present set mask for tagged VLAN. */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ /* Set value for tagged VLAN if ethertype is 802.1Q. */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
+ eth_v->type == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag,
+ 1);
+ /* Return here to avoid setting match on ethertype. */
+ return;
+ }
+ }
+ /*
+ * HW supports match on one Ethertype, the Ethertype following the last
+ * VLAN tag of the packet (see PRM).
+ * Set match on ethertype only if ETH header is not followed by VLAN.
+ * HW is optimized for IPv4/IPv6. In such cases, avoid setting
+ * ethertype, and use ip_version field instead.
+ */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV4) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 4);
+ } else if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV6) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 6);
+ } else {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype,
+ rte_be_to_cpu_16(eth_m->type));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ ethertype);
+ *(uint16_t *)(l24_v) = eth_m->type & eth_v->type;
+ }
+}
+
+/**
+ * Add VLAN item to matcher and to the value.
+ *
+ * @param[in, out] dev_flow
+ * Flow descriptor.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+ void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner, uint32_t group)
+{
+ const struct rte_flow_item_vlan *vlan_m = item->mask;
+ const struct rte_flow_item_vlan *vlan_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ uint16_t tci_m;
+ uint16_t tci_v;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ /*
+		 * This is a workaround: masks are not supported here
+		 * and have been pre-validated.
+ */
+ if (vlan_v)
+ dev_flow->handle->vf_vlan.tag =
+ rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
+ }
+ /*
+ * When VLAN item exists in flow, mark packet as tagged,
+ * even if TCI is not specified.
+ */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
+ if (!vlan_v)
+ return;
+ if (!vlan_m)
+ vlan_m = &rte_flow_item_vlan_mask;
+ tci_m = rte_be_to_cpu_16(vlan_m->tci);
+ tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
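+	/* TCI layout: PCP in bits 15-13, DEI/CFI in bit 12, VID in bits 11-0. */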
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_vid, tci_m);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, tci_v);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_cfi, tci_m >> 12);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_cfi, tci_v >> 12);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_prio, tci_m >> 13);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, tci_v >> 13);
+ /*
+ * HW is optimized for IPv4/IPv6. In such cases, avoid setting
+ * ethertype, and use ip_version field instead.
+ */
+ if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV4) &&
+ vlan_m->inner_type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 4);
+ } else if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV6) &&
+ vlan_m->inner_type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 6);
+ } else {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype,
+ rte_be_to_cpu_16(vlan_m->inner_type));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
+ rte_be_to_cpu_16(vlan_m->inner_type &
+ vlan_v->inner_type));
+ }
+}
+
+/**
+ * Add IPV4 item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] inner
+ * Item is inner pattern.
+ * @param[in] group
+ * The group to insert the rule.
+ */
+static void
+flow_dv_translate_item_ipv4(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ const uint64_t item_flags,
+ int inner, uint32_t group)
+{
+ const struct rte_flow_item_ipv4 *ipv4_m = item->mask;
+ const struct rte_flow_item_ipv4 *ipv4_v = item->spec;
+ const struct rte_flow_item_ipv4 nic_mask = {
+ .hdr = {
+ .src_addr = RTE_BE32(0xffffffff),
+ .dst_addr = RTE_BE32(0xffffffff),
+ .type_of_service = 0xff,
+ .next_proto_id = 0xff,
+ .time_to_live = 0xff,
+ },
+ };
+ void *headers_m;
+ void *headers_v;
+ char *l24_m;
+ char *l24_v;
+ uint8_t tos;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 4);
+ /*
+	 * On outer header (which must contain L2), or inner header with L2,
+ * set cvlan_tag mask bit to mark this packet as untagged.
+ * This should be done even if item->spec is empty.
+ */
+ if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2)
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ if (!ipv4_v)
+ return;
+ if (!ipv4_m)
+ ipv4_m = &nic_mask;
+ l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m,
+ dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+ *(uint32_t *)l24_m = ipv4_m->hdr.dst_addr;
+ *(uint32_t *)l24_v = ipv4_m->hdr.dst_addr & ipv4_v->hdr.dst_addr;
+ l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m,
+ src_ipv4_src_ipv6.ipv4_layout.ipv4);
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ src_ipv4_src_ipv6.ipv4_layout.ipv4);
+ *(uint32_t *)l24_m = ipv4_m->hdr.src_addr;
+ *(uint32_t *)l24_v = ipv4_m->hdr.src_addr & ipv4_v->hdr.src_addr;
+ tos = ipv4_m->hdr.type_of_service & ipv4_v->hdr.type_of_service;
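+	/* The ToS byte is split: ECN in the two low bits, DSCP in the upper six. */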
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn,
+ ipv4_m->hdr.type_of_service);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, tos);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp,
+ ipv4_m->hdr.type_of_service >> 2);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, tos >> 2);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol,
+ ipv4_m->hdr.next_proto_id);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+ ipv4_v->hdr.next_proto_id & ipv4_m->hdr.next_proto_id);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit,
+ ipv4_m->hdr.time_to_live);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit,
+ ipv4_v->hdr.time_to_live & ipv4_m->hdr.time_to_live);
+}
+
+/**
+ * Add IPV6 item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] item_flags
+ * Bit-fields that holds the items detected until now.
+ * @param[in] inner
+ * Item is inner pattern.
+ * @param[in] group
+ * The group to insert the rule.
+ */
+static void
+flow_dv_translate_item_ipv6(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ const uint64_t item_flags,
+ int inner, uint32_t group)
+{
+ const struct rte_flow_item_ipv6 *ipv6_m = item->mask;
+ const struct rte_flow_item_ipv6 *ipv6_v = item->spec;
+ const struct rte_flow_item_ipv6 nic_mask = {
+ .hdr = {
+ .src_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .dst_addr =
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+ .vtc_flow = RTE_BE32(0xffffffff),
+ .proto = 0xff,
+ .hop_limits = 0xff,
+ },
+ };
+ void *headers_m;
+ void *headers_v;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ char *l24_m;
+ char *l24_v;
+ uint32_t vtc_m;
+ uint32_t vtc_v;
+ int i;
+ int size;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 6);
+ /*
+	 * On outer header (which must contain L2), or inner header with L2,
+ * set cvlan_tag mask bit to mark this packet as untagged.
+ * This should be done even if item->spec is empty.
+ */
+ if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2)
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ if (!ipv6_v)
+ return;
+ if (!ipv6_m)
+ ipv6_m = &nic_mask;
+ size = sizeof(ipv6_m->hdr.dst_addr);
+ l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
+ memcpy(l24_m, ipv6_m->hdr.dst_addr, size);
+ for (i = 0; i < size; ++i)
+ l24_v[i] = l24_m[i] & ipv6_v->hdr.dst_addr[i];
+ l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6);
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6);
+ memcpy(l24_m, ipv6_m->hdr.src_addr, size);
+ for (i = 0; i < size; ++i)
+ l24_v[i] = l24_m[i] & ipv6_v->hdr.src_addr[i];
+ /* TOS. */
+ vtc_m = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow);
+ vtc_v = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow & ipv6_v->hdr.vtc_flow);
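+	/* vtc_flow layout: version(4) | traffic class(8) | flow label(20). */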
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, vtc_m >> 20);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, vtc_v >> 20);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, vtc_m >> 22);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, vtc_v >> 22);
+ /* Label. */
+ if (inner) {
+ MLX5_SET(fte_match_set_misc, misc_m, inner_ipv6_flow_label,
+ vtc_m);
+ MLX5_SET(fte_match_set_misc, misc_v, inner_ipv6_flow_label,
+ vtc_v);
+ } else {
+ MLX5_SET(fte_match_set_misc, misc_m, outer_ipv6_flow_label,
+ vtc_m);
+ MLX5_SET(fte_match_set_misc, misc_v, outer_ipv6_flow_label,
+ vtc_v);
+ }
+ /* Protocol. */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol,
+ ipv6_m->hdr.proto);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+ ipv6_v->hdr.proto & ipv6_m->hdr.proto);
+ /* Hop limit. */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit,
+ ipv6_m->hdr.hop_limits);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit,
+ ipv6_v->hdr.hop_limits & ipv6_m->hdr.hop_limits);
+}
+
+/**
+ * Add TCP item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_tcp(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_tcp *tcp_m = item->mask;
+ const struct rte_flow_item_tcp *tcp_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_TCP);
+ if (!tcp_v)
+ return;
+ if (!tcp_m)
+ tcp_m = &rte_flow_item_tcp_mask;
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_sport,
+ rte_be_to_cpu_16(tcp_m->hdr.src_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
+ rte_be_to_cpu_16(tcp_v->hdr.src_port & tcp_m->hdr.src_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_dport,
+ rte_be_to_cpu_16(tcp_m->hdr.dst_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
+ rte_be_to_cpu_16(tcp_v->hdr.dst_port & tcp_m->hdr.dst_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_flags,
+ tcp_m->hdr.tcp_flags);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags,
+ (tcp_v->hdr.tcp_flags & tcp_m->hdr.tcp_flags));
+}
+
+/**
+ * Add UDP item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_udp(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_udp *udp_m = item->mask;
+ const struct rte_flow_item_udp *udp_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP);
+ if (!udp_v)
+ return;
+ if (!udp_m)
+ udp_m = &rte_flow_item_udp_mask;
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_sport,
+ rte_be_to_cpu_16(udp_m->hdr.src_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
+ rte_be_to_cpu_16(udp_v->hdr.src_port & udp_m->hdr.src_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport,
+ rte_be_to_cpu_16(udp_m->hdr.dst_port));
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
+ rte_be_to_cpu_16(udp_v->hdr.dst_port & udp_m->hdr.dst_port));
+}
+
+/**
+ * Add GRE optional Key item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_item_gre_key(void *matcher, void *key,
+ const struct rte_flow_item *item)
+{
+ const rte_be32_t *key_m = item->mask;
+ const rte_be32_t *key_v = item->spec;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX);
+
+ /* GRE K bit must be on and should already be validated */
+ MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, 1);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, 1);
+ if (!key_v)
+ return;
+ if (!key_m)
+ key_m = &gre_key_default_mask;
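+	/*
+	 * The 32-bit GRE key is split into gre_key_h (upper 24 bits)
+	 * and gre_key_l (low 8 bits).
+	 */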
+ MLX5_SET(fte_match_set_misc, misc_m, gre_key_h,
+ rte_be_to_cpu_32(*key_m) >> 8);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_key_h,
+ rte_be_to_cpu_32((*key_v) & (*key_m)) >> 8);
+ MLX5_SET(fte_match_set_misc, misc_m, gre_key_l,
+ rte_be_to_cpu_32(*key_m) & 0xFF);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_key_l,
+ rte_be_to_cpu_32((*key_v) & (*key_m)) & 0xFF);
+}
+
+/**
+ * Add GRE item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_gre(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_gre *gre_m = item->mask;
+ const struct rte_flow_item_gre *gre_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
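+	/*
+	 * Bit-field view of the GRE c_rsvd0_ver word to access the C, K and
+	 * S present bits and the version field individually.
+	 */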
+ struct {
+ union {
+ __extension__
+ struct {
+ uint16_t version:3;
+ uint16_t rsvd0:9;
+ uint16_t s_present:1;
+ uint16_t k_present:1;
+ uint16_t rsvd_bit1:1;
+ uint16_t c_present:1;
+ };
+ uint16_t value;
+ };
+ } gre_crks_rsvd0_ver_m, gre_crks_rsvd0_ver_v;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE);
+ if (!gre_v)
+ return;
+ if (!gre_m)
+ gre_m = &rte_flow_item_gre_mask;
+ MLX5_SET(fte_match_set_misc, misc_m, gre_protocol,
+ rte_be_to_cpu_16(gre_m->protocol));
+ MLX5_SET(fte_match_set_misc, misc_v, gre_protocol,
+ rte_be_to_cpu_16(gre_v->protocol & gre_m->protocol));
+ gre_crks_rsvd0_ver_m.value = rte_be_to_cpu_16(gre_m->c_rsvd0_ver);
+ gre_crks_rsvd0_ver_v.value = rte_be_to_cpu_16(gre_v->c_rsvd0_ver);
+ MLX5_SET(fte_match_set_misc, misc_m, gre_c_present,
+ gre_crks_rsvd0_ver_m.c_present);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_c_present,
+ gre_crks_rsvd0_ver_v.c_present &
+ gre_crks_rsvd0_ver_m.c_present);
+ MLX5_SET(fte_match_set_misc, misc_m, gre_k_present,
+ gre_crks_rsvd0_ver_m.k_present);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_k_present,
+ gre_crks_rsvd0_ver_v.k_present &
+ gre_crks_rsvd0_ver_m.k_present);
+ MLX5_SET(fte_match_set_misc, misc_m, gre_s_present,
+ gre_crks_rsvd0_ver_m.s_present);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_s_present,
+ gre_crks_rsvd0_ver_v.s_present &
+ gre_crks_rsvd0_ver_m.s_present);
+}
+
+/**
+ * Add NVGRE item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_nvgre(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_nvgre *nvgre_m = item->mask;
+ const struct rte_flow_item_nvgre *nvgre_v = item->spec;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ const char *tni_flow_id_m = (const char *)nvgre_m->tni;
+ const char *tni_flow_id_v = (const char *)nvgre_v->tni;
+ char *gre_key_m;
+ char *gre_key_v;
+ int size;
+ int i;
+
+ /* For NVGRE, GRE header fields must be set with defined values. */
+ const struct rte_flow_item_gre gre_spec = {
+ .c_rsvd0_ver = RTE_BE16(0x2000),
+ .protocol = RTE_BE16(RTE_ETHER_TYPE_TEB)
+ };
+ const struct rte_flow_item_gre gre_mask = {
+ .c_rsvd0_ver = RTE_BE16(0xB000),
+ .protocol = RTE_BE16(UINT16_MAX),
+ };
+ const struct rte_flow_item gre_item = {
+ .spec = &gre_spec,
+ .mask = &gre_mask,
+ .last = NULL,
+ };
+ flow_dv_translate_item_gre(matcher, key, &gre_item, inner);
+ if (!nvgre_v)
+ return;
+ if (!nvgre_m)
+ nvgre_m = &rte_flow_item_nvgre_mask;
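+	/* The 24-bit TNI plus the 8-bit flow_id are matched as the GRE key. */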
+ size = sizeof(nvgre_m->tni) + sizeof(nvgre_m->flow_id);
+ gre_key_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, gre_key_h);
+ gre_key_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, gre_key_h);
+ memcpy(gre_key_m, tni_flow_id_m, size);
+ for (i = 0; i < size; ++i)
+ gre_key_v[i] = gre_key_m[i] & tni_flow_id_v[i];
+}
+
+/**
+ * Add VXLAN item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_vxlan(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_vxlan *vxlan_m = item->mask;
+ const struct rte_flow_item_vxlan *vxlan_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ char *vni_m;
+ char *vni_v;
+ uint16_t dport;
+ int size;
+ int i;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
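+	/* Default the UDP destination port only if the pattern left it unset. */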
+ dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ?
+ MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE;
+ if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport);
+ }
+ if (!vxlan_v)
+ return;
+ if (!vxlan_m)
+ vxlan_m = &rte_flow_item_vxlan_mask;
+ size = sizeof(vxlan_m->vni);
+ vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, vxlan_vni);
+ vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, vxlan_vni);
+ memcpy(vni_m, vxlan_m->vni, size);
+ for (i = 0; i < size; ++i)
+ vni_v[i] = vni_m[i] & vxlan_v->vni[i];
+}
+
+/**
+ * Add VXLAN-GPE item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_vxlan_gpe(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner)
+{
+ const struct rte_flow_item_vxlan_gpe *vxlan_m = item->mask;
+ const struct rte_flow_item_vxlan_gpe *vxlan_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc_m =
+ MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_3);
+ void *misc_v =
+ MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3);
+ char *vni_m;
+ char *vni_v;
+ uint16_t dport;
+ int size;
+ int i;
+ uint8_t flags_m = 0xff;
+ uint8_t flags_v = 0xc;
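+	/*
+	 * The default flags value 0x0c corresponds to the VXLAN-GPE
+	 * I (VNI valid) and P (next protocol present) bits.
+	 */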
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ?
+ MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE;
+ if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport);
+ }
+ if (!vxlan_v)
+ return;
+ if (!vxlan_m)
+ vxlan_m = &rte_flow_item_vxlan_gpe_mask;
+ size = sizeof(vxlan_m->vni);
+ vni_m = MLX5_ADDR_OF(fte_match_set_misc3, misc_m, outer_vxlan_gpe_vni);
+ vni_v = MLX5_ADDR_OF(fte_match_set_misc3, misc_v, outer_vxlan_gpe_vni);
+ memcpy(vni_m, vxlan_m->vni, size);
+ for (i = 0; i < size; ++i)
+ vni_v[i] = vni_m[i] & vxlan_v->vni[i];
+ if (vxlan_m->flags) {
+ flags_m = vxlan_m->flags;
+ flags_v = vxlan_v->flags;
+ }
+ MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_flags, flags_m);
+ MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_flags, flags_v);
+ MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_next_protocol,
+ vxlan_m->protocol);
+ MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_next_protocol,
+ vxlan_v->protocol);
+}
+
+/**
+ * Add Geneve item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_geneve(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner)
+{
+ const struct rte_flow_item_geneve *geneve_m = item->mask;
+ const struct rte_flow_item_geneve *geneve_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ uint16_t dport;
+ uint16_t gbhdr_m;
+ uint16_t gbhdr_v;
+ char *vni_m;
+ char *vni_v;
+ size_t size, i;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ dport = MLX5_UDP_PORT_GENEVE;
+ if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport);
+ }
+ if (!geneve_v)
+ return;
+ if (!geneve_m)
+ geneve_m = &rte_flow_item_geneve_mask;
+ size = sizeof(geneve_m->vni);
+ vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, geneve_vni);
+ vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, geneve_vni);
+ memcpy(vni_m, geneve_m->vni, size);
+ for (i = 0; i < size; ++i)
+ vni_v[i] = vni_m[i] & geneve_v->vni[i];
+ MLX5_SET(fte_match_set_misc, misc_m, geneve_protocol_type,
+ rte_be_to_cpu_16(geneve_m->protocol));
+ MLX5_SET(fte_match_set_misc, misc_v, geneve_protocol_type,
+ rte_be_to_cpu_16(geneve_v->protocol & geneve_m->protocol));
+ gbhdr_m = rte_be_to_cpu_16(geneve_m->ver_opt_len_o_c_rsvd0);
+ gbhdr_v = rte_be_to_cpu_16(geneve_v->ver_opt_len_o_c_rsvd0);
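+	/* Extract the OAM flag and option length from ver_opt_len_o_c_rsvd0. */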
+ MLX5_SET(fte_match_set_misc, misc_m, geneve_oam,
+ MLX5_GENEVE_OAMF_VAL(gbhdr_m));
+ MLX5_SET(fte_match_set_misc, misc_v, geneve_oam,
+ MLX5_GENEVE_OAMF_VAL(gbhdr_v) & MLX5_GENEVE_OAMF_VAL(gbhdr_m));
+ MLX5_SET(fte_match_set_misc, misc_m, geneve_opt_len,
+ MLX5_GENEVE_OPTLEN_VAL(gbhdr_m));
+ MLX5_SET(fte_match_set_misc, misc_v, geneve_opt_len,
+ MLX5_GENEVE_OPTLEN_VAL(gbhdr_v) &
+ MLX5_GENEVE_OPTLEN_VAL(gbhdr_m));
+}
+
+/**
+ * Add MPLS item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] prev_layer
+ * The protocol layer indicated in previous item.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_mpls(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ uint64_t prev_layer,
+ int inner)
+{
+ const uint32_t *in_mpls_m = item->mask;
+ const uint32_t *in_mpls_v = item->spec;
+ uint32_t *out_mpls_m = 0;
+ uint32_t *out_mpls_v = 0;
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ void *misc2_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ misc_parameters_2);
+ void *misc2_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
+ void *headers_m = MLX5_ADDR_OF(fte_match_param, matcher, outer_headers);
+ void *headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+
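+	/*
+	 * Pin the encapsulating protocol match (UDP destination port, GRE
+	 * protocol or IP protocol) according to the previous layer.
+	 */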
+ switch (prev_layer) {
+ case MLX5_FLOW_LAYER_OUTER_L4_UDP:
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
+ MLX5_UDP_PORT_MPLS);
+ break;
+ case MLX5_FLOW_LAYER_GRE:
+ MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, 0xffff);
+ MLX5_SET(fte_match_set_misc, misc_v, gre_protocol,
+ RTE_ETHER_TYPE_MPLS);
+ break;
+ default:
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+ IPPROTO_MPLS);
+ break;
+ }
+ if (!in_mpls_v)
+ return;
+ if (!in_mpls_m)
+ in_mpls_m = (const uint32_t *)&rte_flow_item_mpls_mask;
+ switch (prev_layer) {
+ case MLX5_FLOW_LAYER_OUTER_L4_UDP:
+ out_mpls_m =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m,
+ outer_first_mpls_over_udp);
+ out_mpls_v =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_udp);
+ break;
+ case MLX5_FLOW_LAYER_GRE:
+ out_mpls_m =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m,
+ outer_first_mpls_over_gre);
+ out_mpls_v =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_gre);
+ break;
+ default:
+ /* Inner MPLS not over GRE is not supported. */
+ if (!inner) {
+ out_mpls_m =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2,
+ misc2_m,
+ outer_first_mpls);
+ out_mpls_v =
+ (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2,
+ misc2_v,
+ outer_first_mpls);
+ }
+ break;
+ }
+ if (out_mpls_m && out_mpls_v) {
+ *out_mpls_m = *in_mpls_m;
+ *out_mpls_v = *in_mpls_v & *in_mpls_m;
+ }
+}
+
+/**
+ * Add metadata register item to matcher
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] reg_type
+ *   Type of device metadata register.
+ * @param[in] data
+ *   Register value.
+ * @param[in] mask
+ *   Register mask.
+ */
+static void
+flow_dv_match_meta_reg(void *matcher, void *key,
+ enum modify_reg reg_type,
+ uint32_t data, uint32_t mask)
+{
+ void *misc2_m =
+ MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2);
+ void *misc2_v =
+ MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
+ uint32_t temp;
+
+ data &= mask;
+ switch (reg_type) {
+ case REG_A:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a, data);
+ break;
+ case REG_B:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_b, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_b, data);
+ break;
+ case REG_C_0:
+ /*
+ * The metadata register C0 field might be divided into
+		 * source vport index and META item value, so this field
+		 * must be set according to the specified mask, not as a
+		 * whole one.
+ */
+ temp = MLX5_GET(fte_match_set_misc2, misc2_m, metadata_reg_c_0);
+ temp |= mask;
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_0, temp);
+ temp = MLX5_GET(fte_match_set_misc2, misc2_v, metadata_reg_c_0);
+ temp &= ~mask;
+ temp |= data;
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_0, temp);
+ break;
+ case REG_C_1:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_1, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_1, data);
+ break;
+ case REG_C_2:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_2, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_2, data);
+ break;
+ case REG_C_3:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_3, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_3, data);
+ break;
+ case REG_C_4:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_4, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_4, data);
+ break;
+ case REG_C_5:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_5, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_5, data);
+ break;
+ case REG_C_6:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_6, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_6, data);
+ break;
+ case REG_C_7:
+ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_7, mask);
+ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_7, data);
+ break;
+ default:
+ MLX5_ASSERT(false);
+ break;
+ }
+}
+
+/**
+ * Add MARK item to matcher
+ *
+ * @param[in] dev
+ * The device to configure through.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_item_mark(struct rte_eth_dev *dev,
+ void *matcher, void *key,
+ const struct rte_flow_item *item)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const struct rte_flow_item_mark *mark;
+ uint32_t value;
+ uint32_t mask;
+
+ mark = item->mask ? (const void *)item->mask :
+ &rte_flow_item_mark_mask;
+ mask = mark->id & priv->sh->dv_mark_mask;
+ mark = (const void *)item->spec;
+ MLX5_ASSERT(mark);
+ value = mark->id & priv->sh->dv_mark_mask & mask;
+ if (mask) {
+ enum modify_reg reg;
+
+ /* Get the metadata register index for the mark. */
+ reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, NULL);
+ MLX5_ASSERT(reg > 0);
+ if (reg == REG_C_0) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t msk_c0 = priv->sh->dv_regc0_mask;
+ uint32_t shl_c0 = rte_bsf32(msk_c0);
+
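+			/*
+			 * Only the part of register C0 selected by
+			 * dv_regc0_mask carries the MARK value, shift the
+			 * value and mask into that sub-field.
+			 */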
+ mask &= msk_c0;
+ mask <<= shl_c0;
+ value <<= shl_c0;
+ }
+ flow_dv_match_meta_reg(matcher, key, reg, value, mask);
+ }
+}
+
+/**
+ * Add META item to matcher
+ *
+ * @param[in] dev
+ *   The device to configure through.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] attr
+ * Attributes of flow that includes this item.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_item_meta(struct rte_eth_dev *dev,
+ void *matcher, void *key,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item *item)
+{
+ const struct rte_flow_item_meta *meta_m;
+ const struct rte_flow_item_meta *meta_v;
+
+ meta_m = (const void *)item->mask;
+ if (!meta_m)
+ meta_m = &rte_flow_item_meta_mask;
+ meta_v = (const void *)item->spec;
+ if (meta_v) {
+ int reg;
+ uint32_t value = meta_v->data;
+ uint32_t mask = meta_m->data;
+
+ reg = flow_dv_get_metadata_reg(dev, attr, NULL);
+ if (reg < 0)
+ return;
+ /*
+		 * In datapath code there are no endianness
+		 * conversions for performance reasons; all
+		 * pattern conversions are done in rte_flow.
+ */
+ value = rte_cpu_to_be_32(value);
+ mask = rte_cpu_to_be_32(mask);
+ if (reg == REG_C_0) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t msk_c0 = priv->sh->dv_regc0_mask;
+ uint32_t shl_c0 = rte_bsf32(msk_c0);
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+ uint32_t shr_c0 = __builtin_clz(priv->sh->dv_meta_mask);
+
+ value >>= shr_c0;
+ mask >>= shr_c0;
+#endif
+ value <<= shl_c0;
+ mask <<= shl_c0;
+ MLX5_ASSERT(msk_c0);
+ MLX5_ASSERT(!(~msk_c0 & mask));
+ }
+ flow_dv_match_meta_reg(matcher, key, reg, value, mask);
+ }
+}
+
+/**
+ * Add vport metadata Reg C0 item to matcher
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] value
+ *   Register value to match.
+ * @param[in] mask
+ *   Register mask.
+ */
+static void
+flow_dv_translate_item_meta_vport(void *matcher, void *key,
+ uint32_t value, uint32_t mask)
+{
+ flow_dv_match_meta_reg(matcher, key, REG_C_0, value, mask);
+}
+
+/**
+ * Add tag item to matcher
+ *
+ * @param[in] dev
+ * The device to configure through.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_mlx5_item_tag(struct rte_eth_dev *dev,
+ void *matcher, void *key,
+ const struct rte_flow_item *item)
+{
+ const struct mlx5_rte_flow_item_tag *tag_v = item->spec;
+ const struct mlx5_rte_flow_item_tag *tag_m = item->mask;
+ uint32_t mask, value;
+
+ MLX5_ASSERT(tag_v);
+ value = tag_v->data;
+ mask = tag_m ? tag_m->data : UINT32_MAX;
+ if (tag_v->id == REG_C_0) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t msk_c0 = priv->sh->dv_regc0_mask;
+ uint32_t shl_c0 = rte_bsf32(msk_c0);
+
+ mask &= msk_c0;
+ mask <<= shl_c0;
+ value <<= shl_c0;
+ }
+ flow_dv_match_meta_reg(matcher, key, tag_v->id, value, mask);
+}
+
+/**
+ * Add TAG item to matcher
+ *
+ * @param[in] dev
+ * The device to configure through.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_item_tag(struct rte_eth_dev *dev,
+ void *matcher, void *key,
+ const struct rte_flow_item *item)
+{
+ const struct rte_flow_item_tag *tag_v = item->spec;
+ const struct rte_flow_item_tag *tag_m = item->mask;
+ enum modify_reg reg;
+
+ MLX5_ASSERT(tag_v);
+ tag_m = tag_m ? tag_m : &rte_flow_item_tag_mask;
+ /* Get the metadata register index for the tag. */
+ reg = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, tag_v->index, NULL);
+ MLX5_ASSERT(reg > 0);
+ flow_dv_match_meta_reg(matcher, key, reg, tag_v->data, tag_m->data);
+}
+
+/**
+ * Add source vport match to the specified matcher.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] port
+ * Source vport value to match.
+ * @param[in] mask
+ * Mask for the source vport value.
+ */
+static void
+flow_dv_translate_item_source_vport(void *matcher, void *key,
+ int16_t port, uint16_t mask)
+{
+ void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+
+ MLX5_SET(fte_match_set_misc, misc_m, source_port, mask);
+ MLX5_SET(fte_match_set_misc, misc_v, source_port, port);
+}
+
+/**
+ * Translate port-id item to eswitch match on port-id.
+ *
+ * @param[in] dev
+ * The device to configure through.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise.
+ */
+static int
+flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher,
+ void *key, const struct rte_flow_item *item)
+{
+ const struct rte_flow_item_port_id *pid_m = item ? item->mask : NULL;
+ const struct rte_flow_item_port_id *pid_v = item ? item->spec : NULL;
+ struct mlx5_priv *priv;
+ uint16_t mask, id;
+
+ mask = pid_m ? pid_m->id : 0xffff;
+ id = pid_v ? pid_v->id : dev->data->port_id;
+ priv = mlx5_port_to_eswitch_info(id, item == NULL);
+ if (!priv)
+ return -rte_errno;
+ /* Translate to vport field or to metadata, depending on mode. */
+ if (priv->vport_meta_mask)
+ flow_dv_translate_item_meta_vport(matcher, key,
+ priv->vport_meta_tag,
+ priv->vport_meta_mask);
+ else
+ flow_dv_translate_item_source_vport(matcher, key,
+ priv->vport_id, mask);
+ return 0;
+}
+
+/**
+ * Add ICMP6 item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_icmp6(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_icmp6 *icmp6_m = item->mask;
+ const struct rte_flow_item_icmp6 *icmp6_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ misc_parameters_3);
+ void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3);
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMPV6);
+ if (!icmp6_v)
+ return;
+ if (!icmp6_m)
+ icmp6_m = &rte_flow_item_icmp6_mask;
+ /*
+ * Force the flow to match only non-fragmented IPv6 ICMPv6 packets.
+ * If only the protocol is specified, there is no need to match the
+ * fragment bit.
+ */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0);
+ MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_type, icmp6_m->type);
+ MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_type,
+ icmp6_v->type & icmp6_m->type);
+ MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_code, icmp6_m->code);
+ MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_code,
+ icmp6_v->code & icmp6_m->code);
+}
+
+/**
+ * Add ICMP item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_icmp(void *matcher, void *key,
+ const struct rte_flow_item *item,
+ int inner)
+{
+ const struct rte_flow_item_icmp *icmp_m = item->mask;
+ const struct rte_flow_item_icmp *icmp_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ misc_parameters_3);
+ void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3);
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMP);
+ if (!icmp_v)
+ return;
+ if (!icmp_m)
+ icmp_m = &rte_flow_item_icmp_mask;
+ /*
+ * Force the flow to match only non-fragmented IPv4 ICMP packets.
+ * If only the protocol is specified, there is no need to match the
+ * fragment bit.
+ */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0);
+ MLX5_SET(fte_match_set_misc3, misc3_m, icmp_type,
+ icmp_m->hdr.icmp_type);
+ MLX5_SET(fte_match_set_misc3, misc3_v, icmp_type,
+ icmp_v->hdr.icmp_type & icmp_m->hdr.icmp_type);
+ MLX5_SET(fte_match_set_misc3, misc3_m, icmp_code,
+ icmp_m->hdr.icmp_code);
+ MLX5_SET(fte_match_set_misc3, misc3_v, icmp_code,
+ icmp_v->hdr.icmp_code & icmp_m->hdr.icmp_code);
+}
+
+/**
+ * Add GTP item to matcher and to the value.
+ *
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_gtp(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner)
+{
+ const struct rte_flow_item_gtp *gtp_m = item->mask;
+ const struct rte_flow_item_gtp *gtp_v = item->spec;
+ void *headers_m;
+ void *headers_v;
+ void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ misc_parameters_3);
+ void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3);
+ uint16_t dport = RTE_GTPU_UDP_PORT;
+
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport);
+ }
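+ /*
+ * The UDP destination port is forced to the GTP-U well-known port
+ * (RTE_GTPU_UDP_PORT, i.e. 2152) only when the preceding UDP item
+ * did not already set one.
+ */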
+ if (!gtp_v)
+ return;
+ if (!gtp_m)
+ gtp_m = &rte_flow_item_gtp_mask;
+ MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_flags,
+ gtp_m->v_pt_rsv_flags);
+ MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_flags,
+ gtp_v->v_pt_rsv_flags & gtp_m->v_pt_rsv_flags);
+ MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_type, gtp_m->msg_type);
+ MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_type,
+ gtp_v->msg_type & gtp_m->msg_type);
+ MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_teid,
+ rte_be_to_cpu_32(gtp_m->teid));
+ MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_teid,
+ rte_be_to_cpu_32(gtp_v->teid & gtp_m->teid));
+}
+
+static uint32_t matcher_zero[MLX5_ST_SZ_DW(fte_match_param)] = { 0 };
+
+#define HEADER_IS_ZERO(match_criteria, headers) \
+ !(memcmp(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
+ matcher_zero, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \
+
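+/*
+ * HEADER_IS_ZERO() evaluates to 1 when the given headers section of the
+ * match criteria is all zeros, i.e. that criteria block does not need to
+ * be enabled in the matcher.
+ */
+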
+/**
+ * Calculate flow matcher enable bitmap.
+ *
+ * @param match_criteria
+ * Pointer to flow matcher criteria.
+ *
+ * @return
+ * Bitmap of enabled fields.
+ */
+static uint8_t
+flow_dv_matcher_enable(uint32_t *match_criteria)
+{
+ uint8_t match_criteria_enable;
+
+ match_criteria_enable =
+ (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
+ MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT;
+ match_criteria_enable |=
+ (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
+ MLX5_MATCH_CRITERIA_ENABLE_MISC_BIT;
+ match_criteria_enable |=
+ (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
+ MLX5_MATCH_CRITERIA_ENABLE_INNER_BIT;
+ match_criteria_enable |=
+ (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
+ MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT;
+ match_criteria_enable |=
+ (!HEADER_IS_ZERO(match_criteria, misc_parameters_3)) <<
+ MLX5_MATCH_CRITERIA_ENABLE_MISC3_BIT;
+ return match_criteria_enable;
+}
+
+/**
+ * Get a flow table.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] table_id
+ * Table id to use.
+ * @param[in] egress
+ * Direction of the table.
+ * @param[in] transfer
+ * E-Switch or NIC flow.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * Returns the table resource based on the index, NULL in case of failure.
+ */
+static struct mlx5_flow_tbl_resource *
+flow_dv_tbl_resource_get(struct rte_eth_dev *dev,
+ uint32_t table_id, uint8_t egress,
+ uint8_t transfer,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_tbl_resource *tbl;
+ union mlx5_flow_tbl_key table_key = {
+ {
+ .table_id = table_id,
+ .reserved = 0,
+ .domain = !!transfer,
+ .direction = !!egress,
+ }
+ };
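+ /*
+ * table_id, domain and direction are packed into the single 64-bit
+ * table_key.v64 used below as the flow table hash list key.
+ */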
+ struct mlx5_hlist_entry *pos = mlx5_hlist_lookup(sh->flow_tbls,
+ table_key.v64);
+ struct mlx5_flow_tbl_data_entry *tbl_data;
+ uint32_t idx = 0;
+ int ret;
+ void *domain;
+
+ if (pos) {
+ tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
+ entry);
+ tbl = &tbl_data->tbl;
+ rte_atomic32_inc(&tbl->refcnt);
+ return tbl;
+ }
+ tbl_data = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_JUMP], &idx);
+ if (!tbl_data) {
+ rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot allocate flow table data entry");
+ return NULL;
+ }
+ tbl_data->idx = idx;
+ tbl = &tbl_data->tbl;
+ pos = &tbl_data->entry;
+ if (transfer)
+ domain = sh->fdb_domain;
+ else if (egress)
+ domain = sh->tx_domain;
+ else
+ domain = sh->rx_domain;
+ tbl->obj = mlx5_glue->dr_create_flow_tbl(domain, table_id);
+ if (!tbl->obj) {
+ rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create flow table object");
+ mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx);
+ return NULL;
+ }
+ /*
+ * No multi-thread support now, but it is still better to initialize
+ * the reference count before inserting it into the hash list.
+ */
+ rte_atomic32_init(&tbl->refcnt);
+ /* Jump action reference count is initialized here. */
+ rte_atomic32_init(&tbl_data->jump.refcnt);
+ pos->key = table_key.v64;
+ ret = mlx5_hlist_insert(sh->flow_tbls, pos);
+ if (ret < 0) {
+ rte_flow_error_set(error, -ret,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot insert flow table data entry");
+ mlx5_glue->dr_destroy_flow_tbl(tbl->obj);
+ mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx);
+ }
+ rte_atomic32_inc(&tbl->refcnt);
+ return tbl;
+}
+
+/**
+ * Release a flow table.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] tbl
+ * Table resource to be released.
+ *
+ * @return
+ * Returns 0 if the table was released, 1 otherwise.
+ */
+static int
+flow_dv_tbl_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_tbl_resource *tbl)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_tbl_data_entry *tbl_data =
+ container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl);
+
+ if (!tbl)
+ return 0;
+ if (rte_atomic32_dec_and_test(&tbl->refcnt)) {
+ struct mlx5_hlist_entry *pos = &tbl_data->entry;
+
+ mlx5_glue->dr_destroy_flow_tbl(tbl->obj);
+ tbl->obj = NULL;
+ /* remove the entry from the hash list and free memory. */
+ mlx5_hlist_remove(sh->flow_tbls, pos);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_JUMP],
+ tbl_data->idx);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Register the flow matcher.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] matcher
+ * Pointer to flow matcher.
+ * @param[in, out] key
+ * Pointer to flow table key.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, otherwise -errno and errno is set.
+ */
+static int
+flow_dv_matcher_register(struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_matcher *matcher,
+ union mlx5_flow_tbl_key *key,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_matcher *cache_matcher;
+ struct mlx5dv_flow_matcher_attr dv_attr = {
+ .type = IBV_FLOW_ATTR_NORMAL,
+ .match_mask = (void *)&matcher->mask,
+ };
+ struct mlx5_flow_tbl_resource *tbl;
+ struct mlx5_flow_tbl_data_entry *tbl_data;
+
+ tbl = flow_dv_tbl_resource_get(dev, key->table_id, key->direction,
+ key->domain, error);
+ if (!tbl)
+ return -rte_errno; /* No need to refill the error info */
+ tbl_data = container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl);
+ /* Lookup from cache. */
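+ /*
+ * matcher->crc is a raw checksum of the match mask (computed in
+ * __flow_dv_translate()) and serves as a cheap filter before the
+ * full memcmp() of the mask buffers.
+ */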
+ LIST_FOREACH(cache_matcher, &tbl_data->matchers, next) {
+ if (matcher->crc == cache_matcher->crc &&
+ matcher->priority == cache_matcher->priority &&
+ !memcmp((const void *)matcher->mask.buf,
+ (const void *)cache_matcher->mask.buf,
+ cache_matcher->mask.size)) {
+ DRV_LOG(DEBUG,
+ "%s group %u priority %hd use %s "
+ "matcher %p: refcnt %d++",
+ key->domain ? "FDB" : "NIC", key->table_id,
+ cache_matcher->priority,
+ key->direction ? "tx" : "rx",
+ (void *)cache_matcher,
+ rte_atomic32_read(&cache_matcher->refcnt));
+ rte_atomic32_inc(&cache_matcher->refcnt);
+ dev_flow->handle->dvh.matcher = cache_matcher;
+ /* old matcher should not make the table ref++. */
+ flow_dv_tbl_resource_release(dev, tbl);
+ return 0;
+ }
+ }
+ /* Register new matcher. */
+ cache_matcher = rte_calloc(__func__, 1, sizeof(*cache_matcher), 0);
+ if (!cache_matcher) {
+ flow_dv_tbl_resource_release(dev, tbl);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate matcher memory");
+ }
+ *cache_matcher = *matcher;
+ dv_attr.match_criteria_enable =
+ flow_dv_matcher_enable(cache_matcher->mask.buf);
+ dv_attr.priority = matcher->priority;
+ if (key->direction)
+ dv_attr.flags |= IBV_FLOW_ATTR_FLAGS_EGRESS;
+ cache_matcher->matcher_object =
+ mlx5_glue->dv_create_flow_matcher(sh->ctx, &dv_attr, tbl->obj);
+ if (!cache_matcher->matcher_object) {
+ rte_free(cache_matcher);
+#ifdef HAVE_MLX5DV_DR
+ flow_dv_tbl_resource_release(dev, tbl);
+#endif
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create matcher");
+ }
+ /* Save the table information */
+ cache_matcher->tbl = tbl;
+ rte_atomic32_init(&cache_matcher->refcnt);
+ /* only matcher ref++, table ref++ already done above in get API. */
+ rte_atomic32_inc(&cache_matcher->refcnt);
+ LIST_INSERT_HEAD(&tbl_data->matchers, cache_matcher, next);
+ dev_flow->handle->dvh.matcher = cache_matcher;
+ DRV_LOG(DEBUG, "%s group %u priority %hd new %s matcher %p: refcnt %d",
+ key->domain ? "FDB" : "NIC", key->table_id,
+ cache_matcher->priority,
+ key->direction ? "tx" : "rx", (void *)cache_matcher,
+ rte_atomic32_read(&cache_matcher->refcnt));
+ return 0;
+}
+
+/**
+ * Find existing tag resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] tag_be24
+ * Tag value in big endian, right-shifted by 8 bits.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, otherwise -errno and errno is set.
+ */
+static int
+flow_dv_tag_resource_register
+ (struct rte_eth_dev *dev,
+ uint32_t tag_be24,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_tag_resource *cache_resource;
+ struct mlx5_hlist_entry *entry;
+
+ /* Lookup a matching resource from cache. */
+ entry = mlx5_hlist_lookup(sh->tag_table, (uint64_t)tag_be24);
+ if (entry) {
+ cache_resource = container_of
+ (entry, struct mlx5_flow_dv_tag_resource, entry);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->dvh.rix_tag = cache_resource->idx;
+ dev_flow->dv.tag_resource = cache_resource;
+ DRV_LOG(DEBUG, "cached tag resource %p: refcnt now %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+ }
+ /* Register new resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_TAG],
+ &dev_flow->handle->dvh.rix_tag);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ cache_resource->entry.key = (uint64_t)tag_be24;
+ cache_resource->action = mlx5_glue->dv_create_flow_action_tag(tag_be24);
+ if (!cache_resource->action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ if (mlx5_hlist_insert(sh->tag_table, &cache_resource->entry)) {
+ mlx5_glue->destroy_flow_action(cache_resource->action);
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, EEXIST,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot insert tag");
+ }
+ dev_flow->dv.tag_resource = cache_resource;
+ DRV_LOG(DEBUG, "new tag resource %p: refcnt now %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Release the tag.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param tag_idx
+ * Tag index.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_tag_release(struct rte_eth_dev *dev,
+ uint32_t tag_idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_tag_resource *tag;
+
+ tag = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx);
+ if (!tag)
+ return 0;
+ DRV_LOG(DEBUG, "port %u tag %p: refcnt %d--",
+ dev->data->port_id, (void *)tag,
+ rte_atomic32_read(&tag->refcnt));
+ if (rte_atomic32_dec_and_test(&tag->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action(tag->action));
+ mlx5_hlist_remove(sh->tag_table, &tag->entry);
+ DRV_LOG(DEBUG, "port %u tag %p: removed",
+ dev->data->port_id, (void *)tag);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Translate port ID action to vport.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] action
+ * Pointer to the port ID action.
+ * @param[out] dst_port_id
+ * The target port ID.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_translate_action_port_id(struct rte_eth_dev *dev,
+ const struct rte_flow_action *action,
+ uint32_t *dst_port_id,
+ struct rte_flow_error *error)
+{
+ uint32_t port;
+ struct mlx5_priv *priv;
+ const struct rte_flow_action_port_id *conf =
+ (const struct rte_flow_action_port_id *)action->conf;
+
+ port = conf->original ? dev->data->port_id : conf->id;
+ priv = mlx5_port_to_eswitch_info(port, false);
+ if (!priv)
+ return rte_flow_error_set(error, -rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "No eswitch info was found for port");
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+ /*
+ * This parameter is transferred to
+ * mlx5dv_dr_action_create_dest_ib_port().
+ */
+ *dst_port_id = priv->ibv_port;
+#else
+ /*
+ * Legacy mode, no LAG configuration is supported.
+ * This parameter is transferred to
+ * mlx5dv_dr_action_create_dest_vport().
+ */
+ *dst_port_id = priv->vport_id;
+#endif
+ return 0;
+}
+
+/**
+ * Create a counter with aging configuration.
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[in] count
+ * Pointer to the counter action configuration.
+ * @param[in] age
+ * Pointer to the aging action configuration.
+ *
+ * @return
+ * Index to flow counter on success, 0 otherwise.
+ */
+static uint32_t
+flow_dv_translate_create_counter(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_action_count *count,
+ const struct rte_flow_action_age *age)
+{
+ uint32_t counter;
+ struct mlx5_age_param *age_param;
+
+ counter = flow_dv_counter_alloc(dev,
+ count ? count->shared : 0,
+ count ? count->id : 0,
+ dev_flow->dv.group, !!age);
+ if (!counter || age == NULL)
+ return counter;
+ age_param = flow_dv_counter_idx_get_age(dev, counter);
+ age_param->context = age->context ? age->context :
+ (void *)(uintptr_t)(dev_flow->flow_idx);
+ /*
+ * The counter age accuracy may have a bit delay. Have 3/4
+ * second bias on the timeout in order to let it age in time.
+ */
+ age_param->timeout = age->timeout * 10 - MLX5_AGING_TIME_DELAY;
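+ /*
+ * E.g. an application timeout of 10 seconds (an assumed value for
+ * illustration) becomes 100 units of 0.1 sec, minus the bias above.
+ */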
+ age_param->port_id = dev->data->port_id;
+ /* Set expire time in unit of 0.1 sec. */
+ age_param->expire = age_param->timeout +
+ rte_rdtsc() / (rte_get_tsc_hz() / 10);
+ rte_atomic16_set(&age_param->state, AGE_CANDIDATE);
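+ /* From now on the counter is a candidate for the aging checks. */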
+ return counter;
+}
+
+/**
+ * Add Tx queue matcher
+ *
+ * @param[in] dev
+ * Pointer to the dev struct.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ */
+static void
+flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev,
+ void *matcher, void *key,
+ const struct rte_flow_item *item)
+{
+ const struct mlx5_rte_flow_item_tx_queue *queue_m;
+ const struct mlx5_rte_flow_item_tx_queue *queue_v;
+ void *misc_m =
+ MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters);
+ void *misc_v =
+ MLX5_ADDR_OF(fte_match_param, key, misc_parameters);
+ struct mlx5_txq_ctrl *txq;
+ uint32_t queue;
+
+ queue_m = (const void *)item->mask;
+ if (!queue_m)
+ return;
+ queue_v = (const void *)item->spec;
+ if (!queue_v)
+ return;
+ txq = mlx5_txq_get(dev, queue_v->queue);
+ if (!txq)
+ return;
+ queue = txq->obj->sq->id;
+ MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue);
+ MLX5_SET(fte_match_set_misc, misc_v, source_sqn,
+ queue & queue_m->queue);
+ mlx5_txq_release(dev, queue_v->queue);
+}
+
+/**
+ * Set the hash fields according to the @p flow information.
+ *
+ * @param[in] dev_flow
+ * Pointer to the mlx5_flow.
+ * @param[in] rss_desc
+ * Pointer to the mlx5_flow_rss_desc.
+ */
+static void
+flow_dv_hashfields_set(struct mlx5_flow *dev_flow,
+ struct mlx5_flow_rss_desc *rss_desc)
+{
+ uint64_t items = dev_flow->handle->layers;
+ int rss_inner = 0;
+ uint64_t rss_types = rte_eth_rss_hf_refine(rss_desc->types);
+
+ dev_flow->hash_fields = 0;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ if (rss_desc->level >= 2) {
+ dev_flow->hash_fields |= IBV_RX_HASH_INNER;
+ rss_inner = 1;
+ }
+#endif
+ if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV4)) ||
+ (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV4))) {
+ if (rss_types & MLX5_IPV4_LAYER_TYPES) {
+ if (rss_types & ETH_RSS_L3_SRC_ONLY)
+ dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV4;
+ else if (rss_types & ETH_RSS_L3_DST_ONLY)
+ dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV4;
+ else
+ dev_flow->hash_fields |= MLX5_IPV4_IBV_RX_HASH;
+ }
+ } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV6)) ||
+ (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV6))) {
+ if (rss_types & MLX5_IPV6_LAYER_TYPES) {
+ if (rss_types & ETH_RSS_L3_SRC_ONLY)
+ dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV6;
+ else if (rss_types & ETH_RSS_L3_DST_ONLY)
+ dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV6;
+ else
+ dev_flow->hash_fields |= MLX5_IPV6_IBV_RX_HASH;
+ }
+ }
+ if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_UDP)) ||
+ (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_UDP))) {
+ if (rss_types & ETH_RSS_UDP) {
+ if (rss_types & ETH_RSS_L4_SRC_ONLY)
+ dev_flow->hash_fields |=
+ IBV_RX_HASH_SRC_PORT_UDP;
+ else if (rss_types & ETH_RSS_L4_DST_ONLY)
+ dev_flow->hash_fields |=
+ IBV_RX_HASH_DST_PORT_UDP;
+ else
+ dev_flow->hash_fields |= MLX5_UDP_IBV_RX_HASH;
+ }
+ } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_TCP)) ||
+ (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_TCP))) {
+ if (rss_types & ETH_RSS_TCP) {
+ if (rss_types & ETH_RSS_L4_SRC_ONLY)
+ dev_flow->hash_fields |=
+ IBV_RX_HASH_SRC_PORT_TCP;
+ else if (rss_types & ETH_RSS_L4_DST_ONLY)
+ dev_flow->hash_fields |=
+ IBV_RX_HASH_DST_PORT_TCP;
+ else
+ dev_flow->hash_fields |= MLX5_TCP_IBV_RX_HASH;
+ }
+ }
+}
+
+/**
+ * Fill the flow with DV spec, lock free
+ * (mutex should be acquired by caller).
+ *
+ * @param[in] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] dev_flow
+ * Pointer to the sub flow.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+__flow_dv_translate(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *dev_conf = &priv->config;
+ struct rte_flow *flow = dev_flow->flow;
+ struct mlx5_flow_handle *handle = dev_flow->handle;
+ struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *)
+ priv->rss_desc)
+ [!!priv->flow_nested_idx];
+ uint64_t item_flags = 0;
+ uint64_t last_item = 0;
+ uint64_t action_flags = 0;
+ uint64_t priority = attr->priority;
+ struct mlx5_flow_dv_matcher matcher = {
+ .mask = {
+ .size = sizeof(matcher.mask.buf),
+ },
+ };
+ int actions_n = 0;
+ bool actions_end = false;
+ union {
+ struct mlx5_flow_dv_modify_hdr_resource res;
+ uint8_t len[sizeof(struct mlx5_flow_dv_modify_hdr_resource) +
+ sizeof(struct mlx5_modification_cmd) *
+ (MLX5_MAX_MODIFY_NUM + 1)];
+ } mhdr_dummy;
+ struct mlx5_flow_dv_modify_hdr_resource *mhdr_res = &mhdr_dummy.res;
+ const struct rte_flow_action_count *count = NULL;
+ const struct rte_flow_action_age *age = NULL;
+ union flow_dv_attr flow_attr = { .attr = 0 };
+ uint32_t tag_be;
+ union mlx5_flow_tbl_key tbl_key;
+ uint32_t modify_action_position = UINT32_MAX;
+ void *match_mask = matcher.mask.buf;
+ void *match_value = dev_flow->dv.value.buf;
+ uint8_t next_protocol = 0xff;
+ struct rte_vlan_hdr vlan = { 0 };
+ uint32_t table;
+ int ret = 0;
+
+ mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX :
+ MLX5DV_FLOW_TABLE_TYPE_NIC_RX;
+ ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group,
+ !!priv->fdb_def_rule, &table, error);
+ if (ret)
+ return ret;
+ dev_flow->dv.group = table;
+ if (attr->transfer)
+ mhdr_res->ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB;
+ if (priority == MLX5_FLOW_PRIO_RSVD)
+ priority = dev_conf->flow_prio - 1;
+ /* Number of modify-header actions must be 0 in case of a dirty stack. */
+ mhdr_res->actions_num = 0;
+ for (; !actions_end ; actions++) {
+ const struct rte_flow_action_queue *queue;
+ const struct rte_flow_action_rss *rss;
+ const struct rte_flow_action *action = actions;
+ const uint8_t *rss_key;
+ const struct rte_flow_action_jump *jump_data;
+ const struct rte_flow_action_meter *mtr;
+ struct mlx5_flow_tbl_resource *tbl;
+ uint32_t port_id = 0;
+ struct mlx5_flow_dv_port_id_action_resource port_id_resource;
+ int action_type = actions->type;
+ const struct rte_flow_action *found_action = NULL;
+ struct mlx5_flow_meter *fm = NULL;
+
+ switch (action_type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ if (flow_dv_translate_action_port_id(dev, action,
+ &port_id, error))
+ return -rte_errno;
+ port_id_resource.port_id = port_id;
+ MLX5_ASSERT(!handle->rix_port_id_action);
+ if (flow_dv_port_id_action_resource_register
+ (dev, &port_id_resource, dev_flow, error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.port_id_action->action;
+ action_flags |= MLX5_FLOW_ACTION_PORT_ID;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_PORT_ID;
+ break;
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ action_flags |= MLX5_FLOW_ACTION_FLAG;
+ dev_flow->handle->mark = 1;
+ if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ struct rte_flow_action_mark mark = {
+ .id = MLX5_FLOW_MARK_DEFAULT,
+ };
+
+ if (flow_dv_convert_action_mark(dev, &mark,
+ mhdr_res,
+ error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_MARK_EXT;
+ break;
+ }
+ tag_be = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT);
+ /*
+ * Only one FLAG or MARK is supported per device flow
+ * right now. So the pointer to the tag resource must be
+ * zero before the register process.
+ */
+ MLX5_ASSERT(!handle->dvh.rix_tag);
+ if (flow_dv_tag_resource_register(dev, tag_be,
+ dev_flow, error))
+ return -rte_errno;
+ MLX5_ASSERT(dev_flow->dv.tag_resource);
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.tag_resource->action;
+ break;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ action_flags |= MLX5_FLOW_ACTION_MARK;
+ dev_flow->handle->mark = 1;
+ if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ const struct rte_flow_action_mark *mark =
+ (const struct rte_flow_action_mark *)
+ actions->conf;
+
+ if (flow_dv_convert_action_mark(dev, mark,
+ mhdr_res,
+ error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_MARK_EXT;
+ break;
+ }
+ /* Fall-through */
+ case MLX5_RTE_FLOW_ACTION_TYPE_MARK:
+ /* Legacy (non-extensive) MARK action. */
+ tag_be = mlx5_flow_mark_set
+ (((const struct rte_flow_action_mark *)
+ (actions->conf))->id);
+ MLX5_ASSERT(!handle->dvh.rix_tag);
+ if (flow_dv_tag_resource_register(dev, tag_be,
+ dev_flow, error))
+ return -rte_errno;
+ MLX5_ASSERT(dev_flow->dv.tag_resource);
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.tag_resource->action;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ if (flow_dv_convert_action_set_meta
+ (dev, mhdr_res, attr,
+ (const struct rte_flow_action_set_meta *)
+ actions->conf, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_META;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ if (flow_dv_convert_action_set_tag
+ (dev, mhdr_res,
+ (const struct rte_flow_action_set_tag *)
+ actions->conf, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_TAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ action_flags |= MLX5_FLOW_ACTION_DROP;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ queue = actions->conf;
+ rss_desc->queue_num = 1;
+ rss_desc->queue[0] = queue->index;
+ action_flags |= MLX5_FLOW_ACTION_QUEUE;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ rss = actions->conf;
+ memcpy(rss_desc->queue, rss->queue,
+ rss->queue_num * sizeof(uint16_t));
+ rss_desc->queue_num = rss->queue_num;
+ /* NULL RSS key indicates default RSS key. */
+ rss_key = !rss->key ? rss_hash_default_key : rss->key;
+ memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN);
+ /*
+ * rss->level and rss->types should be set in advance
+ * when expanding items for RSS.
+ */
+ action_flags |= MLX5_FLOW_ACTION_RSS;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ if (!dev_conf->devx) {
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "count action not supported");
+ }
+ /* Save information first, will apply later. */
+ if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT)
+ count = action->conf;
+ else
+ age = action->conf;
+ action_flags |= MLX5_FLOW_ACTION_COUNT;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
+ dev_flow->dv.actions[actions_n++] =
+ priv->sh->pop_vlan_action;
+ action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+ if (!(action_flags &
+ MLX5_FLOW_ACTION_OF_SET_VLAN_VID))
+ flow_dev_get_vlan_info_from_items(items, &vlan);
+ vlan.eth_proto = rte_be_to_cpu_16
+ ((((const struct rte_flow_action_of_push_vlan *)
+ actions->conf)->ethertype));
+ found_action = mlx5_flow_find_action
+ (actions + 1,
+ RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID);
+ if (found_action)
+ mlx5_update_vlan_vid_pcp(found_action, &vlan);
+ found_action = mlx5_flow_find_action
+ (actions + 1,
+ RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP);
+ if (found_action)
+ mlx5_update_vlan_vid_pcp(found_action, &vlan);
+ if (flow_dv_create_action_push_vlan
+ (dev, attr, &vlan, dev_flow, error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.push_vlan_res->action;
+ action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
+ /* of_vlan_push action handled this action */
+ MLX5_ASSERT(action_flags &
+ MLX5_FLOW_ACTION_OF_PUSH_VLAN);
+ break;
+ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+ if (action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)
+ break;
+ flow_dev_get_vlan_info_from_items(items, &vlan);
+ mlx5_update_vlan_vid_pcp(actions, &vlan);
+ /* If no VLAN push - this is a modify header action */
+ if (flow_dv_convert_action_modify_vlan_vid
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ if (flow_dv_create_action_l2_encap(dev, actions,
+ dev_flow,
+ attr->transfer,
+ error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.encap_decap->verbs_action;
+ action_flags |= MLX5_FLOW_ACTION_ENCAP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP:
+ if (flow_dv_create_action_l2_decap(dev, dev_flow,
+ attr->transfer,
+ error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.encap_decap->verbs_action;
+ action_flags |= MLX5_FLOW_ACTION_DECAP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ /* Handle encap with preceding decap. */
+ if (action_flags & MLX5_FLOW_ACTION_DECAP) {
+ if (flow_dv_create_action_raw_encap
+ (dev, actions, dev_flow, attr, error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.encap_decap->verbs_action;
+ } else {
+ /* Handle encap without preceding decap. */
+ if (flow_dv_create_action_l2_encap
+ (dev, actions, dev_flow, attr->transfer,
+ error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.encap_decap->verbs_action;
+ }
+ action_flags |= MLX5_FLOW_ACTION_ENCAP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ while ((++action)->type == RTE_FLOW_ACTION_TYPE_VOID)
+ ;
+ if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) {
+ if (flow_dv_create_action_l2_decap
+ (dev, dev_flow, attr->transfer, error))
+ return -rte_errno;
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.encap_decap->verbs_action;
+ }
+ /* If decap is followed by encap, handle it at encap. */
+ action_flags |= MLX5_FLOW_ACTION_DECAP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ jump_data = action->conf;
+ ret = mlx5_flow_group_to_table(attr, dev_flow->external,
+ jump_data->group,
+ !!priv->fdb_def_rule,
+ &table, error);
+ if (ret)
+ return ret;
+ tbl = flow_dv_tbl_resource_get(dev, table,
+ attr->egress,
+ attr->transfer, error);
+ if (!tbl)
+ return rte_flow_error_set
+ (error, errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "cannot create jump action.");
+ if (flow_dv_jump_tbl_resource_register
+ (dev, tbl, dev_flow, error)) {
+ flow_dv_tbl_resource_release(dev, tbl);
+ return rte_flow_error_set
+ (error, errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "cannot create jump action.");
+ }
+ dev_flow->dv.actions[actions_n++] =
+ dev_flow->dv.jump->action;
+ action_flags |= MLX5_FLOW_ACTION_JUMP;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_JUMP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ if (flow_dv_convert_action_modify_mac
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
+ MLX5_FLOW_ACTION_SET_MAC_SRC :
+ MLX5_FLOW_ACTION_SET_MAC_DST;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ if (flow_dv_convert_action_modify_ipv4
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
+ MLX5_FLOW_ACTION_SET_IPV4_SRC :
+ MLX5_FLOW_ACTION_SET_IPV4_DST;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ if (flow_dv_convert_action_modify_ipv6
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
+ MLX5_FLOW_ACTION_SET_IPV6_SRC :
+ MLX5_FLOW_ACTION_SET_IPV6_DST;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ if (flow_dv_convert_action_modify_tp
+ (mhdr_res, actions, items,
+ &flow_attr, dev_flow, !!(action_flags &
+ MLX5_FLOW_ACTION_DECAP), error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
+ MLX5_FLOW_ACTION_SET_TP_SRC :
+ MLX5_FLOW_ACTION_SET_TP_DST;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ if (flow_dv_convert_action_modify_dec_ttl
+ (mhdr_res, items, &flow_attr, dev_flow,
+ !!(action_flags &
+ MLX5_FLOW_ACTION_DECAP), error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_DEC_TTL;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ if (flow_dv_convert_action_modify_ttl
+ (mhdr_res, actions, items, &flow_attr,
+ dev_flow, !!(action_flags &
+ MLX5_FLOW_ACTION_DECAP), error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_TTL;
+ break;
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ if (flow_dv_convert_action_modify_tcp_seq
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ?
+ MLX5_FLOW_ACTION_INC_TCP_SEQ :
+ MLX5_FLOW_ACTION_DEC_TCP_SEQ;
+ break;
+
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ if (flow_dv_convert_action_modify_tcp_ack
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= actions->type ==
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ?
+ MLX5_FLOW_ACTION_INC_TCP_ACK :
+ MLX5_FLOW_ACTION_DEC_TCP_ACK;
+ break;
+ case MLX5_RTE_FLOW_ACTION_TYPE_TAG:
+ if (flow_dv_convert_action_set_reg
+ (mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_TAG;
+ break;
+ case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG:
+ if (flow_dv_convert_action_copy_mreg
+ (dev, mhdr_res, actions, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_TAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ mtr = actions->conf;
+ if (!flow->meter) {
+ fm = mlx5_flow_meter_attach(priv, mtr->mtr_id,
+ attr, error);
+ if (!fm)
+ return rte_flow_error_set(error,
+ rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "meter not found "
+ "or invalid parameters");
+ flow->meter = fm->idx;
+ }
+ /* Set the meter action. */
+ if (!fm) {
+ fm = mlx5_ipool_get(priv->sh->ipool
+ [MLX5_IPOOL_MTR], flow->meter);
+ if (!fm)
+ return rte_flow_error_set(error,
+ rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "meter not found "
+ "or invalid parameters");
+ }
+ dev_flow->dv.actions[actions_n++] =
+ fm->mfts->meter_action;
+ action_flags |= MLX5_FLOW_ACTION_METER;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ if (flow_dv_convert_action_modify_ipv4_dscp(mhdr_res,
+ actions, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ if (flow_dv_convert_action_modify_ipv6_dscp(mhdr_res,
+ actions, error))
+ return -rte_errno;
+ action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_END:
+ actions_end = true;
+ if (mhdr_res->actions_num) {
+ /* create modify action if needed. */
+ if (flow_dv_modify_hdr_resource_register
+ (dev, mhdr_res, dev_flow, error))
+ return -rte_errno;
+ dev_flow->dv.actions[modify_action_position] =
+ handle->dvh.modify_hdr->verbs_action;
+ }
+ if (action_flags & MLX5_FLOW_ACTION_COUNT) {
+ flow->counter =
+ flow_dv_translate_create_counter(dev,
+ dev_flow, count, age);
+
+ if (!flow->counter)
+ return rte_flow_error_set
+ (error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ NULL,
+ "cannot create counter"
+ " object.");
+ dev_flow->dv.actions[actions_n++] =
+ (flow_dv_counter_get_by_idx(dev,
+ flow->counter, NULL))->action;
+ }
+ break;
+ default:
+ break;
+ }
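+ /*
+ * The first time a modify-header sub-action is queued, one slot in
+ * dv.actions[] is reserved here; it is filled with the registered
+ * modify-header action in the RTE_FLOW_ACTION_TYPE_END case.
+ */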
+ if (mhdr_res->actions_num &&
+ modify_action_position == UINT32_MAX)
+ modify_action_position = actions_n++;
+ }
+ dev_flow->dv.actions_n = actions_n;
+ dev_flow->act_flags = action_flags;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ int item_type = items->type;
+
+ switch (item_type) {
+ case RTE_FLOW_ITEM_TYPE_PORT_ID:
+ flow_dv_translate_item_port_id(dev, match_mask,
+ match_value, items);
+ last_item = MLX5_FLOW_ITEM_PORT_ID;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ flow_dv_translate_item_eth(match_mask, match_value,
+ items, tunnel,
+ dev_flow->dv.group);
+ matcher.priority = MLX5_PRIORITY_MAP_L2;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ flow_dv_translate_item_vlan(dev_flow,
+ match_mask, match_value,
+ items, tunnel,
+ dev_flow->dv.group);
+ matcher.priority = MLX5_PRIORITY_MAP_L2;
+ last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
+ MLX5_FLOW_LAYER_INNER_VLAN) :
+ (MLX5_FLOW_LAYER_OUTER_L2 |
+ MLX5_FLOW_LAYER_OUTER_VLAN);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ mlx5_flow_tunnel_ip_check(items, next_protocol,
+ &item_flags, &tunnel);
+ flow_dv_translate_item_ipv4(match_mask, match_value,
+ items, item_flags, tunnel,
+ dev_flow->dv.group);
+ matcher.priority = MLX5_PRIORITY_MAP_L3;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv4 *)
+ items->mask)->hdr.next_proto_id) {
+ next_protocol =
+ ((const struct rte_flow_item_ipv4 *)
+ (items->spec))->hdr.next_proto_id;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv4 *)
+ (items->mask))->hdr.next_proto_id;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ mlx5_flow_tunnel_ip_check(items, next_protocol,
+ &item_flags, &tunnel);
+ flow_dv_translate_item_ipv6(match_mask, match_value,
+ items, item_flags, tunnel,
+ dev_flow->dv.group);
+ matcher.priority = MLX5_PRIORITY_MAP_L3;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto) {
+ next_protocol =
+ ((const struct rte_flow_item_ipv6 *)
+ items->spec)->hdr.proto;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ flow_dv_translate_item_tcp(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = MLX5_PRIORITY_MAP_L4;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ flow_dv_translate_item_udp(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = MLX5_PRIORITY_MAP_L4;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP :
+ MLX5_FLOW_LAYER_OUTER_L4_UDP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ flow_dv_translate_item_gre(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE_KEY:
+ flow_dv_translate_item_gre_key(match_mask,
+ match_value, items);
+ last_item = MLX5_FLOW_LAYER_GRE_KEY;
+ break;
+ case RTE_FLOW_ITEM_TYPE_NVGRE:
+ flow_dv_translate_item_nvgre(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ flow_dv_translate_item_vxlan(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_VXLAN;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ flow_dv_translate_item_vxlan_gpe(match_mask,
+ match_value, items,
+ tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_VXLAN_GPE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GENEVE:
+ flow_dv_translate_item_geneve(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_GENEVE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ flow_dv_translate_item_mpls(match_mask, match_value,
+ items, last_item, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_MPLS;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MARK:
+ flow_dv_translate_item_mark(dev, match_mask,
+ match_value, items);
+ last_item = MLX5_FLOW_ITEM_MARK;
+ break;
+ case RTE_FLOW_ITEM_TYPE_META:
+ flow_dv_translate_item_meta(dev, match_mask,
+ match_value, attr, items);
+ last_item = MLX5_FLOW_ITEM_METADATA;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ICMP:
+ flow_dv_translate_item_icmp(match_mask, match_value,
+ items, tunnel);
+ last_item = MLX5_FLOW_LAYER_ICMP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_ICMP6:
+ flow_dv_translate_item_icmp6(match_mask, match_value,
+ items, tunnel);
+ last_item = MLX5_FLOW_LAYER_ICMP6;
+ break;
+ case RTE_FLOW_ITEM_TYPE_TAG:
+ flow_dv_translate_item_tag(dev, match_mask,
+ match_value, items);
+ last_item = MLX5_FLOW_ITEM_TAG;
+ break;
+ case MLX5_RTE_FLOW_ITEM_TYPE_TAG:
+ flow_dv_translate_mlx5_item_tag(dev, match_mask,
+ match_value, items);
+ last_item = MLX5_FLOW_ITEM_TAG;
+ break;
+ case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE:
+ flow_dv_translate_item_tx_queue(dev, match_mask,
+ match_value,
+ items);
+ last_item = MLX5_FLOW_ITEM_TX_QUEUE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GTP:
+ flow_dv_translate_item_gtp(match_mask, match_value,
+ items, tunnel);
+ matcher.priority = rss_desc->level >= 2 ?
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4;
+ last_item = MLX5_FLOW_LAYER_GTP;
+ break;
+ default:
+ break;
+ }
+ item_flags |= last_item;
+ }
+ /*
+ * When E-Switch mode is enabled, we have two cases where we need to
+ * set the source port manually.
+ * The first one is the NIC steering rule, and the second is the
+ * E-Switch rule where no port_id item was found. In both cases
+ * the source port is set according to the current port in use.
+ */
+ if (!(item_flags & MLX5_FLOW_ITEM_PORT_ID) &&
+ (priv->representor || priv->master)) {
+ if (flow_dv_translate_item_port_id(dev, match_mask,
+ match_value, NULL))
+ return -rte_errno;
+ }
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ MLX5_ASSERT(!flow_dv_check_valid_spec(matcher.mask.buf,
+ dev_flow->dv.value.buf));
+#endif
+ /*
+ * Layers may be already initialized from prefix flow if this dev_flow
+ * is the suffix flow.
+ */
+ handle->layers |= item_flags;
+ if (action_flags & MLX5_FLOW_ACTION_RSS)
+ flow_dv_hashfields_set(dev_flow, rss_desc);
+ /* Register matcher. */
+ matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf,
+ matcher.mask.size);
+ matcher.priority = mlx5_flow_adjust_priority(dev, priority,
+ matcher.priority);
+ /* The reserved field does not need to be set to 0 here. */
+ tbl_key.domain = attr->transfer;
+ tbl_key.direction = attr->egress;
+ tbl_key.table_id = dev_flow->dv.group;
+ if (flow_dv_matcher_register(dev, &matcher, &tbl_key, dev_flow, error))
+ return -rte_errno;
+ return 0;
+}
+
+/**
+ * Apply the flow to the NIC, lock free,
+ * (mutex should be acquired by caller).
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+__flow_dv_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow_dv_workspace *dv;
+ struct mlx5_flow_handle *dh;
+ struct mlx5_flow_handle_dv *dv_h;
+ struct mlx5_flow *dev_flow;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t handle_idx;
+ int n;
+ int err;
+ int idx;
+
+ for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) {
+ dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx];
+ dv = &dev_flow->dv;
+ dh = dev_flow->handle;
+ dv_h = &dh->dvh;
+ n = dv->actions_n;
+ if (dh->fate_action == MLX5_FLOW_FATE_DROP) {
+ if (dv->transfer) {
+ dv->actions[n++] = priv->sh->esw_drop_action;
+ } else {
+ struct mlx5_hrxq *drop_hrxq;
+ drop_hrxq = mlx5_hrxq_drop_new(dev);
+ if (!drop_hrxq) {
+ rte_flow_error_set
+ (error, errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot get drop hash queue");
+ goto error;
+ }
+ /*
+ * Drop queues will be released by the dedicated
+ * mlx5_hrxq_drop_release() function. Assign
+ * the special index to hrxq to mark that the
+ * queue has been allocated.
+ */
+ dh->rix_hrxq = UINT32_MAX;
+ dv->actions[n++] = drop_hrxq->action;
+ }
+ } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) {
+ struct mlx5_hrxq *hrxq;
+ uint32_t hrxq_idx;
+ struct mlx5_flow_rss_desc *rss_desc =
+ &((struct mlx5_flow_rss_desc *)priv->rss_desc)
+ [!!priv->flow_nested_idx];
+
+ MLX5_ASSERT(rss_desc->queue_num);
+ hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key,
+ MLX5_RSS_HASH_KEY_LEN,
+ dev_flow->hash_fields,
+ rss_desc->queue,
+ rss_desc->queue_num);
+ if (!hrxq_idx) {
+ hrxq_idx = mlx5_hrxq_new
+ (dev, rss_desc->key,
+ MLX5_RSS_HASH_KEY_LEN,
+ dev_flow->hash_fields,
+ rss_desc->queue,
+ rss_desc->queue_num,
+ !!(dh->layers &
+ MLX5_FLOW_LAYER_TUNNEL));
+ }
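+ /*
+ * hrxq_idx now refers either to an existing hash Rx queue that
+ * matches the key, hash fields and queue list, or to the one just
+ * created with tunnel offloads enabled for tunneled flows.
+ */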
+ hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ],
+ hrxq_idx);
+ if (!hrxq) {
+ rte_flow_error_set
+ (error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot get hash queue");
+ goto error;
+ }
+ dh->rix_hrxq = hrxq_idx;
+ dv->actions[n++] = hrxq->action;
+ }
+ dh->ib_flow =
+ mlx5_glue->dv_create_flow(dv_h->matcher->matcher_object,
+ (void *)&dv->value, n,
+ dv->actions);
+ if (!dh->ib_flow) {
+ rte_flow_error_set(error, errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "hardware refuses to create flow");
+ goto error;
+ }
+ if (priv->vmwa_context &&
+ dh->vf_vlan.tag && !dh->vf_vlan.created) {
+ /*
+ * The rule contains the VLAN pattern.
+ * For a VF we are going to create a VLAN
+ * interface to make the hypervisor set the
+ * correct e-Switch vport context.
+ */
+ mlx5_vlan_vmwa_acquire(dev, &dh->vf_vlan);
+ }
+ }
+ return 0;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ handle_idx, dh, next) {
+ /* hrxq is a union, don't clear it if the flag is not set. */
+ if (dh->rix_hrxq) {
+ if (dh->fate_action == MLX5_FLOW_FATE_DROP) {
+ mlx5_hrxq_drop_release(dev);
+ dh->rix_hrxq = 0;
+ } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) {
+ mlx5_hrxq_release(dev, dh->rix_hrxq);
+ dh->rix_hrxq = 0;
+ }
+ }
+ if (dh->vf_vlan.tag && dh->vf_vlan.created)
+ mlx5_vlan_vmwa_release(dev, &dh->vf_vlan);
+ }
+ rte_errno = err; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Release the flow matcher.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_matcher_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ struct mlx5_flow_dv_matcher *matcher = handle->dvh.matcher;
+
+ MLX5_ASSERT(matcher->matcher_object);
+ DRV_LOG(DEBUG, "port %u matcher %p: refcnt %d--",
+ dev->data->port_id, (void *)matcher,
+ rte_atomic32_read(&matcher->refcnt));
+ if (rte_atomic32_dec_and_test(&matcher->refcnt)) {
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (matcher->matcher_object));
+ LIST_REMOVE(matcher, next);
+ /* table ref-- in release interface. */
+ flow_dv_tbl_resource_release(dev, matcher->tbl);
+ rte_free(matcher);
+ DRV_LOG(DEBUG, "port %u matcher %p: removed",
+ dev->data->port_id, (void *)matcher);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release an encap/decap resource.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_encap_decap_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t idx = handle->dvh.rix_encap_decap;
+ struct mlx5_flow_dv_encap_decap_resource *cache_resource;
+
+ cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP],
+ idx);
+ if (!cache_resource)
+ return 0;
+ MLX5_ASSERT(cache_resource->verbs_action);
+ DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d--",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (cache_resource->verbs_action));
+ ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP],
+ &priv->sh->encaps_decaps, idx,
+ cache_resource, next);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], idx);
+ DRV_LOG(DEBUG, "encap/decap resource %p: removed",
+ (void *)cache_resource);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release a jump-to-table action resource.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_dv_jump_tbl_resource *cache_resource;
+ struct mlx5_flow_tbl_data_entry *tbl_data;
+
+ tbl_data = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_JUMP],
+ handle->rix_jump);
+ if (!tbl_data)
+ return 0;
+ cache_resource = &tbl_data->jump;
+ MLX5_ASSERT(cache_resource->action);
+ DRV_LOG(DEBUG, "jump table resource %p: refcnt %d--",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (cache_resource->action));
+ /* The jump action memory is freed inside the table release. */
+ flow_dv_tbl_resource_release(dev, &tbl_data->tbl);
+ DRV_LOG(DEBUG, "jump table resource %p: removed",
+ (void *)cache_resource);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release a modify-header resource.
+ *
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_modify_hdr_resource_release(struct mlx5_flow_handle *handle)
+{
+ struct mlx5_flow_dv_modify_hdr_resource *cache_resource =
+ handle->dvh.modify_hdr;
+
+ MLX5_ASSERT(cache_resource->verbs_action);
+ DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d--",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (cache_resource->verbs_action));
+ LIST_REMOVE(cache_resource, next);
+ rte_free(cache_resource);
+ DRV_LOG(DEBUG, "modify-header resource %p: removed",
+ (void *)cache_resource);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release port ID action resource.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_port_id_action_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_dv_port_id_action_resource *cache_resource;
+ uint32_t idx = handle->rix_port_id_action;
+
+ cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PORT_ID],
+ idx);
+ if (!cache_resource)
+ return 0;
+ MLX5_ASSERT(cache_resource->action);
+ DRV_LOG(DEBUG, "port ID action resource %p: refcnt %d--",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (cache_resource->action));
+ ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PORT_ID],
+ &priv->sh->port_id_action_list, idx,
+ cache_resource, next);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PORT_ID], idx);
+ DRV_LOG(DEBUG, "port id action resource %p: removed",
+ (void *)cache_resource);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release push vlan action resource.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+flow_dv_push_vlan_action_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint32_t idx = handle->dvh.rix_push_vlan;
+ struct mlx5_flow_dv_push_vlan_action_resource *cache_resource;
+
+ cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN],
+ idx);
+ if (!cache_resource)
+ return 0;
+ MLX5_ASSERT(cache_resource->action);
+ DRV_LOG(DEBUG, "push VLAN action resource %p: refcnt %d--",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (cache_resource->action));
+ ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN],
+ &priv->sh->push_vlan_action_list, idx,
+ cache_resource, next);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], idx);
+ DRV_LOG(DEBUG, "push vlan action resource %p: removed",
+ (void *)cache_resource);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Release the fate resource.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param handle
+ * Pointer to mlx5_flow_handle.
+ */
+static void
+flow_dv_fate_resource_release(struct rte_eth_dev *dev,
+ struct mlx5_flow_handle *handle)
+{
+ if (!handle->rix_fate)
+ return;
+ if (handle->fate_action == MLX5_FLOW_FATE_DROP)
+ mlx5_hrxq_drop_release(dev);
+ else if (handle->fate_action == MLX5_FLOW_FATE_QUEUE)
+ mlx5_hrxq_release(dev, handle->rix_hrxq);
+ else if (handle->fate_action == MLX5_FLOW_FATE_JUMP)
+ flow_dv_jump_tbl_resource_release(dev, handle);
+ else if (handle->fate_action == MLX5_FLOW_FATE_PORT_ID)
+ flow_dv_port_id_action_resource_release(dev, handle);
+ else
+ DRV_LOG(DEBUG, "Incorrect fate action:%d", handle->fate_action);
+ handle->rix_fate = 0;
+}
+
+/**
+ * Remove the flow from the NIC but keep it in memory.
+ * Lock free; the mutex should be acquired by the caller.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static void
+__flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_flow_handle *dh;
+ uint32_t handle_idx;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!flow)
+ return;
+ handle_idx = flow->dev_handles;
+ while (handle_idx) {
+ dh = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ handle_idx);
+ if (!dh)
+ return;
+ if (dh->ib_flow) {
+ claim_zero(mlx5_glue->dv_destroy_flow(dh->ib_flow));
+ dh->ib_flow = NULL;
+ }
+ if (dh->fate_action == MLX5_FLOW_FATE_DROP ||
+ dh->fate_action == MLX5_FLOW_FATE_QUEUE)
+ flow_dv_fate_resource_release(dev, dh);
+ if (dh->vf_vlan.tag && dh->vf_vlan.created)
+ mlx5_vlan_vmwa_release(dev, &dh->vf_vlan);
+ handle_idx = dh->next.next;
+ }
+}
+
+/**
+ * Remove the flow from the NIC and the memory.
+ * Lock free; the mutex should be acquired by the caller.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static void
+__flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_flow_handle *dev_handle;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!flow)
+ return;
+ __flow_dv_remove(dev, flow);
+ if (flow->counter) {
+ flow_dv_counter_release(dev, flow->counter);
+ flow->counter = 0;
+ }
+ if (flow->meter) {
+ struct mlx5_flow_meter *fm;
+
+ fm = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MTR],
+ flow->meter);
+ if (fm)
+ mlx5_flow_meter_detach(fm);
+ flow->meter = 0;
+ }
+ while (flow->dev_handles) {
+ uint32_t tmp_idx = flow->dev_handles;
+
+ dev_handle = mlx5_ipool_get(priv->sh->ipool
+ [MLX5_IPOOL_MLX5_FLOW], tmp_idx);
+ if (!dev_handle)
+ return;
+ flow->dev_handles = dev_handle->next.next;
+ if (dev_handle->dvh.matcher)
+ flow_dv_matcher_release(dev, dev_handle);
+ if (dev_handle->dvh.rix_encap_decap)
+ flow_dv_encap_decap_resource_release(dev, dev_handle);
+ if (dev_handle->dvh.modify_hdr)
+ flow_dv_modify_hdr_resource_release(dev_handle);
+ if (dev_handle->dvh.rix_push_vlan)
+ flow_dv_push_vlan_action_resource_release(dev,
+ dev_handle);
+ if (dev_handle->dvh.rix_tag)
+ flow_dv_tag_release(dev,
+ dev_handle->dvh.rix_tag);
+ flow_dv_fate_resource_release(dev, dev_handle);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ tmp_idx);
+ }
+}
+
+/**
+ * Query a dv flow rule for its statistics via devx.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] flow
+ * Pointer to the sub flow.
+ * @param[out] data
+ * Data retrieved by the query.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_query_count(struct rte_eth_dev *dev, struct rte_flow *flow,
+ void *data, struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_query_count *qc = data;
+
+ if (!priv->config.devx)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "counters are not supported");
+ if (flow->counter) {
+ uint64_t pkts, bytes;
+ struct mlx5_flow_counter *cnt;
+
+ cnt = flow_dv_counter_get_by_idx(dev, flow->counter,
+ NULL);
+ int err = _flow_dv_query_count(dev, flow->counter, &pkts,
+ &bytes);
+
+ if (err)
+ return rte_flow_error_set(error, -err,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot read counters");
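+ /* cnt->hits/cnt->bytes act as a baseline: the query below reports the
+  * delta against it, and a reset request simply moves the baseline
+  * forward to the current hardware readings.
+  */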
+ qc->hits_set = 1;
+ qc->bytes_set = 1;
+ qc->hits = pkts - cnt->hits;
+ qc->bytes = bytes - cnt->bytes;
+ if (qc->reset) {
+ cnt->hits = pkts;
+ cnt->bytes = bytes;
+ }
+ return 0;
+ }
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "counters are not available");
+}
+
+/**
+ * Query a flow.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+static int
+flow_dv_query(struct rte_eth_dev *dev,
+ struct rte_flow *flow __rte_unused,
+ const struct rte_flow_action *actions __rte_unused,
+ void *data __rte_unused,
+ struct rte_flow_error *error __rte_unused)
+{
+ int ret = -EINVAL;
+
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ ret = flow_dv_query_count(dev, flow, data, error);
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "action not supported");
+ }
+ }
+ return ret;
+}
+
+/**
+ * Destroy the meter table set.
+ * Lock free; the mutex should be acquired by the caller.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] tbl
+ * Pointer to the meter table set.
+ *
+ * @return
+ * Always 0.
+ */
+static int
+flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev,
+ struct mlx5_meter_domains_infos *tbl)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_meter_domains_infos *mtd =
+ (struct mlx5_meter_domains_infos *)tbl;
+
+ if (!mtd || !priv->config.dv_flow_en)
+ return 0;
+ if (mtd->ingress.policer_rules[RTE_MTR_DROPPED])
+ claim_zero(mlx5_glue->dv_destroy_flow
+ (mtd->ingress.policer_rules[RTE_MTR_DROPPED]));
+ if (mtd->egress.policer_rules[RTE_MTR_DROPPED])
+ claim_zero(mlx5_glue->dv_destroy_flow
+ (mtd->egress.policer_rules[RTE_MTR_DROPPED]));
+ if (mtd->transfer.policer_rules[RTE_MTR_DROPPED])
+ claim_zero(mlx5_glue->dv_destroy_flow
+ (mtd->transfer.policer_rules[RTE_MTR_DROPPED]));
+ if (mtd->egress.color_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->egress.color_matcher));
+ if (mtd->egress.any_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->egress.any_matcher));
+ if (mtd->egress.tbl)
+ flow_dv_tbl_resource_release(dev, mtd->egress.tbl);
+ if (mtd->egress.sfx_tbl)
+ flow_dv_tbl_resource_release(dev, mtd->egress.sfx_tbl);
+ if (mtd->ingress.color_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->ingress.color_matcher));
+ if (mtd->ingress.any_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->ingress.any_matcher));
+ if (mtd->ingress.tbl)
+ flow_dv_tbl_resource_release(dev, mtd->ingress.tbl);
+ if (mtd->ingress.sfx_tbl)
+ flow_dv_tbl_resource_release(dev, mtd->ingress.sfx_tbl);
+ if (mtd->transfer.color_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->transfer.color_matcher));
+ if (mtd->transfer.any_matcher)
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (mtd->transfer.any_matcher));
+ if (mtd->transfer.tbl)
+ flow_dv_tbl_resource_release(dev, mtd->transfer.tbl);
+ if (mtd->transfer.sfx_tbl)
+ flow_dv_tbl_resource_release(dev, mtd->transfer.sfx_tbl);
+ if (mtd->drop_actn)
+ claim_zero(mlx5_glue->destroy_flow_action(mtd->drop_actn));
+ rte_free(mtd);
+ return 0;
+}
+
+/* Number of meter flow actions: count and jump, or count and drop. */
+#define METER_ACTIONS 2
+
+/**
+ * Create the meter table and suffix table for the specified domain.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in,out] mtb
+ * Pointer to DV meter table set.
+ * @param[in] egress
+ * Table attribute.
+ * @param[in] transfer
+ * Table attribute.
+ * @param[in] color_reg_c_idx
+ * Reg C index for color match.
+ *
+ * @return
+ * 0 on success, -1 otherwise and rte_errno is set.
+ */
+static int
+flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev,
+ struct mlx5_meter_domains_infos *mtb,
+ uint8_t egress, uint8_t transfer,
+ uint32_t color_reg_c_idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_match_params mask = {
+ .size = sizeof(mask.buf),
+ };
+ struct mlx5_flow_dv_match_params value = {
+ .size = sizeof(value.buf),
+ };
+ struct mlx5dv_flow_matcher_attr dv_attr = {
+ .type = IBV_FLOW_ATTR_NORMAL,
+ .priority = 0,
+ .match_criteria_enable = 0,
+ .match_mask = (void *)&mask,
+ };
+ void *actions[METER_ACTIONS];
+ struct mlx5_meter_domain_info *dtb;
+ struct rte_flow_error error;
+ int i = 0;
+
+ if (transfer)
+ dtb = &mtb->transfer;
+ else if (egress)
+ dtb = &mtb->egress;
+ else
+ dtb = &mtb->ingress;
+ /* Create the meter table with METER level. */
+ dtb->tbl = flow_dv_tbl_resource_get(dev, MLX5_FLOW_TABLE_LEVEL_METER,
+ egress, transfer, &error);
+ if (!dtb->tbl) {
+ DRV_LOG(ERR, "Failed to create meter policer table.");
+ return -1;
+ }
+ /* Create the meter suffix table with SUFFIX level. */
+ dtb->sfx_tbl = flow_dv_tbl_resource_get(dev,
+ MLX5_FLOW_TABLE_LEVEL_SUFFIX,
+ egress, transfer, &error);
+ if (!dtb->sfx_tbl) {
+ DRV_LOG(ERR, "Failed to create meter suffix table.");
+ return -1;
+ }
+ /* Create matchers, Any and Color. */
+ dv_attr.priority = 3;
+ dv_attr.match_criteria_enable = 0;
+ dtb->any_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx,
+ &dv_attr,
+ dtb->tbl->obj);
+ if (!dtb->any_matcher) {
+ DRV_LOG(ERR, "Failed to create meter"
+ " policer default matcher.");
+ goto error_exit;
+ }
+ dv_attr.priority = 0;
+ dv_attr.match_criteria_enable =
+ 1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT;
+ flow_dv_match_meta_reg(mask.buf, value.buf, color_reg_c_idx,
+ rte_col_2_mlx5_col(RTE_COLORS), UINT8_MAX);
+ dtb->color_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx,
+ &dv_attr,
+ dtb->tbl->obj);
+ if (!dtb->color_matcher) {
+ DRV_LOG(ERR, "Failed to create meter policer color matcher.");
+ goto error_exit;
+ }
+ if (mtb->count_actns[RTE_MTR_DROPPED])
+ actions[i++] = mtb->count_actns[RTE_MTR_DROPPED];
+ actions[i++] = mtb->drop_actn;
+ /* Default rule: lowest priority, match any, actions: drop. */
+ dtb->policer_rules[RTE_MTR_DROPPED] =
+ mlx5_glue->dv_create_flow(dtb->any_matcher,
+ (void *)&value, i, actions);
+ if (!dtb->policer_rules[RTE_MTR_DROPPED]) {
+ DRV_LOG(ERR, "Failed to create meter policer drop rule.");
+ goto error_exit;
+ }
+ return 0;
+error_exit:
+ return -1;
+}
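+
+/*
+ * Design note (derived from the code above): the color matcher is created at
+ * priority 0 and matches the meter color register, while the "any" matcher
+ * sits at the lower priority 3 and backs it with a catch-all drop rule. The
+ * per-color policer rules added later use the color matcher; anything that
+ * does not match them falls through to the default drop rule.
+ */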
+
+/**
+ * Create the needed meter and suffix tables.
+ * Lock free; the mutex should be acquired by the caller.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] fm
+ * Pointer to the flow meter.
+ *
+ * @return
+ * Pointer to table set on success, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_meter_domains_infos *
+flow_dv_create_mtr_tbl(struct rte_eth_dev *dev,
+ const struct mlx5_flow_meter *fm)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_meter_domains_infos *mtb;
+ int ret;
+ int i;
+
+ if (!priv->mtr_en) {
+ rte_errno = ENOTSUP;
+ return NULL;
+ }
+ mtb = rte_calloc(__func__, 1, sizeof(*mtb), 0);
+ if (!mtb) {
+ DRV_LOG(ERR, "Failed to allocate memory for meter.");
+ return NULL;
+ }
+ /* Create meter count actions */
+ for (i = 0; i <= RTE_MTR_DROPPED; i++) {
+ struct mlx5_flow_counter *cnt;
+ if (!fm->policer_stats.cnt[i])
+ continue;
+ cnt = flow_dv_counter_get_by_idx(dev,
+ fm->policer_stats.cnt[i], NULL);
+ mtb->count_actns[i] = cnt->action;
+ }
+ /* Create drop action. */
+ mtb->drop_actn = mlx5_glue->dr_create_flow_action_drop();
+ if (!mtb->drop_actn) {
+ DRV_LOG(ERR, "Failed to create drop action.");
+ goto error_exit;
+ }
+ /* Egress meter table. */
+ ret = flow_dv_prepare_mtr_tables(dev, mtb, 1, 0, priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to prepare egress meter table.");
+ goto error_exit;
+ }
+ /* Ingress meter table. */
+ ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 0, priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to prepare ingress meter table.");
+ goto error_exit;
+ }
+ /* FDB meter table. */
+ if (priv->config.dv_esw_en) {
+ ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 1,
+ priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to prepare fdb meter table.");
+ goto error_exit;
+ }
+ }
+ return mtb;
+error_exit:
+ flow_dv_destroy_mtr_tbl(dev, mtb);
+ return NULL;
+}
+
+/**
+ * Destroy domain policer rule.
+ *
+ * @param[in] dt
+ * Pointer to domain table.
+ */
+static void
+flow_dv_destroy_domain_policer_rule(struct mlx5_meter_domain_info *dt)
+{
+ int i;
+
+ for (i = 0; i < RTE_MTR_DROPPED; i++) {
+ if (dt->policer_rules[i]) {
+ claim_zero(mlx5_glue->dv_destroy_flow
+ (dt->policer_rules[i]));
+ dt->policer_rules[i] = NULL;
+ }
+ }
+ if (dt->jump_actn) {
+ claim_zero(mlx5_glue->destroy_flow_action(dt->jump_actn));
+ dt->jump_actn = NULL;
+ }
+}
+
+/**
+ * Destroy policer rules.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] fm
+ * Pointer to flow meter structure.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ *
+ * @return
+ * Always 0.
+ */
+static int
+flow_dv_destroy_policer_rules(struct rte_eth_dev *dev __rte_unused,
+ const struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr)
+{
+ struct mlx5_meter_domains_infos *mtb = fm ? fm->mfts : NULL;
+
+ if (!mtb)
+ return 0;
+ if (attr->egress)
+ flow_dv_destroy_domain_policer_rule(&mtb->egress);
+ if (attr->ingress)
+ flow_dv_destroy_domain_policer_rule(&mtb->ingress);
+ if (attr->transfer)
+ flow_dv_destroy_domain_policer_rule(&mtb->transfer);
+ return 0;
+}
+
+/**
+ * Create the meter policer rules for the specified domain.
+ *
+ * @param[in] fm
+ * Pointer to flow meter structure.
+ * @param[in] dtb
+ * Pointer to the DV meter domain table set.
+ * @param[in] mtr_reg_c
+ * Color match REG_C.
+ *
+ * @return
+ * 0 on success, -1 otherwise.
+ */
+static int
+flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm,
+ struct mlx5_meter_domain_info *dtb,
+ uint8_t mtr_reg_c)
+{
+ struct mlx5_flow_dv_match_params matcher = {
+ .size = sizeof(matcher.buf),
+ };
+ struct mlx5_flow_dv_match_params value = {
+ .size = sizeof(value.buf),
+ };
+ struct mlx5_meter_domains_infos *mtb = fm->mfts;
+ void *actions[METER_ACTIONS];
+ int i;
+
+ /* Create jump action. */
+ if (!dtb->jump_actn)
+ dtb->jump_actn =
+ mlx5_glue->dr_create_flow_action_dest_flow_tbl
+ (dtb->sfx_tbl->obj);
+ if (!dtb->jump_actn) {
+ DRV_LOG(ERR, "Failed to create policer jump action.");
+ goto error;
+ }
+ for (i = 0; i < RTE_MTR_DROPPED; i++) {
+ int j = 0;
+
+ flow_dv_match_meta_reg(matcher.buf, value.buf, mtr_reg_c,
+ rte_col_2_mlx5_col(i), UINT8_MAX);
+ if (mtb->count_actns[i])
+ actions[j++] = mtb->count_actns[i];
+ if (fm->action[i] == MTR_POLICER_ACTION_DROP)
+ actions[j++] = mtb->drop_actn;
+ else
+ actions[j++] = dtb->jump_actn;
+ dtb->policer_rules[i] =
+ mlx5_glue->dv_create_flow(dtb->color_matcher,
+ (void *)&value,
+ j, actions);
+ if (!dtb->policer_rules[i]) {
+ DRV_LOG(ERR, "Failed to create policer rule.");
+ goto error;
+ }
+ }
+ return 0;
+error:
+ rte_errno = errno;
+ return -1;
+}
+
+/**
+ * Create policer rules.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] fm
+ * Pointer to flow meter structure.
+ * @param[in] attr
+ * Pointer to flow attributes.
+ *
+ * @return
+ * 0 on success, -1 otherwise.
+ */
+static int
+flow_dv_create_policer_rules(struct rte_eth_dev *dev,
+ struct mlx5_flow_meter *fm,
+ const struct rte_flow_attr *attr)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_meter_domains_infos *mtb = fm->mfts;
+ int ret;
+
+ if (attr->egress) {
+ ret = flow_dv_create_policer_forward_rule(fm, &mtb->egress,
+ priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to create egress policer.");
+ goto error;
+ }
+ }
+ if (attr->ingress) {
+ ret = flow_dv_create_policer_forward_rule(fm, &mtb->ingress,
+ priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to create ingress policer.");
+ goto error;
+ }
+ }
+ if (attr->transfer) {
+ ret = flow_dv_create_policer_forward_rule(fm, &mtb->transfer,
+ priv->mtr_color_reg);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to create transfer policer.");
+ goto error;
+ }
+ }
+ return 0;
+error:
+ flow_dv_destroy_policer_rules(dev, fm, attr);
+ return -1;
+}
+
+/**
+ * Query a devx counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index of the flow counter.
+ * @param[in] clear
+ * Set to clear the counter statistics.
+ * @param[out] pkts
+ * The statistics value of packets.
+ * @param[out] bytes
+ * The statistics value of bytes.
+ *
+ * @return
+ * 0 on success, -1 otherwise.
+ */
+static int
+flow_dv_counter_query(struct rte_eth_dev *dev, uint32_t counter, bool clear,
+ uint64_t *pkts, uint64_t *bytes)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_counter *cnt;
+ uint64_t inn_pkts, inn_bytes;
+ int ret;
+
+ if (!priv->config.devx)
+ return -1;
+
+ ret = _flow_dv_query_count(dev, counter, &inn_pkts, &inn_bytes);
+ if (ret)
+ return -1;
+ cnt = flow_dv_counter_get_by_idx(dev, counter, NULL);
+ *pkts = inn_pkts - cnt->hits;
+ *bytes = inn_bytes - cnt->bytes;
+ if (clear) {
+ cnt->hits = inn_pkts;
+ cnt->bytes = inn_bytes;
+ }
+ return 0;
+}
+
+/**
+ * Get aged-out flows.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] context
+ * The address of an array of pointers to the aged-out flows contexts.
+ * @param[in] nb_contexts
+ * The length of context array pointers.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL. Initialized in case of
+ * error only.
+ *
+ * @return
+ * The number of aged-out flow contexts on success, a negative errno
+ * value otherwise. If nb_contexts is 0, the total number of aged-out
+ * contexts is returned; otherwise the number of aged-out flows reported
+ * in the context array is returned.
+ * @note: only stub for now
+ */
+static int
+flow_get_aged_flows(struct rte_eth_dev *dev,
+ void **context,
+ uint32_t nb_contexts,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_age_info *age_info;
+ struct mlx5_age_param *age_param;
+ struct mlx5_flow_counter *counter;
+ int nb_flows = 0;
+
+ if (nb_contexts && !context)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "The context array must not be NULL"
+ " when nb_contexts is not 0");
+ age_info = GET_PORT_AGE_INFO(priv);
+ rte_spinlock_lock(&age_info->aged_sl);
+ TAILQ_FOREACH(counter, &age_info->aged_counters, next) {
+ nb_flows++;
+ if (nb_contexts) {
+ age_param = MLX5_CNT_TO_AGE(counter);
+ context[nb_flows - 1] = age_param->context;
+ if (!(--nb_contexts))
+ break;
+ }
+ }
+ rte_spinlock_unlock(&age_info->aged_sl);
+ MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
+ return nb_flows;
+}
+
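+/*
+ * Illustrative usage sketch (not part of the driver): applications reach the
+ * callback above through the generic rte_flow API. The two-step pattern below
+ * (query the count first, then fetch the contexts) follows from the return
+ * semantics documented above; "port_id" and the context type are chosen by
+ * the application.
+ *
+ *	int n = rte_flow_get_aged_flows(port_id, NULL, 0, &error);
+ *	if (n > 0) {
+ *		void **ctx = malloc(n * sizeof(*ctx));
+ *
+ *		n = rte_flow_get_aged_flows(port_id, ctx, n, &error);
+ *		// each ctx[i] is the context supplied in the AGE action conf
+ *	}
+ */
+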
+/*
+ * Mutex-protected thunk to lock-free __flow_dv_translate().
+ */
+static int
+flow_dv_translate(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ int ret;
+
+ flow_dv_shared_lock(dev);
+ ret = __flow_dv_translate(dev, dev_flow, attr, items, actions, error);
+ flow_dv_shared_unlock(dev);
+ return ret;
+}
+
+/*
+ * Mutex-protected thunk to lock-free __flow_dv_apply().
+ */
+static int
+flow_dv_apply(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ struct rte_flow_error *error)
+{
+ int ret;
+
+ flow_dv_shared_lock(dev);
+ ret = __flow_dv_apply(dev, flow, error);
+ flow_dv_shared_unlock(dev);
+ return ret;
+}
+
+/*
+ * Mutex-protected thunk to lock-free __flow_dv_remove().
+ */
+static void
+flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ flow_dv_shared_lock(dev);
+ __flow_dv_remove(dev, flow);
+ flow_dv_shared_unlock(dev);
+}
+
+/*
+ * Mutex-protected thunk to lock-free __flow_dv_destroy().
+ */
+static void
+flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ flow_dv_shared_lock(dev);
+ __flow_dv_destroy(dev, flow);
+ flow_dv_shared_unlock(dev);
+}
+
+/*
+ * Mutex-protected thunk to lock-free flow_dv_counter_alloc().
+ */
+static uint32_t
+flow_dv_counter_allocate(struct rte_eth_dev *dev)
+{
+ uint32_t cnt;
+
+ flow_dv_shared_lock(dev);
+ cnt = flow_dv_counter_alloc(dev, 0, 0, 1, 0);
+ flow_dv_shared_unlock(dev);
+ return cnt;
+}
+
+/*
+ * Mutex-protected thunk to lock-free flow_dv_counter_release().
+ */
+static void
+flow_dv_counter_free(struct rte_eth_dev *dev, uint32_t cnt)
+{
+ flow_dv_shared_lock(dev);
+ flow_dv_counter_release(dev, cnt);
+ flow_dv_shared_unlock(dev);
+}
+
+const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops = {
+ .validate = flow_dv_validate,
+ .prepare = flow_dv_prepare,
+ .translate = flow_dv_translate,
+ .apply = flow_dv_apply,
+ .remove = flow_dv_remove,
+ .destroy = flow_dv_destroy,
+ .query = flow_dv_query,
+ .create_mtr_tbls = flow_dv_create_mtr_tbl,
+ .destroy_mtr_tbls = flow_dv_destroy_mtr_tbl,
+ .create_policer_rules = flow_dv_create_policer_rules,
+ .destroy_policer_rules = flow_dv_destroy_policer_rules,
+ .counter_alloc = flow_dv_counter_allocate,
+ .counter_free = flow_dv_counter_free,
+ .counter_query = flow_dv_counter_query,
+ .get_aged_flows = flow_get_aged_flows,
+};
+
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c
new file mode 100644
index 000000000..08f7dc8d1
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c
@@ -0,0 +1,1292 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+#include <math.h>
+
+#include <rte_tailq.h>
+#include <rte_malloc.h>
+#include <rte_mtr.h>
+#include <rte_mtr_driver.h>
+
+#include <mlx5_devx_cmds.h>
+
+#include "mlx5.h"
+#include "mlx5_flow.h"
+
+/**
+ * Create the meter action.
+ *
+ * @param priv
+ * Pointer to mlx5_priv.
+ * @param[in] fm
+ * Pointer to flow meter to be converted.
+ *
+ * @return
+ * Pointer to the meter action on success, NULL otherwise.
+ */
+static void *
+mlx5_flow_meter_action_create(struct mlx5_priv *priv,
+ struct mlx5_flow_meter *fm)
+{
+#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER
+ struct mlx5dv_dr_flow_meter_attr mtr_init;
+ void *attr = fm->mfts->fmp;
+ struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm =
+ &fm->profile->srtcm_prm;
+
+ fm->mfts->fmp_size = MLX5_ST_SZ_BYTES(flow_meter_parameters);
+ memset(attr, 0, fm->mfts->fmp_size);
+ MLX5_SET(flow_meter_parameters, attr, valid, 1);
+ MLX5_SET(flow_meter_parameters, attr, bucket_overflow, 1);
+ MLX5_SET(flow_meter_parameters, attr,
+ start_color, MLX5_FLOW_COLOR_GREEN);
+ MLX5_SET(flow_meter_parameters, attr, both_buckets_on_green, 0);
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_exponent, srtcm->cbs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_mantissa, srtcm->cbs_mantissa);
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_exponent, srtcm->cir_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_mantissa, srtcm->cir_mantissa);
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_exponent, srtcm->ebs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_mantissa, srtcm->ebs_mantissa);
+ mtr_init.next_table =
+ fm->transfer ? fm->mfts->transfer.tbl->obj :
+ fm->egress ? fm->mfts->egress.tbl->obj :
+ fm->mfts->ingress.tbl->obj;
+ mtr_init.reg_c_index = priv->mtr_color_reg - REG_C_0;
+ mtr_init.flow_meter_parameter = fm->mfts->fmp;
+ mtr_init.flow_meter_parameter_sz = fm->mfts->fmp_size;
+ mtr_init.active = fm->active_state;
+ return mlx5_glue->dv_create_flow_action_meter(&mtr_init);
+#else
+ (void)priv;
+ (void)fm;
+ return NULL;
+#endif
+}
+
+/**
+ * Find meter profile by id.
+ *
+ * @param priv
+ * Pointer to mlx5_priv.
+ * @param meter_profile_id
+ * Meter profile id.
+ *
+ * @return
+ * Pointer to the profile found on success, NULL otherwise.
+ */
+static struct mlx5_flow_meter_profile *
+mlx5_flow_meter_profile_find(struct mlx5_priv *priv, uint32_t meter_profile_id)
+{
+ struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles;
+ struct mlx5_flow_meter_profile *fmp;
+
+ TAILQ_FOREACH(fmp, fmps, next)
+ if (meter_profile_id == fmp->meter_profile_id)
+ return fmp;
+ return NULL;
+}
+
+/**
+ * Validate the MTR profile.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_profile_id
+ * Meter profile id.
+ * @param[in] profile
+ * Pointer to meter profile detail.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_profile_validate(struct rte_eth_dev *dev,
+ uint32_t meter_profile_id,
+ struct rte_mtr_meter_profile *profile,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter_profile *fmp;
+
+ /* Profile must not be NULL. */
+ if (profile == NULL)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE,
+ NULL, "Meter profile is null.");
+ /* Meter profile ID must be valid. */
+ if (meter_profile_id == UINT32_MAX)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL, "Meter profile id not valid.");
+ /* Meter profile must not exist. */
+ fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id);
+ if (fmp)
+ return -rte_mtr_error_set(error, EEXIST,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL,
+ "Meter profile already exists.");
+ if (profile->alg == RTE_MTR_SRTCM_RFC2697) {
+ if (priv->config.hca_attr.qos.srtcm_sup) {
+ /* Verify support for flow meter parameters. */
+ if (profile->srtcm_rfc2697.cir > 0 &&
+ profile->srtcm_rfc2697.cir <= MLX5_SRTCM_CIR_MAX &&
+ profile->srtcm_rfc2697.cbs > 0 &&
+ profile->srtcm_rfc2697.cbs <= MLX5_SRTCM_CBS_MAX &&
+ profile->srtcm_rfc2697.ebs <= MLX5_SRTCM_EBS_MAX)
+ return 0;
+ else
+ return -rte_mtr_error_set
+ (error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS,
+ NULL,
+ profile->srtcm_rfc2697.ebs ?
+ "Metering value ebs must be 0." :
+ "Invalid metering parameters.");
+ }
+ }
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE,
+ NULL, "Metering algorithm not supported.");
+}
+
+/**
+ * Calculate mantissa and exponent for cir.
+ *
+ * @param[in] cir
+ * Value to be calculated.
+ * @param[out] man
+ * Pointer to the mantissa.
+ * @param[out] exp
+ * Pointer to the exp.
+ */
+static void
+mlx5_flow_meter_cir_man_exp_calc(int64_t cir, uint8_t *man, uint8_t *exp)
+{
+ int64_t _cir;
+ int64_t delta = INT64_MAX;
+ uint8_t _man = 0;
+ uint8_t _exp = 0;
+ uint64_t m, e;
+
+ for (m = 0; m <= 0xFF; m++) { /* man width 8 bit */
+ for (e = 0; e <= 0x1F; e++) { /* exp width 5bit */
+ _cir = (1000000000ULL * m) >> e;
+ if (llabs(cir - _cir) <= delta) {
+ delta = llabs(cir - _cir);
+ _man = m;
+ _exp = e;
+ }
+ }
+ }
+ *man = _man;
+ *exp = _exp;
+}
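+
+/*
+ * Worked example for the search above (illustrative only): the requested CIR
+ * comes from rte_mtr in bytes/second and is approximated by
+ * mantissa * 1G / 2^exponent. For cir = 125000000 B/s (1 Gbit/s) an exact
+ * encoding exists, e.g. mantissa = 128, exponent = 10:
+ * 128 * 1000000000 >> 10 = 125000000 B/s.
+ */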
+
+/**
+ * Calculate mantissa and exponent for xbs.
+ *
+ * @param[in] xbs
+ * Value to be calculated.
+ * @param[out] man
+ * Pointer to the mantissa.
+ * @param[out] exp
+ * Pointer to the exp.
+ */
+static void
+mlx5_flow_meter_xbs_man_exp_calc(uint64_t xbs, uint8_t *man, uint8_t *exp)
+{
+ int _exp;
+ double _man;
+
+ /* Special case xbs == 0: both exp and mantissa are 0. */
+ if (xbs == 0) {
+ *man = 0;
+ *exp = 0;
+ return;
+ }
+ /* xbs = xbs_mantissa * 2^xbs_exponent */
+ _man = frexp(xbs, &_exp);
+ _man = _man * pow(2, MLX5_MAN_WIDTH);
+ _exp = _exp - MLX5_MAN_WIDTH;
+ *man = (uint8_t)ceil(_man);
+ *exp = _exp;
+}
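+
+/*
+ * Worked example for the conversion above (illustrative, assuming
+ * MLX5_MAN_WIDTH == 8): for xbs = 65536 bytes frexp() yields 0.5 * 2^17, so
+ * mantissa = ceil(0.5 * 2^8) = 128 and exponent = 17 - 8 = 9; decoding gives
+ * 128 * 2^9 = 65536, i.e. the burst size is preserved exactly.
+ */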
+
+/**
+ * Fill the prm meter parameter.
+ *
+ * @param[in,out] fmp
+ * Pointer to meter profile to be converted.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_param_fill(struct mlx5_flow_meter_profile *fmp,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm = &fmp->srtcm_prm;
+ uint8_t man, exp;
+
+ if (fmp->profile.alg != RTE_MTR_SRTCM_RFC2697)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE,
+ NULL, "Metering algorithm not supported.");
+ /* cbs = cbs_mantissa * 2^cbs_exponent */
+ mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.cbs,
+ &man, &exp);
+ srtcm->cbs_mantissa = man;
+ srtcm->cbs_exponent = exp;
+ /* Check if cbs mantissa is too large. */
+ if (srtcm->cbs_exponent != exp)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL,
+ "Metering profile parameter cbs is"
+ " invalid.");
+ /* ebs = ebs_mantissa * 2^ebs_exponent */
+ mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.ebs,
+ &man, &exp);
+ srtcm->ebs_mantissa = man;
+ srtcm->ebs_exponent = exp;
+ /* Check if ebs mantissa is too large. */
+ if (srtcm->ebs_exponent != exp)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL,
+ "Metering profile parameter ebs is"
+ " invalid.");
+ /* cir = 8G * cir_mantissa * 1/(2^cir_exponent) Bytes/Sec */
+ mlx5_flow_meter_cir_man_exp_calc(fmp->profile.srtcm_rfc2697.cir,
+ &man, &exp);
+ srtcm->cir_mantissa = man;
+ srtcm->cir_exponent = exp;
+ /* Check if cir mantissa is too large. */
+ if (srtcm->cir_exponent != exp)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL,
+ "Metering profile parameter cir is"
+ " invalid.");
+ return 0;
+}
+
+/**
+ * Callback to get MTR capabilities.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] cap
+ * Pointer to save MTR capabilities.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_mtr_cap_get(struct rte_eth_dev *dev,
+ struct rte_mtr_capabilities *cap,
+ struct rte_mtr_error *error __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hca_qos_attr *qattr = &priv->config.hca_attr.qos;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ memset(cap, 0, sizeof(*cap));
+ cap->n_max = 1 << qattr->log_max_flow_meter;
+ cap->n_shared_max = cap->n_max;
+ cap->identical = 1;
+ cap->shared_identical = 1;
+ cap->shared_n_flows_per_mtr_max = 4 << 20;
+ /* 4M flows can share the same meter. */
+ cap->chaining_n_mtrs_per_flow_max = 1; /* Chaining is not supported. */
+ cap->meter_srtcm_rfc2697_n_max = qattr->srtcm_sup ? cap->n_max : 0;
+ cap->meter_rate_max = 1ULL << 40; /* 1 Tera tokens per sec. */
+ cap->policer_action_drop_supported = 1;
+ cap->stats_mask = RTE_MTR_STATS_N_BYTES_DROPPED |
+ RTE_MTR_STATS_N_PKTS_DROPPED;
+ return 0;
+}
+
+/**
+ * Callback to add MTR profile.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_profile_id
+ * Meter profile id.
+ * @param[in] profile
+ * Pointer to meter profile detail.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_profile_add(struct rte_eth_dev *dev,
+ uint32_t meter_profile_id,
+ struct rte_mtr_meter_profile *profile,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles;
+ struct mlx5_flow_meter_profile *fmp;
+ int ret;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Check input params. */
+ ret = mlx5_flow_meter_profile_validate(dev, meter_profile_id,
+ profile, error);
+ if (ret)
+ return ret;
+ /* Meter profile memory allocation. */
+ fmp = rte_calloc(__func__, 1, sizeof(struct mlx5_flow_meter_profile),
+ RTE_CACHE_LINE_SIZE);
+ if (fmp == NULL)
+ return -rte_mtr_error_set(error, ENOMEM,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Meter profile memory "
+ "alloc failed.");
+ /* Fill profile info. */
+ fmp->meter_profile_id = meter_profile_id;
+ fmp->profile = *profile;
+ /* Fill the flow meter parameters for the PRM. */
+ ret = mlx5_flow_meter_param_fill(fmp, error);
+ if (ret)
+ goto error;
+ /* Add to list. */
+ TAILQ_INSERT_TAIL(fmps, fmp, next);
+ return 0;
+error:
+ rte_free(fmp);
+ return ret;
+}
+
+/**
+ * Callback to delete MTR profile.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_profile_id
+ * Meter profile id.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_profile_delete(struct rte_eth_dev *dev,
+ uint32_t meter_profile_id,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter_profile *fmp;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter profile must exist. */
+ fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id);
+ if (fmp == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ &meter_profile_id,
+ "Meter profile id invalid.");
+ /* Check profile is unused. */
+ if (fmp->ref_cnt)
+ return -rte_mtr_error_set(error, EBUSY,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL, "Meter profile in use.");
+ /* Remove from list. */
+ TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next);
+ rte_free(fmp);
+ return 0;
+}
+
+/**
+ * Convert wrong color setting action to verbose error.
+ *
+ * @param[in] action
+ * Policy color action.
+ *
+ * @return
+ * Verbose meter color error type.
+ */
+static inline enum rte_mtr_error_type
+action2error(enum rte_mtr_policer_action action)
+{
+ switch (action) {
+ case MTR_POLICER_ACTION_COLOR_GREEN:
+ return RTE_MTR_ERROR_TYPE_POLICER_ACTION_GREEN;
+ case MTR_POLICER_ACTION_COLOR_YELLOW:
+ return RTE_MTR_ERROR_TYPE_POLICER_ACTION_YELLOW;
+ case MTR_POLICER_ACTION_COLOR_RED:
+ return RTE_MTR_ERROR_TYPE_POLICER_ACTION_RED;
+ default:
+ break;
+ }
+ return RTE_MTR_ERROR_TYPE_UNSPECIFIED;
+}
+
+/**
+ * Validate the meter creation parameters.
+ *
+ * @param[in] priv
+ * Pointer to mlx5 private data structure.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[in] params
+ * Pointer to rte meter parameters.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_validate(struct mlx5_priv *priv, uint32_t meter_id,
+ struct rte_mtr_params *params,
+ struct rte_mtr_error *error)
+{
+ static enum rte_mtr_policer_action
+ valid_recol_action[RTE_COLORS] = {
+ MTR_POLICER_ACTION_COLOR_GREEN,
+ MTR_POLICER_ACTION_COLOR_YELLOW,
+ MTR_POLICER_ACTION_COLOR_RED };
+ int i;
+
+ /* Meter params must not be NULL. */
+ if (params == NULL)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS,
+ NULL, "Meter object params null.");
+ /* Previous meter color is not supported. */
+ if (params->use_prev_mtr_color)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS,
+ NULL,
+ "Previous meter color "
+ "not supported.");
+ /* Validate policer settings. */
+ for (i = 0; i < RTE_COLORS; i++)
+ if (params->action[i] != valid_recol_action[i] &&
+ params->action[i] != MTR_POLICER_ACTION_DROP)
+ return -rte_mtr_error_set
+ (error, ENOTSUP,
+ action2error(params->action[i]), NULL,
+ "Recolor action not supported.");
+ /* Validate meter id. */
+ if (mlx5_flow_meter_find(priv, meter_id))
+ return -rte_mtr_error_set(error, EEXIST,
+ RTE_MTR_ERROR_TYPE_MTR_ID, NULL,
+ "Meter object already exists.");
+ return 0;
+}
+
+/**
+ * Modify the flow meter action.
+ *
+ * @param[in] priv
+ * Pointer to mlx5 private data structure.
+ * @param[in] fm
+ * Pointer to flow meter to be modified.
+ * @param[in] srtcm
+ * Pointer to meter srtcm description parameter.
+ * @param[in] modify_bits
+ * The bit in srtcm to be updated.
+ * @param[in] active_state
+ * The state to be updated.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+static int
+mlx5_flow_meter_action_modify(struct mlx5_priv *priv,
+ struct mlx5_flow_meter *fm,
+ const struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm,
+ uint64_t modify_bits, uint32_t active_state)
+{
+#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER
+ uint32_t in[MLX5_ST_SZ_DW(flow_meter_parameters)] = { 0 };
+ uint32_t *attr;
+ struct mlx5dv_dr_flow_meter_attr mod_attr = { 0 };
+ int ret;
+
+ /* Fill command parameters. */
+ mod_attr.reg_c_index = priv->mtr_color_reg - REG_C_0;
+ mod_attr.flow_meter_parameter = in;
+ mod_attr.flow_meter_parameter_sz = fm->mfts->fmp_size;
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE)
+ mod_attr.active = !!active_state;
+ else
+ mod_attr.active = 0;
+ attr = in;
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) {
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_exponent, srtcm->cbs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_mantissa, srtcm->cbs_mantissa);
+ }
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) {
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_exponent, srtcm->cir_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_mantissa, srtcm->cir_mantissa);
+ }
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) {
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_exponent, srtcm->ebs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_mantissa, srtcm->ebs_mantissa);
+ }
+ /* Apply modifications to meter only if it was created. */
+ if (fm->mfts->meter_action) {
+ ret = mlx5_glue->dv_modify_flow_action_meter
+ (fm->mfts->meter_action, &mod_attr,
+ rte_cpu_to_be_64(modify_bits));
+ if (ret)
+ return ret;
+ }
+ /* Update the successfully modified meter parameters. */
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE)
+ fm->active_state = !!active_state;
+ attr = fm->mfts->fmp;
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) {
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_exponent, srtcm->cbs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cbs_mantissa, srtcm->cbs_mantissa);
+ }
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) {
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_exponent, srtcm->cir_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, cir_mantissa, srtcm->cir_mantissa);
+ }
+ if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) {
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_exponent, srtcm->ebs_exponent);
+ MLX5_SET(flow_meter_parameters,
+ attr, ebs_mantissa, srtcm->ebs_mantissa);
+ }
+
+ return 0;
+#else
+ (void)priv;
+ (void)fm;
+ (void)srtcm;
+ (void)modify_bits;
+ (void)active_state;
+ return -ENOTSUP;
+#endif
+}
+
+/**
+ * Create meter rules.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[in] params
+ * Pointer to rte meter parameters.
+ * @param[in] shared
+ * Meter shared with other flow or not.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_create(struct rte_eth_dev *dev, uint32_t meter_id,
+ struct rte_mtr_params *params, int shared,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meters *fms = &priv->flow_meters;
+ struct mlx5_flow_meter_profile *fmp;
+ struct mlx5_flow_meter *fm;
+ const struct rte_flow_attr attr = {
+ .ingress = 1,
+ .egress = 1,
+ .transfer = priv->config.dv_esw_en ? 1 : 0,
+ };
+ int ret;
+ unsigned int i;
+ uint32_t idx = 0;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Validate the parameters. */
+ ret = mlx5_flow_meter_validate(priv, meter_id, params, error);
+ if (ret)
+ return ret;
+ /* Meter profile must exist. */
+ fmp = mlx5_flow_meter_profile_find(priv, params->meter_profile_id);
+ if (fmp == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL, "Meter profile id not valid.");
+ /* Allocate the flow meter memory. */
+ fm = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MTR], &idx);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOMEM,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Memory alloc failed for meter.");
+ fm->idx = idx;
+ /* Fill the flow meter parameters. */
+ fm->meter_id = meter_id;
+ fm->profile = fmp;
+ memcpy(fm->action, params->action, sizeof(params->action));
+ fm->stats_mask = params->stats_mask;
+
+ /* Alloc policer counters. */
+ for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) {
+ fm->policer_stats.cnt[i] = mlx5_counter_alloc(dev);
+ if (!fm->policer_stats.cnt[i])
+ goto error;
+ }
+ fm->mfts = mlx5_flow_create_mtr_tbls(dev, fm);
+ if (!fm->mfts)
+ goto error;
+ ret = mlx5_flow_create_policer_rules(dev, fm, &attr);
+ if (ret)
+ goto error;
+ /* Add to the flow meter list. */
+ TAILQ_INSERT_TAIL(fms, fm, next);
+ fm->active_state = 1; /* Config meter starts as active. */
+ fm->shared = !!shared;
+ fm->policer_stats.stats_mask = params->stats_mask;
+ fm->profile->ref_cnt++;
+ return 0;
+error:
+ mlx5_flow_destroy_policer_rules(dev, fm, &attr);
+ mlx5_flow_destroy_mtr_tbls(dev, fm->mfts);
+ /* Free policer counters. */
+ for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++)
+ if (fm->policer_stats.cnt[i])
+ mlx5_counter_free(dev, fm->policer_stats.cnt[i]);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], idx);
+ return -rte_mtr_error_set(error, -ret,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Failed to create devx meter.");
+}
+
+/**
+ * Destroy meter rules.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_destroy(struct rte_eth_dev *dev, uint32_t meter_id,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meters *fms = &priv->flow_meters;
+ struct mlx5_flow_meter_profile *fmp;
+ struct mlx5_flow_meter *fm;
+ const struct rte_flow_attr attr = {
+ .ingress = 1,
+ .egress = 1,
+ .transfer = priv->config.dv_esw_en ? 1 : 0,
+ };
+ unsigned int i;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter object id not valid.");
+ /* Meter object must not have any owner. */
+ if (fm->ref_cnt > 0)
+ return -rte_mtr_error_set(error, EBUSY,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Meter object is being used.");
+ /* Get the meter profile. */
+ fmp = fm->profile;
+ MLX5_ASSERT(fmp);
+ /* Update dependencies. */
+ fmp->ref_cnt--;
+ /* Remove from the flow meter list. */
+ TAILQ_REMOVE(fms, fm, next);
+ /* Free policer counters. */
+ for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++)
+ if (fm->policer_stats.cnt[i])
+ mlx5_counter_free(dev, fm->policer_stats.cnt[i]);
+ /* Free meter flow table */
+ mlx5_flow_destroy_policer_rules(dev, fm, &attr);
+ mlx5_flow_destroy_mtr_tbls(dev, fm->mfts);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx);
+ return 0;
+}
+
+/**
+ * Modify meter state.
+ *
+ * @param[in] priv
+ * Pointer to mlx5 private data structure.
+ * @param[in] fm
+ * Pointer to flow meter.
+ * @param[in] new_state
+ * New state to update.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_modify_state(struct mlx5_priv *priv,
+ struct mlx5_flow_meter *fm,
+ uint32_t new_state,
+ struct rte_mtr_error *error)
+{
+ static const struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm = {
+ .cbs_exponent = 20,
+ .cbs_mantissa = 191,
+ .cir_exponent = 0,
+ .cir_mantissa = 200,
+ .ebs_exponent = 0,
+ .ebs_mantissa = 0,
+ };
+ uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS |
+ MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR;
+ int ret;
+
+ if (new_state == MLX5_FLOW_METER_DISABLE)
+ ret = mlx5_flow_meter_action_modify(priv, fm, &srtcm,
+ modify_bits, 0);
+ else
+ ret = mlx5_flow_meter_action_modify(priv, fm,
+ &fm->profile->srtcm_prm,
+ modify_bits, 0);
+ if (ret)
+ return -rte_mtr_error_set(error, -ret,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS,
+ NULL,
+ new_state ?
+ "Failed to enable meter." :
+ "Failed to disable meter.");
+ return 0;
+}
+
+/**
+ * Callback to enable flow meter.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_enable(struct rte_eth_dev *dev,
+ uint32_t meter_id,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter *fm;
+ int ret;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter not found.");
+ if (fm->active_state == MLX5_FLOW_METER_ENABLE)
+ return 0;
+ ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_ENABLE,
+ error);
+ if (!ret)
+ fm->active_state = MLX5_FLOW_METER_ENABLE;
+ return ret;
+}
+
+/**
+ * Callback to disable flow meter.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_disable(struct rte_eth_dev *dev,
+ uint32_t meter_id,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter *fm;
+ int ret;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter not found.");
+ if (fm->active_state == MLX5_FLOW_METER_DISABLE)
+ return 0;
+ ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_DISABLE,
+ error);
+ if (!ret)
+ fm->active_state = MLX5_FLOW_METER_DISABLE;
+ return ret;
+}
+
+/**
+ * Callback to update meter profile.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[in] meter_profile_id
+ * To be updated meter profile id.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_profile_update(struct rte_eth_dev *dev,
+ uint32_t meter_id,
+ uint32_t meter_profile_id,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter_profile *fmp;
+ struct mlx5_flow_meter_profile *old_fmp;
+ struct mlx5_flow_meter *fm;
+ uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS |
+ MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR;
+ int ret;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter profile must exist. */
+ fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id);
+ if (fmp == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL, "Meter profile not found.");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter not found.");
+ /* MTR object already set to meter profile id. */
+ old_fmp = fm->profile;
+ if (fmp == old_fmp)
+ return 0;
+ /* Update the profile. */
+ fm->profile = fmp;
+ /* Update meter params in HW (if not disabled). */
+ if (fm->active_state == MLX5_FLOW_METER_DISABLE)
+ return 0;
+ ret = mlx5_flow_meter_action_modify(priv, fm, &fm->profile->srtcm_prm,
+ modify_bits, fm->active_state);
+ if (ret) {
+ fm->profile = old_fmp;
+ return -rte_mtr_error_set(error, -ret,
+ RTE_MTR_ERROR_TYPE_MTR_PARAMS,
+ NULL, "Failed to update meter"
+ " parameters in hardware.");
+ }
+ old_fmp->ref_cnt--;
+ fmp->ref_cnt++;
+ return 0;
+}
+
+/**
+ * Callback to update meter stats mask.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[in] stats_mask
+ * To be updated stats_mask.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_stats_update(struct rte_eth_dev *dev,
+ uint32_t meter_id,
+ uint64_t stats_mask,
+ struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter *fm;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter object id not valid.");
+ fm->policer_stats.stats_mask = stats_mask;
+ return 0;
+}
+
+/**
+ * Callback to read meter statistics.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] meter_id
+ * Meter id.
+ * @param[out] stats
+ * Pointer to store the statistics.
+ * @param[out] stats_mask
+ * Pointer to store the stats_mask.
+ * @param[in] clear
+ * Statistic to be cleared after read or not.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_meter_stats_read(struct rte_eth_dev *dev,
+ uint32_t meter_id,
+ struct rte_mtr_stats *stats,
+ uint64_t *stats_mask,
+ int clear,
+ struct rte_mtr_error *error)
+{
+ static uint64_t meter2mask[RTE_MTR_DROPPED + 1] = {
+ RTE_MTR_STATS_N_PKTS_GREEN | RTE_MTR_STATS_N_BYTES_GREEN,
+ RTE_MTR_STATS_N_PKTS_YELLOW | RTE_MTR_STATS_N_BYTES_YELLOW,
+ RTE_MTR_STATS_N_PKTS_RED | RTE_MTR_STATS_N_BYTES_RED,
+ RTE_MTR_STATS_N_PKTS_DROPPED | RTE_MTR_STATS_N_BYTES_DROPPED
+ };
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meter *fm;
+ struct mlx5_flow_policer_stats *ps;
+ uint64_t pkts_dropped = 0;
+ uint64_t bytes_dropped = 0;
+ uint64_t pkts;
+ uint64_t bytes;
+ int i;
+ int ret = 0;
+
+ if (!priv->mtr_en)
+ return -rte_mtr_error_set(error, ENOTSUP,
+ RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter is not supported");
+ /* Meter object must exist. */
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL)
+ return -rte_mtr_error_set(error, ENOENT,
+ RTE_MTR_ERROR_TYPE_MTR_ID,
+ NULL, "Meter object id not valid.");
+ ps = &fm->policer_stats;
+ *stats_mask = ps->stats_mask;
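+ /*
+  * The loop below relies on the rte_mtr stats bit layout: the packet bit
+  * of color i is (1 << i) and the matching byte bit is
+  * (1 << (RTE_MTR_DROPPED + 1 + i)).
+  */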
+ for (i = 0; i < RTE_MTR_DROPPED; i++) {
+ if (*stats_mask & meter2mask[i]) {
+ ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts,
+ &bytes);
+ if (ret)
+ goto error;
+ if (fm->action[i] == MTR_POLICER_ACTION_DROP) {
+ pkts_dropped += pkts;
+ bytes_dropped += bytes;
+ }
+ /* Set the packet statistics if requested. */
+ if ((1 << i) & (*stats_mask & meter2mask[i]))
+ stats->n_pkts[i] = pkts;
+ /* Set the byte statistics if requested. */
+ if ((1 << (RTE_MTR_DROPPED + 1 + i)) &
+ (*stats_mask & meter2mask[i]))
+ stats->n_bytes[i] = bytes;
+ }
+ }
+ /* Dropped packets/bytes are treated differently. */
+ if (*stats_mask & meter2mask[i]) {
+ ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts,
+ &bytes);
+ if (ret)
+ goto error;
+ pkts += pkts_dropped;
+ bytes += bytes_dropped;
+ /* Set the packet statistics if requested. */
+ if ((*stats_mask & meter2mask[i]) &
+ RTE_MTR_STATS_N_PKTS_DROPPED)
+ stats->n_pkts_dropped = pkts;
+ /* Set the byte statistics if requested. */
+ if ((*stats_mask & meter2mask[i]) &
+ RTE_MTR_STATS_N_BYTES_DROPPED)
+ stats->n_bytes_dropped = bytes;
+ }
+ return 0;
+error:
+ return -rte_mtr_error_set(error, ret, RTE_MTR_ERROR_TYPE_STATS, NULL,
+ "Failed to read policer counters.");
+}
+
+static const struct rte_mtr_ops mlx5_flow_mtr_ops = {
+ .capabilities_get = mlx5_flow_mtr_cap_get,
+ .meter_profile_add = mlx5_flow_meter_profile_add,
+ .meter_profile_delete = mlx5_flow_meter_profile_delete,
+ .create = mlx5_flow_meter_create,
+ .destroy = mlx5_flow_meter_destroy,
+ .meter_enable = mlx5_flow_meter_enable,
+ .meter_disable = mlx5_flow_meter_disable,
+ .meter_profile_update = mlx5_flow_meter_profile_update,
+ .meter_dscp_table_update = NULL,
+ .policer_actions_update = NULL,
+ .stats_update = mlx5_flow_meter_stats_update,
+ .stats_read = mlx5_flow_meter_stats_read,
+};
+
+/**
+ * Get meter operations.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param arg
+ * Pointer to set the mtr operations.
+ *
+ * @return
+ * Always 0.
+ */
+int
+mlx5_flow_meter_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg)
+{
+ *(const struct rte_mtr_ops **)arg = &mlx5_flow_mtr_ops;
+ return 0;
+}
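+
+/*
+ * Illustrative usage sketch (not part of the driver): with the ops table
+ * above registered, an application drives metering through the generic
+ * rte_mtr API. The ids and profile/params values below are hypothetical;
+ * "port_id" is chosen by the application.
+ *
+ *	struct rte_mtr_error mtr_error;
+ *	struct rte_mtr_meter_profile prof = {
+ *		.alg = RTE_MTR_SRTCM_RFC2697,
+ *		.srtcm_rfc2697 = { .cir = 125000000, .cbs = 65536, .ebs = 0 },
+ *	};
+ *	struct rte_mtr_params params = {
+ *		.meter_profile_id = 1,
+ *		.action = { MTR_POLICER_ACTION_COLOR_GREEN,
+ *			    MTR_POLICER_ACTION_COLOR_YELLOW,
+ *			    MTR_POLICER_ACTION_DROP },
+ *		.stats_mask = RTE_MTR_STATS_N_PKTS_DROPPED,
+ *	};
+ *
+ *	rte_mtr_meter_profile_add(port_id, 1, &prof, &mtr_error);
+ *	rte_mtr_create(port_id, 1, &params, 1, &mtr_error);
+ */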
+
+/**
+ * Find meter by id.
+ *
+ * @param priv
+ * Pointer to mlx5_priv.
+ * @param meter_id
+ * Meter id.
+ *
+ * @return
+ * Pointer to the profile found on success, NULL otherwise.
+ */
+struct mlx5_flow_meter *
+mlx5_flow_meter_find(struct mlx5_priv *priv, uint32_t meter_id)
+{
+ struct mlx5_flow_meters *fms = &priv->flow_meters;
+ struct mlx5_flow_meter *fm;
+
+ TAILQ_FOREACH(fm, fms, next)
+ if (meter_id == fm->meter_id)
+ return fm;
+ return NULL;
+}
+
+/**
+ * Attach meter to flow.
+ * Unidirectional meter creation can only be done
+ * when the flow direction is known, i.e. when calling meter_attach.
+ *
+ * @param [in] priv
+ * Pointer to mlx5 private data.
+ * @param [in] meter_id
+ * Flow meter id.
+ * @param [in] attr
+ * Pointer to flow attributes.
+ * @param [out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * The flow meter pointer on success, NULL otherwise.
+ */
+struct mlx5_flow_meter *
+mlx5_flow_meter_attach(struct mlx5_priv *priv, uint32_t meter_id,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ struct mlx5_flow_meter *fm;
+
+ fm = mlx5_flow_meter_find(priv, meter_id);
+ if (fm == NULL) {
+ rte_flow_error_set(error, ENOENT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter object id not valid");
+ goto error;
+ }
+ if (!fm->shared && fm->ref_cnt) {
+ DRV_LOG(ERR, "Cannot share a non-shared meter.");
+ rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Meter can't be shared");
+ goto error;
+ }
+ if (!fm->ref_cnt++) {
+ MLX5_ASSERT(!fm->mfts->meter_action);
+ fm->ingress = attr->ingress;
+ fm->egress = attr->egress;
+ fm->transfer = attr->transfer;
+ /* This also creates the meter object. */
+ fm->mfts->meter_action = mlx5_flow_meter_action_create(priv,
+ fm);
+ if (!fm->mfts->meter_action)
+ goto error_detach;
+ } else {
+ MLX5_ASSERT(fm->mfts->meter_action);
+ if (attr->transfer != fm->transfer ||
+ attr->ingress != fm->ingress ||
+ attr->egress != fm->egress) {
+ DRV_LOG(ERR, "meter I/O attributes do not "
+ "match flow I/O attributes.");
+ goto error_detach;
+ }
+ }
+ return fm;
+error_detach:
+ mlx5_flow_meter_detach(fm);
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ fm->mfts->meter_action ? "Meter attributes do not match" :
+ "Meter action create failed");
+error:
+ return NULL;
+}
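+/*
+ * Note: attach and detach are reference counted; the first attach creates
+ * the meter action and the last detach destroys it.
+ */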
+
+/**
+ * Detach meter from flow.
+ *
+ * @param [in] fm
+ * Pointer to flow meter.
+ */
+void
+mlx5_flow_meter_detach(struct mlx5_flow_meter *fm)
+{
+ MLX5_ASSERT(fm->ref_cnt);
+ if (--fm->ref_cnt)
+ return;
+ if (fm->mfts->meter_action)
+ mlx5_glue->destroy_flow_action(fm->mfts->meter_action);
+ fm->mfts->meter_action = NULL;
+ fm->ingress = 0;
+ fm->egress = 0;
+ fm->transfer = 0;
+}
+
+/**
+ * Flush meter configuration.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] error
+ * Pointer to rte meter error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_meter_flush(struct rte_eth_dev *dev, struct rte_mtr_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_meters *fms = &priv->flow_meters;
+ struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles;
+ struct mlx5_flow_meter_profile *fmp;
+ struct mlx5_flow_meter *fm;
+ const struct rte_flow_attr attr = {
+ .ingress = 1,
+ .egress = 1,
+ .transfer = priv->config.dv_esw_en ? 1 : 0,
+ };
+ void *tmp;
+ uint32_t i;
+
+ TAILQ_FOREACH_SAFE(fm, fms, next, tmp) {
+ /* Meter object must not have any owner. */
+ MLX5_ASSERT(!fm->ref_cnt);
+ /* Get meter profile. */
+ fmp = fm->profile;
+ if (fmp == NULL)
+ return -rte_mtr_error_set(error, EINVAL,
+ RTE_MTR_ERROR_TYPE_METER_PROFILE_ID,
+ NULL, "MTR object meter profile invalid.");
+ /* Update dependencies. */
+ fmp->ref_cnt--;
+ /* Remove from list. */
+ TAILQ_REMOVE(fms, fm, next);
+ /* Free policer counters. */
+ for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++)
+ if (fm->policer_stats.cnt[i])
+ mlx5_counter_free(dev,
+ fm->policer_stats.cnt[i]);
+ /* Free meter flow table. */
+ mlx5_flow_destroy_policer_rules(dev, fm, &attr);
+ mlx5_flow_destroy_mtr_tbls(dev, fm->mfts);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx);
+ }
+ TAILQ_FOREACH_SAFE(fmp, fmps, next, tmp) {
+ /* Check unused. */
+ MLX5_ASSERT(!fmp->ref_cnt);
+ /* Remove from list. */
+ TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next);
+ rte_free(fmp);
+ }
+ return 0;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c
new file mode 100644
index 000000000..c266e5683
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -0,0 +1,1987 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <netinet/in.h>
+#include <sys/queue.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+#include <rte_malloc.h>
+#include <rte_ip.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_flow.h"
+#include "mlx5_rxtx.h"
+
+#define VERBS_SPEC_INNER(item_flags) \
+ (!!((item_flags) & MLX5_FLOW_LAYER_TUNNEL) ? IBV_FLOW_SPEC_INNER : 0)
+
+/**
+ * Get Verbs flow counter by index.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] idx
+ * mlx5 flow counter index in the container.
+ * @param[out] ppool
+ * mlx5 flow counter pool in the container.
+ *
+ * @return
+ * A pointer to the counter, NULL otherwise.
+ */
+static struct mlx5_flow_counter *
+flow_verbs_counter_get_by_idx(struct rte_eth_dev *dev,
+ uint32_t idx,
+ struct mlx5_flow_counter_pool **ppool)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+ struct mlx5_flow_counter_pool *pool;
+
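+ /* Counter indices are 1-based; 0 means "no counter", so adjust first. */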
+ idx--;
+ pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+ MLX5_ASSERT(pool);
+ if (ppool)
+ *ppool = pool;
+ return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL);
+}
+
+/**
+ * Create Verbs flow counter with Verbs library.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] counter
+ * mlx5 flow counter object, contains the counter id,
+ * handle of created Verbs flow counter is returned
+ * in cs field (if counters are supported).
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_verbs_counter_create(struct rte_eth_dev *dev,
+ struct mlx5_flow_counter_ext *counter)
+{
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct ibv_counter_set_init_attr init = {
+ .counter_set_id = counter->id};
+
+ counter->cs = mlx5_glue->create_counter_set(ctx, &init);
+ if (!counter->cs) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ return 0;
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct ibv_counters_init_attr init = {0};
+ struct ibv_counter_attach_attr attach;
+ int ret;
+
+ memset(&attach, 0, sizeof(attach));
+ counter->cs = mlx5_glue->create_counters(ctx, &init);
+ if (!counter->cs) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ attach.counter_desc = IBV_COUNTER_PACKETS;
+ attach.index = 0;
+ ret = mlx5_glue->attach_counters(counter->cs, &attach, NULL);
+ if (!ret) {
+ attach.counter_desc = IBV_COUNTER_BYTES;
+ attach.index = 1;
+ ret = mlx5_glue->attach_counters
+ (counter->cs, &attach, NULL);
+ }
+ if (ret) {
+ claim_zero(mlx5_glue->destroy_counters(counter->cs));
+ counter->cs = NULL;
+ rte_errno = ret;
+ return -ret;
+ }
+ return 0;
+#else
+ (void)dev;
+ (void)counter;
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+#endif
+}
+
+/**
+ * Get a flow counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] shared
+ * Indicate if this counter is shared with other flows.
+ * @param[in] id
+ * Counter identifier.
+ *
+ * @return
+ * Index to the counter, 0 otherwise and rte_errno is set.
+ */
+static uint32_t
+flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+ struct mlx5_flow_counter_pool *pool = NULL;
+ struct mlx5_flow_counter_ext *cnt_ext = NULL;
+ struct mlx5_flow_counter *cnt = NULL;
+ uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
+ uint32_t pool_idx;
+ uint32_t i;
+ int ret;
+
+ if (shared) {
+ for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
+ pool = cont->pools[pool_idx];
+ for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+ cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
+ if (cnt_ext->shared && cnt_ext->id == id) {
+ cnt_ext->ref_cnt++;
+ return MLX5_MAKE_CNT_IDX(pool_idx, i);
+ }
+ }
+ }
+ }
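+ /* Look for a free counter in the already allocated pools. */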
+ for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
+ pool = cont->pools[pool_idx];
+ if (!pool)
+ continue;
+ cnt = TAILQ_FIRST(&pool->counters);
+ if (cnt)
+ break;
+ }
+ if (!cnt) {
+ struct mlx5_flow_counter_pool **pools;
+ uint32_t size;
+
+ if (n_valid == cont->n) {
+ /* Resize the container pool array. */
+ size = sizeof(struct mlx5_flow_counter_pool *) *
+ (n_valid + MLX5_CNT_CONTAINER_RESIZE);
+ pools = rte_zmalloc(__func__, size, 0);
+ if (!pools)
+ return 0;
+ if (n_valid) {
+ memcpy(pools, cont->pools,
+ sizeof(struct mlx5_flow_counter_pool *) *
+ n_valid);
+ rte_free(cont->pools);
+ }
+ cont->pools = pools;
+ cont->n += MLX5_CNT_CONTAINER_RESIZE;
+ }
+ /* Allocate memory for a new pool. */
+ size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) *
+ MLX5_COUNTERS_PER_POOL;
+ pool = rte_calloc(__func__, 1, size, 0);
+ if (!pool)
+ return 0;
+ pool->type |= CNT_POOL_TYPE_EXT;
+ for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
+ cnt = MLX5_POOL_GET_CNT(pool, i);
+ TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+ }
+ cnt = MLX5_POOL_GET_CNT(pool, 0);
+ cont->pools[n_valid] = pool;
+ pool_idx = n_valid;
+ rte_atomic16_add(&cont->n_valid, 1);
+ TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+ }
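+ /* Initialize the selected free counter before creating it with Verbs. */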
+ i = MLX5_CNT_ARRAY_IDX(pool, cnt);
+ cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
+ cnt_ext->id = id;
+ cnt_ext->shared = shared;
+ cnt_ext->ref_cnt = 1;
+ cnt->hits = 0;
+ cnt->bytes = 0;
+ /* Create counter with Verbs. */
+ ret = flow_verbs_counter_create(dev, cnt_ext);
+ if (!ret) {
+ TAILQ_REMOVE(&pool->counters, cnt, next);
+ return MLX5_MAKE_CNT_IDX(pool_idx, i);
+ }
+ /* Some error occurred in Verbs library. */
+ rte_errno = -ret;
+ return 0;
+}
+
+/**
+ * Release a flow counter.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] counter
+ * Index to the counter handler.
+ */
+static void
+flow_verbs_counter_release(struct rte_eth_dev *dev, uint32_t counter)
+{
+ struct mlx5_flow_counter_pool *pool;
+ struct mlx5_flow_counter *cnt;
+ struct mlx5_flow_counter_ext *cnt_ext;
+
+ cnt = flow_verbs_counter_get_by_idx(dev, counter,
+ &pool);
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
+ if (--cnt_ext->ref_cnt == 0) {
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+ claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
+ cnt_ext->cs = NULL;
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
+ cnt_ext->cs = NULL;
+#endif
+ TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
+ }
+}
+
+/**
+ * Query a flow counter via Verbs library call.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+static int
+flow_verbs_counter_query(struct rte_eth_dev *dev __rte_unused,
+ struct rte_flow *flow, void *data,
+ struct rte_flow_error *error)
+{
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \
+ defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ if (flow->counter) {
+ struct mlx5_flow_counter_pool *pool;
+ struct mlx5_flow_counter *cnt = flow_verbs_counter_get_by_idx
+ (dev, flow->counter, &pool);
+ struct mlx5_flow_counter_ext *cnt_ext = MLX5_CNT_TO_CNT_EXT
+ (pool, cnt);
+ struct rte_flow_query_count *qc = data;
+ uint64_t counters[2] = {0, 0};
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+ struct ibv_query_counter_set_attr query_cs_attr = {
+ .cs = cnt_ext->cs,
+ .query_flags = IBV_COUNTER_SET_FORCE_UPDATE,
+ };
+ struct ibv_counter_set_data query_out = {
+ .out = counters,
+ .outlen = 2 * sizeof(uint64_t),
+ };
+ int err = mlx5_glue->query_counter_set(&query_cs_attr,
+ &query_out);
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ int err = mlx5_glue->query_counters
+ (cnt_ext->cs, counters,
+ RTE_DIM(counters),
+ IBV_READ_COUNTERS_ATTR_PREFER_CACHED);
+#endif
+ if (err)
+ return rte_flow_error_set
+ (error, err,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot read counter");
+ qc->hits_set = 1;
+ qc->bytes_set = 1;
+ qc->hits = counters[0] - cnt->hits;
+ qc->bytes = counters[1] - cnt->bytes;
+ if (qc->reset) {
+ cnt->hits = counters[0];
+ cnt->bytes = counters[1];
+ }
+ return 0;
+ }
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "flow does not have counter");
+#else
+ (void)flow;
+ (void)data;
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "counters are not available");
+#endif
+}
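+/*
+ * Usage sketch (illustrative only; port_id and flow are assumed to come from
+ * the application): the query above is reached through rte_flow_query() with
+ * a COUNT action, e.g.:
+ *
+ * struct rte_flow_query_count qc = { .reset = 1 };
+ * const struct rte_flow_action action = {
+ * .type = RTE_FLOW_ACTION_TYPE_COUNT,
+ * };
+ * struct rte_flow_error err;
+ *
+ * if (rte_flow_query(port_id, flow, &action, &qc, &err) == 0)
+ * printf("hits: %" PRIu64 "\n", qc.hits);
+ */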
+
+/**
+ * Add a verbs item specification into @p verbs.
+ *
+ * @param[out] verbs
+ * Pointer to verbs structure.
+ * @param[in] src
+ * Create specification.
+ * @param[in] size
+ * Size in bytes of the specification to copy.
+ */
+static void
+flow_verbs_spec_add(struct mlx5_flow_verbs_workspace *verbs,
+ void *src, unsigned int size)
+{
+ void *dst;
+
+ if (!verbs)
+ return;
+ MLX5_ASSERT(verbs->specs);
+ dst = (void *)(verbs->specs + verbs->size);
+ memcpy(dst, src, size);
+ ++verbs->attr.num_of_specs;
+ verbs->size += size;
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_eth(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags)
+{
+ const struct rte_flow_item_eth *spec = item->spec;
+ const struct rte_flow_item_eth *mask = item->mask;
+ const unsigned int size = sizeof(struct ibv_flow_spec_eth);
+ struct ibv_flow_spec_eth eth = {
+ .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_eth_mask;
+ if (spec) {
+ unsigned int i;
+
+ memcpy(&eth.val.dst_mac, spec->dst.addr_bytes,
+ RTE_ETHER_ADDR_LEN);
+ memcpy(&eth.val.src_mac, spec->src.addr_bytes,
+ RTE_ETHER_ADDR_LEN);
+ eth.val.ether_type = spec->type;
+ memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes,
+ RTE_ETHER_ADDR_LEN);
+ memcpy(&eth.mask.src_mac, mask->src.addr_bytes,
+ RTE_ETHER_ADDR_LEN);
+ eth.mask.ether_type = mask->type;
+ /* Remove unwanted bits from values. */
+ for (i = 0; i < RTE_ETHER_ADDR_LEN; ++i) {
+ eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
+ eth.val.src_mac[i] &= eth.mask.src_mac[i];
+ }
+ eth.val.ether_type &= eth.mask.ether_type;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
+}
+
+/**
+ * Update the VLAN tag in the Verbs Ethernet specification.
+ * This function assumes that the input is valid and there is space to add
+ * the requested item.
+ *
+ * @param[in, out] attr
+ * Pointer to Verbs attributes structure.
+ * @param[in] eth
+ * Verbs structure containing the VLAN information to copy.
+ */
+static void
+flow_verbs_item_vlan_update(struct ibv_flow_attr *attr,
+ struct ibv_flow_spec_eth *eth)
+{
+ unsigned int i;
+ const enum ibv_flow_spec_type search = eth->type;
+ struct ibv_spec_header *hdr = (struct ibv_spec_header *)
+ ((uint8_t *)attr + sizeof(struct ibv_flow_attr));
+
+ for (i = 0; i != attr->num_of_specs; ++i) {
+ if (hdr->type == search) {
+ struct ibv_flow_spec_eth *e =
+ (struct ibv_flow_spec_eth *)hdr;
+
+ e->val.vlan_tag = eth->val.vlan_tag;
+ e->mask.vlan_tag = eth->mask.vlan_tag;
+ e->val.ether_type = eth->val.ether_type;
+ e->mask.ether_type = eth->mask.ether_type;
+ break;
+ }
+ hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size);
+ }
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_vlan(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags)
+{
+ const struct rte_flow_item_vlan *spec = item->spec;
+ const struct rte_flow_item_vlan *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_eth);
+ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ struct ibv_flow_spec_eth eth = {
+ .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+ const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+
+ if (!mask)
+ mask = &rte_flow_item_vlan_mask;
+ if (spec) {
+ eth.val.vlan_tag = spec->tci;
+ eth.mask.vlan_tag = mask->tci;
+ eth.val.vlan_tag &= eth.mask.vlan_tag;
+ eth.val.ether_type = spec->inner_type;
+ eth.mask.ether_type = mask->inner_type;
+ eth.val.ether_type &= eth.mask.ether_type;
+ }
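+ /* Merge into an existing L2 spec if present, otherwise add a new one. */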
+ if (!(item_flags & l2m))
+ flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
+ else
+ flow_verbs_item_vlan_update(&dev_flow->verbs.attr, &eth);
+ if (!tunnel)
+ dev_flow->handle->vf_vlan.tag =
+ rte_be_to_cpu_16(spec->tci) & 0x0fff;
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_ipv4(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags)
+{
+ const struct rte_flow_item_ipv4 *spec = item->spec;
+ const struct rte_flow_item_ipv4 *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext);
+ struct ibv_flow_spec_ipv4_ext ipv4 = {
+ .type = IBV_FLOW_SPEC_IPV4_EXT | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_ipv4_mask;
+ if (spec) {
+ ipv4.val = (struct ibv_flow_ipv4_ext_filter){
+ .src_ip = spec->hdr.src_addr,
+ .dst_ip = spec->hdr.dst_addr,
+ .proto = spec->hdr.next_proto_id,
+ .tos = spec->hdr.type_of_service,
+ };
+ ipv4.mask = (struct ibv_flow_ipv4_ext_filter){
+ .src_ip = mask->hdr.src_addr,
+ .dst_ip = mask->hdr.dst_addr,
+ .proto = mask->hdr.next_proto_id,
+ .tos = mask->hdr.type_of_service,
+ };
+ /* Remove unwanted bits from values. */
+ ipv4.val.src_ip &= ipv4.mask.src_ip;
+ ipv4.val.dst_ip &= ipv4.mask.dst_ip;
+ ipv4.val.proto &= ipv4.mask.proto;
+ ipv4.val.tos &= ipv4.mask.tos;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &ipv4, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags)
+{
+ const struct rte_flow_item_ipv6 *spec = item->spec;
+ const struct rte_flow_item_ipv6 *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_ipv6);
+ struct ibv_flow_spec_ipv6 ipv6 = {
+ .type = IBV_FLOW_SPEC_IPV6 | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_ipv6_mask;
+ if (spec) {
+ unsigned int i;
+ uint32_t vtc_flow_val;
+ uint32_t vtc_flow_mask;
+
+ memcpy(&ipv6.val.src_ip, spec->hdr.src_addr,
+ RTE_DIM(ipv6.val.src_ip));
+ memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr,
+ RTE_DIM(ipv6.val.dst_ip));
+ memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr,
+ RTE_DIM(ipv6.mask.src_ip));
+ memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr,
+ RTE_DIM(ipv6.mask.dst_ip));
+ vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow);
+ vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow);
+ ipv6.val.flow_label =
+ rte_cpu_to_be_32((vtc_flow_val & RTE_IPV6_HDR_FL_MASK) >>
+ RTE_IPV6_HDR_FL_SHIFT);
+ ipv6.val.traffic_class = (vtc_flow_val & RTE_IPV6_HDR_TC_MASK) >>
+ RTE_IPV6_HDR_TC_SHIFT;
+ ipv6.val.next_hdr = spec->hdr.proto;
+ ipv6.mask.flow_label =
+ rte_cpu_to_be_32((vtc_flow_mask & RTE_IPV6_HDR_FL_MASK) >>
+ RTE_IPV6_HDR_FL_SHIFT);
+ ipv6.mask.traffic_class = (vtc_flow_mask & RTE_IPV6_HDR_TC_MASK) >>
+ RTE_IPV6_HDR_TC_SHIFT;
+ ipv6.mask.next_hdr = mask->hdr.proto;
+ /* Remove unwanted bits from values. */
+ for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) {
+ ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i];
+ ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i];
+ }
+ ipv6.val.flow_label &= ipv6.mask.flow_label;
+ ipv6.val.traffic_class &= ipv6.mask.traffic_class;
+ ipv6.val.next_hdr &= ipv6.mask.next_hdr;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &ipv6, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_tcp(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags __rte_unused)
+{
+ const struct rte_flow_item_tcp *spec = item->spec;
+ const struct rte_flow_item_tcp *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp);
+ struct ibv_flow_spec_tcp_udp tcp = {
+ .type = IBV_FLOW_SPEC_TCP | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_tcp_mask;
+ if (spec) {
+ tcp.val.dst_port = spec->hdr.dst_port;
+ tcp.val.src_port = spec->hdr.src_port;
+ tcp.mask.dst_port = mask->hdr.dst_port;
+ tcp.mask.src_port = mask->hdr.src_port;
+ /* Remove unwanted bits from values. */
+ tcp.val.src_port &= tcp.mask.src_port;
+ tcp.val.dst_port &= tcp.mask.dst_port;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &tcp, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_udp(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags __rte_unused)
+{
+ const struct rte_flow_item_udp *spec = item->spec;
+ const struct rte_flow_item_udp *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp);
+ struct ibv_flow_spec_tcp_udp udp = {
+ .type = IBV_FLOW_SPEC_UDP | VERBS_SPEC_INNER(item_flags),
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_udp_mask;
+ if (spec) {
+ udp.val.dst_port = spec->hdr.dst_port;
+ udp.val.src_port = spec->hdr.src_port;
+ udp.mask.dst_port = mask->hdr.dst_port;
+ udp.mask.src_port = mask->hdr.src_port;
+ /* Remove unwanted bits from values. */
+ udp.val.src_port &= udp.mask.src_port;
+ udp.val.dst_port &= udp.mask.dst_port;
+ }
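+ /* Peek at the next non-VOID item to default the tunnel UDP port. */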
+ item++;
+ while (item->type == RTE_FLOW_ITEM_TYPE_VOID)
+ item++;
+ if (!(udp.val.dst_port & udp.mask.dst_port)) {
+ switch ((item)->type) {
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN);
+ udp.mask.dst_port = 0xffff;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN_GPE);
+ udp.mask.dst_port = 0xffff;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ udp.val.dst_port = htons(MLX5_UDP_PORT_MPLS);
+ udp.mask.dst_port = 0xffff;
+ break;
+ default:
+ break;
+ }
+ }
+
+ flow_verbs_spec_add(&dev_flow->verbs, &udp, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_vxlan(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags __rte_unused)
+{
+ const struct rte_flow_item_vxlan *spec = item->spec;
+ const struct rte_flow_item_vxlan *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
+ struct ibv_flow_spec_tunnel vxlan = {
+ .type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
+ .size = size,
+ };
+ union vni {
+ uint32_t vlan_id;
+ uint8_t vni[4];
+ } id = { .vlan_id = 0, };
+
+ if (!mask)
+ mask = &rte_flow_item_vxlan_mask;
+ if (spec) {
+ memcpy(&id.vni[1], spec->vni, 3);
+ vxlan.val.tunnel_id = id.vlan_id;
+ memcpy(&id.vni[1], mask->vni, 3);
+ vxlan.mask.tunnel_id = id.vlan_id;
+ /* Remove unwanted bits from values. */
+ vxlan.val.tunnel_id &= vxlan.mask.tunnel_id;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &vxlan, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_vxlan_gpe(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item,
+ uint64_t item_flags __rte_unused)
+{
+ const struct rte_flow_item_vxlan_gpe *spec = item->spec;
+ const struct rte_flow_item_vxlan_gpe *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
+ struct ibv_flow_spec_tunnel vxlan_gpe = {
+ .type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
+ .size = size,
+ };
+ union vni {
+ uint32_t vlan_id;
+ uint8_t vni[4];
+ } id = { .vlan_id = 0, };
+
+ if (!mask)
+ mask = &rte_flow_item_vxlan_gpe_mask;
+ if (spec) {
+ memcpy(&id.vni[1], spec->vni, 3);
+ vxlan_gpe.val.tunnel_id = id.vlan_id;
+ memcpy(&id.vni[1], mask->vni, 3);
+ vxlan_gpe.mask.tunnel_id = id.vlan_id;
+ /* Remove unwanted bits from values. */
+ vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &vxlan_gpe, size);
+}
+
+/**
+ * Update the protocol in Verbs IPv4/IPv6 spec.
+ *
+ * @param[in, out] attr
+ * Pointer to Verbs attributes structure.
+ * @param[in] search
+ * Specification type to search in order to update the IP protocol.
+ * @param[in] protocol
+ * Protocol value to set if none is present in the specification.
+ */
+static void
+flow_verbs_item_gre_ip_protocol_update(struct ibv_flow_attr *attr,
+ enum ibv_flow_spec_type search,
+ uint8_t protocol)
+{
+ unsigned int i;
+ struct ibv_spec_header *hdr = (struct ibv_spec_header *)
+ ((uint8_t *)attr + sizeof(struct ibv_flow_attr));
+
+ if (!attr)
+ return;
+ for (i = 0; i != attr->num_of_specs; ++i) {
+ if (hdr->type == search) {
+ union {
+ struct ibv_flow_spec_ipv4_ext *ipv4;
+ struct ibv_flow_spec_ipv6 *ipv6;
+ } ip;
+
+ switch (search) {
+ case IBV_FLOW_SPEC_IPV4_EXT:
+ ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr;
+ if (!ip.ipv4->val.proto) {
+ ip.ipv4->val.proto = protocol;
+ ip.ipv4->mask.proto = 0xff;
+ }
+ break;
+ case IBV_FLOW_SPEC_IPV6:
+ ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr;
+ if (!ip.ipv6->val.next_hdr) {
+ ip.ipv6->val.next_hdr = protocol;
+ ip.ipv6->mask.next_hdr = 0xff;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size);
+ }
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_gre(struct mlx5_flow *dev_flow,
+ const struct rte_flow_item *item __rte_unused,
+ uint64_t item_flags)
+{
+ struct mlx5_flow_verbs_workspace *verbs = &dev_flow->verbs;
+#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
+ struct ibv_flow_spec_tunnel tunnel = {
+ .type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
+ .size = size,
+ };
+#else
+ const struct rte_flow_item_gre *spec = item->spec;
+ const struct rte_flow_item_gre *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_gre);
+ struct ibv_flow_spec_gre tunnel = {
+ .type = IBV_FLOW_SPEC_GRE,
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_gre_mask;
+ if (spec) {
+ tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver;
+ tunnel.val.protocol = spec->protocol;
+ tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver;
+ tunnel.mask.protocol = mask->protocol;
+ /* Remove unwanted bits from values. */
+ tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver;
+ tunnel.val.protocol &= tunnel.mask.protocol;
+ tunnel.val.key &= tunnel.mask.key;
+ }
+#endif
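+ /* Ensure the preceding L3 spec matches IP protocol GRE if not set yet. */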
+ if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4)
+ flow_verbs_item_gre_ip_protocol_update(&verbs->attr,
+ IBV_FLOW_SPEC_IPV4_EXT,
+ IPPROTO_GRE);
+ else
+ flow_verbs_item_gre_ip_protocol_update(&verbs->attr,
+ IBV_FLOW_SPEC_IPV6,
+ IPPROTO_GRE);
+ flow_verbs_spec_add(verbs, &tunnel, size);
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to dev_flow structure.
+ * @param[in] item
+ * Item specification.
+ * @param[in] item_flags
+ * Parsed item flags.
+ */
+static void
+flow_verbs_translate_item_mpls(struct mlx5_flow *dev_flow __rte_unused,
+ const struct rte_flow_item *item __rte_unused,
+ uint64_t item_flags __rte_unused)
+{
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ const struct rte_flow_item_mpls *spec = item->spec;
+ const struct rte_flow_item_mpls *mask = item->mask;
+ unsigned int size = sizeof(struct ibv_flow_spec_mpls);
+ struct ibv_flow_spec_mpls mpls = {
+ .type = IBV_FLOW_SPEC_MPLS,
+ .size = size,
+ };
+
+ if (!mask)
+ mask = &rte_flow_item_mpls_mask;
+ if (spec) {
+ memcpy(&mpls.val.label, spec, sizeof(mpls.val.label));
+ memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label));
+ /* Remove unwanted bits from values. */
+ mpls.val.label &= mpls.mask.label;
+ }
+ flow_verbs_spec_add(&dev_flow->verbs, &mpls, size);
+#endif
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in] dev_flow
+ * Pointer to mlx5_flow.
+ * @param[in] action
+ * Action configuration.
+ */
+static void
+flow_verbs_translate_action_drop
+ (struct mlx5_flow *dev_flow,
+ const struct rte_flow_action *action __rte_unused)
+{
+ unsigned int size = sizeof(struct ibv_flow_spec_action_drop);
+ struct ibv_flow_spec_action_drop drop = {
+ .type = IBV_FLOW_SPEC_ACTION_DROP,
+ .size = size,
+ };
+
+ flow_verbs_spec_add(&dev_flow->verbs, &drop, size);
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in] rss_desc
+ * Pointer to mlx5_flow_rss_desc.
+ * @param[in] action
+ * Action configuration.
+ */
+static void
+flow_verbs_translate_action_queue(struct mlx5_flow_rss_desc *rss_desc,
+ const struct rte_flow_action *action)
+{
+ const struct rte_flow_action_queue *queue = action->conf;
+
+ rss_desc->queue[0] = queue->index;
+ rss_desc->queue_num = 1;
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in] rss_desc
+ * Pointer to mlx5_flow_rss_desc.
+ * @param[in] action
+ * Action configuration.
+ */
+static void
+flow_verbs_translate_action_rss(struct mlx5_flow_rss_desc *rss_desc,
+ const struct rte_flow_action *action)
+{
+ const struct rte_flow_action_rss *rss = action->conf;
+ const uint8_t *rss_key;
+
+ memcpy(rss_desc->queue, rss->queue, rss->queue_num * sizeof(uint16_t));
+ rss_desc->queue_num = rss->queue_num;
+ /* NULL RSS key indicates default RSS key. */
+ rss_key = !rss->key ? rss_hash_default_key : rss->key;
+ memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN);
+ /*
+ * rss->level and rss->types should be set in advance when expanding
+ * items for RSS.
+ */
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in] dev_flow
+ * Pointer to mlx5_flow.
+ * @param[in] action
+ * Action configuration.
+ */
+static void
+flow_verbs_translate_action_flag
+ (struct mlx5_flow *dev_flow,
+ const struct rte_flow_action *action __rte_unused)
+{
+ unsigned int size = sizeof(struct ibv_flow_spec_action_tag);
+ struct ibv_flow_spec_action_tag tag = {
+ .type = IBV_FLOW_SPEC_ACTION_TAG,
+ .size = size,
+ .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT),
+ };
+
+ flow_verbs_spec_add(&dev_flow->verbs, &tag, size);
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in] dev_flow
+ * Pointer to mlx5_flow.
+ * @param[in] action
+ * Action configuration.
+ */
+static void
+flow_verbs_translate_action_mark(struct mlx5_flow *dev_flow,
+ const struct rte_flow_action *action)
+{
+ const struct rte_flow_action_mark *mark = action->conf;
+ unsigned int size = sizeof(struct ibv_flow_spec_action_tag);
+ struct ibv_flow_spec_action_tag tag = {
+ .type = IBV_FLOW_SPEC_ACTION_TAG,
+ .size = size,
+ .tag_id = mlx5_flow_mark_set(mark->id),
+ };
+
+ flow_verbs_spec_add(&dev_flow->verbs, &tag, size);
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow.
+ *
+ * @param[in, out] dev_flow
+ * Pointer to mlx5_flow.
+ * @param[in] action
+ * Action configuration.
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_verbs_translate_action_count(struct mlx5_flow *dev_flow,
+ const struct rte_flow_action *action,
+ struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
+{
+ const struct rte_flow_action_count *count = action->conf;
+ struct rte_flow *flow = dev_flow->flow;
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \
+ defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ struct mlx5_flow_counter_pool *pool;
+ struct mlx5_flow_counter *cnt = NULL;
+ struct mlx5_flow_counter_ext *cnt_ext;
+ unsigned int size = sizeof(struct ibv_flow_spec_counter_action);
+ struct ibv_flow_spec_counter_action counter = {
+ .type = IBV_FLOW_SPEC_ACTION_COUNT,
+ .size = size,
+ };
+#endif
+
+ if (!flow->counter) {
+ flow->counter = flow_verbs_counter_new(dev, count->shared,
+ count->id);
+ if (!flow->counter)
+ return rte_flow_error_set(error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ action,
+ "cannot get counter"
+ " context.");
+ }
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
+ cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool);
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
+ counter.counter_set_handle = cnt_ext->cs->handle;
+ flow_verbs_spec_add(&dev_flow->verbs, &counter, size);
+#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool);
+ cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
+ counter.counters = cnt_ext->cs;
+ flow_verbs_spec_add(&dev_flow->verbs, &counter, size);
+#endif
+ return 0;
+}
+
+/**
+ * Internal validation function. For validating both actions and items.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[in] external
+ * This flow rule is created by a request external to the PMD.
+ * @param[in] hairpin
+ * Number of hairpin TX actions, 0 means classic flow.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_verbs_validate(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ bool external __rte_unused,
+ int hairpin __rte_unused,
+ struct rte_flow_error *error)
+{
+ int ret;
+ uint64_t action_flags = 0;
+ uint64_t item_flags = 0;
+ uint64_t last_item = 0;
+ uint8_t next_protocol = 0xff;
+ uint16_t ether_type = 0;
+
+ if (items == NULL)
+ return -1;
+ ret = mlx5_flow_validate_attributes(dev, attr, error);
+ if (ret < 0)
+ return ret;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+ int ret = 0;
+
+ switch (items->type) {
+ case RTE_FLOW_ITEM_TYPE_VOID:
+ break;
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ ret = mlx5_flow_validate_item_eth(items, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+ if (items->mask != NULL && items->spec != NULL) {
+ ether_type =
+ ((const struct rte_flow_item_eth *)
+ items->spec)->type;
+ ether_type &=
+ ((const struct rte_flow_item_eth *)
+ items->mask)->type;
+ ether_type = rte_be_to_cpu_16(ether_type);
+ } else {
+ ether_type = 0;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ ret = mlx5_flow_validate_item_vlan(items, item_flags,
+ dev, error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
+ MLX5_FLOW_LAYER_INNER_VLAN) :
+ (MLX5_FLOW_LAYER_OUTER_L2 |
+ MLX5_FLOW_LAYER_OUTER_VLAN);
+ if (items->mask != NULL && items->spec != NULL) {
+ ether_type =
+ ((const struct rte_flow_item_vlan *)
+ items->spec)->inner_type;
+ ether_type &=
+ ((const struct rte_flow_item_vlan *)
+ items->mask)->inner_type;
+ ether_type = rte_be_to_cpu_16(ether_type);
+ } else {
+ ether_type = 0;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ ret = mlx5_flow_validate_item_ipv4(items, item_flags,
+ last_item,
+ ether_type, NULL,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv4 *)
+ items->mask)->hdr.next_proto_id) {
+ next_protocol =
+ ((const struct rte_flow_item_ipv4 *)
+ (items->spec))->hdr.next_proto_id;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv4 *)
+ (items->mask))->hdr.next_proto_id;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ ret = mlx5_flow_validate_item_ipv6(items, item_flags,
+ last_item,
+ ether_type, NULL,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ if (items->mask != NULL &&
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto) {
+ next_protocol =
+ ((const struct rte_flow_item_ipv6 *)
+ items->spec)->hdr.proto;
+ next_protocol &=
+ ((const struct rte_flow_item_ipv6 *)
+ items->mask)->hdr.proto;
+ } else {
+ /* Reset for inner layer. */
+ next_protocol = 0xff;
+ }
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ ret = mlx5_flow_validate_item_udp(items, item_flags,
+ next_protocol,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP :
+ MLX5_FLOW_LAYER_OUTER_L4_UDP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ ret = mlx5_flow_validate_item_tcp
+ (items, item_flags,
+ next_protocol,
+ &rte_flow_item_tcp_mask,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ ret = mlx5_flow_validate_item_vxlan(items, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_VXLAN;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ ret = mlx5_flow_validate_item_vxlan_gpe(items,
+ item_flags,
+ dev, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_VXLAN_GPE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ ret = mlx5_flow_validate_item_gre(items, item_flags,
+ next_protocol, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ ret = mlx5_flow_validate_item_mpls(dev, items,
+ item_flags,
+ last_item, error);
+ if (ret < 0)
+ return ret;
+ last_item = MLX5_FLOW_LAYER_MPLS;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL, "item not supported");
+ }
+ item_flags |= last_item;
+ }
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ ret = mlx5_flow_validate_action_flag(action_flags,
+ attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_FLAG;
+ break;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ ret = mlx5_flow_validate_action_mark(actions,
+ action_flags,
+ attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_MARK;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ ret = mlx5_flow_validate_action_drop(action_flags,
+ attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_DROP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ ret = mlx5_flow_validate_action_queue(actions,
+ action_flags, dev,
+ attr,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_QUEUE;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ ret = mlx5_flow_validate_action_rss(actions,
+ action_flags, dev,
+ attr, item_flags,
+ error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_RSS;
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ ret = mlx5_flow_validate_action_count(dev, attr, error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_COUNT;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "action not supported");
+ }
+ }
+ /*
+ * Validate the drop action mutual exclusion with other actions.
+ * Drop action is mutually-exclusive with any other action, except for
+ * Count action.
+ */
+ if ((action_flags & MLX5_FLOW_ACTION_DROP) &&
+ (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT)))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+ "Drop action is mutually-exclusive "
+ "with any other action, except for "
+ "Count action");
+ if (!(action_flags & MLX5_FLOW_FATE_ACTIONS))
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, actions,
+ "no fate action is found");
+ return 0;
+}
+
+/**
+ * Calculate the number of bytes needed for the action part of the Verbs
+ * flow.
+ *
+ * @param[in] actions
+ * Pointer to the list of actions.
+ *
+ * @return
+ * The size of the memory needed for all actions.
+ */
+static int
+flow_verbs_get_actions_size(const struct rte_flow_action actions[])
+{
+ int size = 0;
+
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ size += sizeof(struct ibv_flow_spec_action_tag);
+ break;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ size += sizeof(struct ibv_flow_spec_action_tag);
+ break;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ size += sizeof(struct ibv_flow_spec_action_drop);
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \
+ defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ size += sizeof(struct ibv_flow_spec_counter_action);
+#endif
+ break;
+ default:
+ break;
+ }
+ }
+ return size;
+}
+
+/**
+ * Calculate the number of bytes needed for the item part of the Verbs
+ * flow.
+ *
+ * @param[in] items
+ * Pointer to the list of items.
+ *
+ * @return
+ * The size of the memory needed for all items.
+ */
+static int
+flow_verbs_get_items_size(const struct rte_flow_item items[])
+{
+ int size = 0;
+
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ switch (items->type) {
+ case RTE_FLOW_ITEM_TYPE_VOID:
+ break;
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ size += sizeof(struct ibv_flow_spec_eth);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ size += sizeof(struct ibv_flow_spec_eth);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ size += sizeof(struct ibv_flow_spec_ipv4_ext);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ size += sizeof(struct ibv_flow_spec_ipv6);
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ size += sizeof(struct ibv_flow_spec_tcp_udp);
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ size += sizeof(struct ibv_flow_spec_tcp_udp);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ size += sizeof(struct ibv_flow_spec_tunnel);
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ size += sizeof(struct ibv_flow_spec_tunnel);
+ break;
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ size += sizeof(struct ibv_flow_spec_gre);
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ size += sizeof(struct ibv_flow_spec_mpls);
+ break;
+#else
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ size += sizeof(struct ibv_flow_spec_tunnel);
+ break;
+#endif
+ default:
+ break;
+ }
+ }
+ return size;
+}
+
+/**
+ * Internal preparation function. Allocate mlx5_flow with the required size.
+ * The required size is calculated based on the actions and items. This function
+ * also returns the detected actions and items for later use.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * Pointer to mlx5_flow object on success, otherwise NULL and rte_errno
+ * is set.
+ */
+static struct mlx5_flow *
+flow_verbs_prepare(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr __rte_unused,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ size_t size = 0;
+ uint32_t handle_idx = 0;
+ struct mlx5_flow *dev_flow;
+ struct mlx5_flow_handle *dev_handle;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ size += flow_verbs_get_actions_size(actions);
+ size += flow_verbs_get_items_size(items);
+ if (size > MLX5_VERBS_MAX_SPEC_ACT_SIZE) {
+ rte_flow_error_set(error, E2BIG,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Verbs spec/action size too large");
+ return NULL;
+ }
+ /* Guard against overflowing the temporary device flow array. */
+ if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) {
+ rte_flow_error_set(error, ENOSPC,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "not free temporary device flow");
+ return NULL;
+ }
+ dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ &handle_idx);
+ if (!dev_handle) {
+ rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "not enough memory to create flow handle");
+ return NULL;
+ }
+ /* Multi-threading is not supported. */
+ dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++];
+ dev_flow->handle = dev_handle;
+ dev_flow->handle_idx = handle_idx;
+ /* Memcpy is used, only size needs to be cleared to 0. */
+ dev_flow->verbs.size = 0;
+ dev_flow->verbs.attr.num_of_specs = 0;
+ dev_flow->ingress = attr->ingress;
+ dev_flow->hash_fields = 0;
+ /* Need to set transfer attribute: not supported in Verbs mode. */
+ return dev_flow;
+}
+
+/**
+ * Fill the flow with verb spec.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[in, out] dev_flow
+ * Pointer to the mlx5 flow.
+ * @param[in] attr
+ * Pointer to the flow attributes.
+ * @param[in] items
+ * Pointer to the list of items.
+ * @param[in] actions
+ * Pointer to the list of actions.
+ * @param[out] error
+ * Pointer to the error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_verbs_translate(struct rte_eth_dev *dev,
+ struct mlx5_flow *dev_flow,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ uint64_t item_flags = 0;
+ uint64_t action_flags = 0;
+ uint64_t priority = attr->priority;
+ uint32_t subpriority = 0;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *)
+ priv->rss_desc)
+ [!!priv->flow_nested_idx];
+
+ if (priority == MLX5_FLOW_PRIO_RSVD)
+ priority = priv->config.flow_prio - 1;
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ int ret;
+
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ flow_verbs_translate_action_flag(dev_flow, actions);
+ action_flags |= MLX5_FLOW_ACTION_FLAG;
+ dev_flow->handle->mark = 1;
+ break;
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ flow_verbs_translate_action_mark(dev_flow, actions);
+ action_flags |= MLX5_FLOW_ACTION_MARK;
+ dev_flow->handle->mark = 1;
+ break;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ flow_verbs_translate_action_drop(dev_flow, actions);
+ action_flags |= MLX5_FLOW_ACTION_DROP;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ flow_verbs_translate_action_queue(rss_desc, actions);
+ action_flags |= MLX5_FLOW_ACTION_QUEUE;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE;
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ flow_verbs_translate_action_rss(rss_desc, actions);
+ action_flags |= MLX5_FLOW_ACTION_RSS;
+ dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE;
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ ret = flow_verbs_translate_action_count(dev_flow,
+ actions,
+ dev, error);
+ if (ret < 0)
+ return ret;
+ action_flags |= MLX5_FLOW_ACTION_COUNT;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "action not supported");
+ }
+ }
+ dev_flow->act_flags = action_flags;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);
+
+ switch (items->type) {
+ case RTE_FLOW_ITEM_TYPE_VOID:
+ break;
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ flow_verbs_translate_item_eth(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+ MLX5_FLOW_LAYER_OUTER_L2;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ flow_verbs_translate_item_vlan(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
+ MLX5_FLOW_LAYER_INNER_VLAN) :
+ (MLX5_FLOW_LAYER_OUTER_L2 |
+ MLX5_FLOW_LAYER_OUTER_VLAN);
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ flow_verbs_translate_item_ipv4(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L3;
+ dev_flow->hash_fields |=
+ mlx5_flow_hashfields_adjust
+ (rss_desc, tunnel,
+ MLX5_IPV4_LAYER_TYPES,
+ MLX5_IPV4_IBV_RX_HASH);
+ item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+ break;
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ flow_verbs_translate_item_ipv6(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L3;
+ dev_flow->hash_fields |=
+ mlx5_flow_hashfields_adjust
+ (rss_desc, tunnel,
+ MLX5_IPV6_LAYER_TYPES,
+ MLX5_IPV6_IBV_RX_HASH);
+ item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+ MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+ break;
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ flow_verbs_translate_item_tcp(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L4;
+ dev_flow->hash_fields |=
+ mlx5_flow_hashfields_adjust
+ (rss_desc, tunnel, ETH_RSS_TCP,
+ (IBV_RX_HASH_SRC_PORT_TCP |
+ IBV_RX_HASH_DST_PORT_TCP));
+ item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP :
+ MLX5_FLOW_LAYER_OUTER_L4_TCP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ flow_verbs_translate_item_udp(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L4;
+ dev_flow->hash_fields |=
+ mlx5_flow_hashfields_adjust
+ (rss_desc, tunnel, ETH_RSS_UDP,
+ (IBV_RX_HASH_SRC_PORT_UDP |
+ IBV_RX_HASH_DST_PORT_UDP));
+ item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP :
+ MLX5_FLOW_LAYER_OUTER_L4_UDP;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ flow_verbs_translate_item_vxlan(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= MLX5_FLOW_LAYER_VXLAN;
+ break;
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ flow_verbs_translate_item_vxlan_gpe(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ flow_verbs_translate_item_gre(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= MLX5_FLOW_LAYER_GRE;
+ break;
+ case RTE_FLOW_ITEM_TYPE_MPLS:
+ flow_verbs_translate_item_mpls(dev_flow, items,
+ item_flags);
+ subpriority = MLX5_PRIORITY_MAP_L2;
+ item_flags |= MLX5_FLOW_LAYER_MPLS;
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ NULL,
+ "item not supported");
+ }
+ }
+ dev_flow->handle->layers = item_flags;
+ /* Other members of attr will be ignored. */
+ dev_flow->verbs.attr.priority =
+ mlx5_flow_adjust_priority(dev, priority, subpriority);
+ dev_flow->verbs.attr.port = (uint8_t)priv->ibv_port;
+ return 0;
+}
+
+/**
+ * Remove the flow from the NIC but keep it in memory.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static void
+flow_verbs_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_handle *handle;
+ uint32_t handle_idx;
+
+ if (!flow)
+ return;
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ handle_idx, handle, next) {
+ if (handle->ib_flow) {
+ claim_zero(mlx5_glue->destroy_flow(handle->ib_flow));
+ handle->ib_flow = NULL;
+ }
+ /* hrxq is a union; only touch it when the fate action flag indicates it is set. */
+ if (handle->rix_hrxq) {
+ if (handle->fate_action == MLX5_FLOW_FATE_DROP) {
+ mlx5_hrxq_drop_release(dev);
+ handle->rix_hrxq = 0;
+ } else if (handle->fate_action ==
+ MLX5_FLOW_FATE_QUEUE) {
+ mlx5_hrxq_release(dev, handle->rix_hrxq);
+ handle->rix_hrxq = 0;
+ }
+ }
+ if (handle->vf_vlan.tag && handle->vf_vlan.created)
+ mlx5_vlan_vmwa_release(dev, &handle->vf_vlan);
+ }
+}
+
+/**
+ * Remove the flow from the NIC and the memory.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ */
+static void
+flow_verbs_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_handle *handle;
+
+ if (!flow)
+ return;
+ flow_verbs_remove(dev, flow);
+ while (flow->dev_handles) {
+ uint32_t tmp_idx = flow->dev_handles;
+
+ handle = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ tmp_idx);
+ if (!handle)
+ return;
+ flow->dev_handles = handle->next.next;
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW],
+ tmp_idx);
+ }
+ if (flow->counter) {
+ flow_verbs_counter_release(dev, flow->counter);
+ flow->counter = 0;
+ }
+}
+
+/**
+ * Apply the flow to the NIC.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in, out] flow
+ * Pointer to flow structure.
+ * @param[out] error
+ * Pointer to error structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_handle *handle;
+ struct mlx5_flow *dev_flow;
+ struct mlx5_hrxq *hrxq;
+ uint32_t dev_handles;
+ int err;
+ int idx;
+
+ for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) {
+ dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx];
+ handle = dev_flow->handle;
+ if (handle->fate_action == MLX5_FLOW_FATE_DROP) {
+ hrxq = mlx5_hrxq_drop_new(dev);
+ if (!hrxq) {
+ rte_flow_error_set
+ (error, errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot get drop hash queue");
+ goto error;
+ }
+ } else {
+ uint32_t hrxq_idx;
+ struct mlx5_flow_rss_desc *rss_desc =
+ &((struct mlx5_flow_rss_desc *)priv->rss_desc)
+ [!!priv->flow_nested_idx];
+
+ MLX5_ASSERT(rss_desc->queue_num);
+ hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key,
+ MLX5_RSS_HASH_KEY_LEN,
+ dev_flow->hash_fields,
+ rss_desc->queue,
+ rss_desc->queue_num);
+ if (!hrxq_idx)
+ hrxq_idx = mlx5_hrxq_new(dev, rss_desc->key,
+ MLX5_RSS_HASH_KEY_LEN,
+ dev_flow->hash_fields,
+ rss_desc->queue,
+ rss_desc->queue_num,
+ !!(handle->layers &
+ MLX5_FLOW_LAYER_TUNNEL));
+ hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ],
+ hrxq_idx);
+ if (!hrxq) {
+ rte_flow_error_set
+ (error, rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot get hash queue");
+ goto error;
+ }
+ handle->rix_hrxq = hrxq_idx;
+ }
+ MLX5_ASSERT(hrxq);
+ handle->ib_flow = mlx5_glue->create_flow(hrxq->qp,
+ &dev_flow->verbs.attr);
+ if (!handle->ib_flow) {
+ rte_flow_error_set(error, errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "hardware refuses to create flow");
+ goto error;
+ }
+ if (priv->vmwa_context &&
+ handle->vf_vlan.tag && !handle->vf_vlan.created) {
+ /*
+ * The rule contains the VLAN pattern.
+			 * For VF we are going to create a VLAN
+			 * interface to make the hypervisor set the
+			 * correct e-Switch vport context.
+ */
+ mlx5_vlan_vmwa_acquire(dev, &handle->vf_vlan);
+ }
+ }
+ return 0;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
+ dev_handles, handle, next) {
+		/* hrxq is a union, only touch it when the fate flag is set. */
+ if (handle->rix_hrxq) {
+ if (handle->fate_action == MLX5_FLOW_FATE_DROP) {
+ mlx5_hrxq_drop_release(dev);
+ handle->rix_hrxq = 0;
+ } else if (handle->fate_action ==
+ MLX5_FLOW_FATE_QUEUE) {
+ mlx5_hrxq_release(dev, handle->rix_hrxq);
+ handle->rix_hrxq = 0;
+ }
+ }
+ if (handle->vf_vlan.tag && handle->vf_vlan.created)
+ mlx5_vlan_vmwa_release(dev, &handle->vf_vlan);
+ }
+ rte_errno = err; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Query a flow.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+static int
+flow_verbs_query(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ const struct rte_flow_action *actions,
+ void *data,
+ struct rte_flow_error *error)
+{
+ int ret = -EINVAL;
+
+ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+ switch (actions->type) {
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ break;
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ ret = flow_verbs_counter_query(dev, flow, data, error);
+ break;
+ default:
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION,
+ actions,
+ "action not supported");
+ }
+ }
+ return ret;
+}
+
+const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops = {
+ .validate = flow_verbs_validate,
+ .prepare = flow_verbs_prepare,
+ .translate = flow_verbs_translate,
+ .apply = flow_verbs_apply,
+ .remove = flow_verbs_remove,
+ .destroy = flow_verbs_destroy,
+ .query = flow_verbs_query,
+};
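+
+/*
+ * Illustrative sketch (not part of the driver): how an application-level
+ * rte_flow_query() call for a COUNT action reaches flow_verbs_query() above.
+ * The port id and flow handle are assumed to come from an earlier
+ * rte_flow_create() with an RTE_FLOW_ACTION_TYPE_COUNT action; <stdio.h> and
+ * <inttypes.h> are assumed to be available.
+ */
+static void
+doc_example_query_flow_count(uint16_t port_id, struct rte_flow *flow)
+{
+	static const struct rte_flow_action actions[] = {
+		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
+		{ .type = RTE_FLOW_ACTION_TYPE_END },
+	};
+	struct rte_flow_query_count count = { .reset = 1 };
+	struct rte_flow_error error;
+
+	/* Dispatched to mlx5_flow_verbs_drv_ops.query on this path. */
+	if (!rte_flow_query(port_id, flow, actions, &count, &error) &&
+	    count.hits_set)
+		printf("hits=%" PRIu64 " bytes=%" PRIu64 "\n",
+		       count.hits, count.bytes);
+}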
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c
new file mode 100644
index 000000000..291f7724c
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_common.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+
+/**
+ * Get MAC address by querying netdevice.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] mac
+ * MAC address output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
+{
+ struct ifreq request;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+ if (ret)
+ return ret;
+ memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
+ return 0;
+}
+
+/**
+ * Remove a MAC address from the internal array.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param index
+ * MAC address index.
+ */
+static void
+mlx5_internal_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int vf = priv->config.vf;
+
+ MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
+ if (rte_is_zero_ether_addr(&dev->data->mac_addrs[index]))
+ return;
+ if (vf)
+ mlx5_nl_mac_addr_remove(priv->nl_socket_route,
+ mlx5_ifindex(dev), priv->mac_own,
+ &dev->data->mac_addrs[index], index);
+ memset(&dev->data->mac_addrs[index], 0, sizeof(struct rte_ether_addr));
+}
+
+/**
+ * Add a MAC address to the internal array.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ * @param index
+ * MAC address index.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_internal_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
+ uint32_t index)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int vf = priv->config.vf;
+ unsigned int i;
+
+ MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
+ if (rte_is_zero_ether_addr(mac)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ /* First, make sure this address isn't already configured. */
+ for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) {
+ /* Skip this index, it's going to be reconfigured. */
+ if (i == index)
+ continue;
+ if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac)))
+ continue;
+ /* Address already configured elsewhere, return with error. */
+ rte_errno = EADDRINUSE;
+ return -rte_errno;
+ }
+ if (vf) {
+ int ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
+ mlx5_ifindex(dev), priv->mac_own,
+ mac, index);
+
+ if (ret)
+ return ret;
+ }
+ dev->data->mac_addrs[index] = *mac;
+ return 0;
+}
+
+/**
+ * DPDK callback to remove a MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param index
+ * MAC address index.
+ */
+void
+mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+ int ret;
+
+ if (index >= MLX5_MAX_UC_MAC_ADDRESSES)
+ return;
+ mlx5_internal_mac_addr_remove(dev, index);
+ if (!dev->data->promiscuous) {
+ ret = mlx5_traffic_restart(dev);
+ if (ret)
+ DRV_LOG(ERR, "port %u cannot restart traffic: %s",
+ dev->data->port_id, strerror(rte_errno));
+ }
+}
+
+/**
+ * DPDK callback to add a MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ * @param index
+ * MAC address index.
+ * @param vmdq
+ * VMDq pool index to associate address with (ignored).
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
+ uint32_t index, uint32_t vmdq __rte_unused)
+{
+ int ret;
+
+ if (index >= MLX5_MAX_UC_MAC_ADDRESSES) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ ret = mlx5_internal_mac_addr_add(dev, mac, index);
+ if (ret < 0)
+ return ret;
+ if (!dev->data->promiscuous)
+ return mlx5_traffic_restart(dev);
+ return 0;
+}
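+
+/*
+ * Illustrative sketch (not part of the driver): the ethdev call that lands in
+ * mlx5_mac_addr_add() above. The port id and address bytes are made-up
+ * values; the pool argument is ignored by this PMD (see vmdq above).
+ */
+static int
+doc_example_add_mac(uint16_t port_id)
+{
+	struct rte_ether_addr mac = {
+		.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
+	};
+
+	return rte_eth_dev_mac_addr_add(port_id, &mac, 0);
+}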
+
+/**
+ * DPDK callback to set primary MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
+{
+ uint16_t port_id;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ /* Configuring the VF instead of its representor. */
+ if (priv->representor) {
+ DRV_LOG(DEBUG, "VF represented by port %u setting primary MAC address",
+ dev->data->port_id);
+ RTE_ETH_FOREACH_DEV_SIBLING(port_id, dev->data->port_id) {
+ priv = rte_eth_devices[port_id].data->dev_private;
+ if (priv->master == 1) {
+ priv = dev->data->dev_private;
+ return mlx5_nl_vf_mac_addr_modify
+ (priv->nl_socket_route,
+ mlx5_ifindex(&rte_eth_devices[port_id]),
+ mac_addr, priv->representor_id);
+ }
+ }
+		rte_errno = ENOTSUP;
+		return -rte_errno;
+ }
+
+ DRV_LOG(DEBUG, "port %u setting primary MAC address",
+ dev->data->port_id);
+ return mlx5_mac_addr_add(dev, mac_addr, 0, 0);
+}
+
+/**
+ * DPDK callback to set multicast addresses list.
+ *
+ * @see rte_eth_dev_set_mc_addr_list()
+ */
+int
+mlx5_set_mc_addr_list(struct rte_eth_dev *dev,
+ struct rte_ether_addr *mc_addr_set, uint32_t nb_mc_addr)
+{
+ uint32_t i;
+ int ret;
+
+ if (nb_mc_addr >= MLX5_MAX_MC_MAC_ADDRESSES) {
+ rte_errno = ENOSPC;
+ return -rte_errno;
+ }
+ for (i = MLX5_MAX_UC_MAC_ADDRESSES; i != MLX5_MAX_MAC_ADDRESSES; ++i)
+ mlx5_internal_mac_addr_remove(dev, i);
+ i = MLX5_MAX_UC_MAC_ADDRESSES;
+ while (nb_mc_addr--) {
+ ret = mlx5_internal_mac_addr_add(dev, mc_addr_set++, i++);
+ if (ret)
+ return ret;
+ }
+ if (!dev->data->promiscuous)
+ return mlx5_traffic_restart(dev);
+ return 0;
+}
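+
+/*
+ * Illustrative sketch (not part of the driver): replacing the whole multicast
+ * list from an application, which is what mlx5_set_mc_addr_list() implements.
+ * The addresses are made-up values.
+ */
+static int
+doc_example_set_mc_list(uint16_t port_id)
+{
+	struct rte_ether_addr mc[2] = {
+		{ .addr_bytes = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 } },
+		{ .addr_bytes = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x02 } },
+	};
+
+	/* Replaces any previously configured multicast addresses. */
+	return rte_eth_dev_set_mc_addr_list(port_id, mc, RTE_DIM(mc));
+}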
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c
new file mode 100644
index 000000000..7ad322d47
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <rte_ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+int
+mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_mp_msg mp_res;
+ struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+ const struct mlx5_mp_param *param =
+ (const struct mlx5_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev;
+ struct mlx5_priv *priv;
+ struct mr_cache_entry entry;
+ uint32_t lkey;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (!rte_eth_dev_is_valid_port(param->port_id)) {
+ rte_errno = ENODEV;
+ DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+ return -rte_errno;
+ }
+ dev = &rte_eth_devices[param->port_id];
+ priv = dev->data->dev_private;
+ switch (param->type) {
+ case MLX5_MP_REQ_CREATE_MR:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ lkey = mlx5_mr_create_primary(priv->sh->pd,
+ &priv->sh->share_cache,
+ &entry, param->args.addr,
+ priv->config.mr_ext_memseg_en);
+ if (lkey == UINT32_MAX)
+ res->result = -rte_errno;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_VERBS_CMD_FD:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ mp_res.num_fds = 1;
+ mp_res.fds[0] = priv->sh->ctx->cmd_fd;
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_STATE_MODIFY:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_queue_state_modify_primary
+ (dev, &param->args.state_modify);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ DRV_LOG(ERR, "port %u invalid mp request type",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] peer
+ * Pointer to the peer socket path.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_mp_msg mp_res;
+ struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+ const struct mlx5_mp_param *param =
+ (const struct mlx5_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev;
+ struct mlx5_priv *priv;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ if (!rte_eth_dev_is_valid_port(param->port_id)) {
+ rte_errno = ENODEV;
+ DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+ return -rte_errno;
+ }
+ dev = &rte_eth_devices[param->port_id];
+ priv = dev->data->dev_private;
+ switch (param->type) {
+ case MLX5_MP_REQ_START_RXTX:
+ DRV_LOG(INFO, "port %u starting datapath", dev->data->port_id);
+ rte_mb();
+ dev->rx_pkt_burst = mlx5_select_rx_function(dev);
+ dev->tx_pkt_burst = mlx5_select_tx_function(dev);
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_STOP_RXTX:
+ DRV_LOG(INFO, "port %u stopping datapath", dev->data->port_id);
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ rte_mb();
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ DRV_LOG(ERR, "port %u invalid mp request type",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
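+
+/*
+ * Illustrative sketch (not part of the driver): the generic rte_mp pattern the
+ * two handlers above follow, i.e. registering an action and replying to the
+ * peer with the same message name. "doc_example_mp" is a made-up action name;
+ * the real handlers are registered elsewhere via rte_mp_action_register().
+ */
+static int
+doc_example_mp_handler(const struct rte_mp_msg *msg, const void *peer)
+{
+	struct rte_mp_msg reply;
+
+	memset(&reply, 0, sizeof(reply));
+	/* The reply must reuse the request name to be matched by the sender. */
+	strlcpy(reply.name, msg->name, sizeof(reply.name));
+	return rte_mp_reply(&reply, peer);
+}
+
+static int
+doc_example_mp_register(void)
+{
+	return rte_mp_action_register("doc_example_mp", doc_example_mp_handler);
+}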
+
+/**
+ * Broadcast request of stopping/starting data-path to secondary processes.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] type
+ * Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx5_mp_req_type type)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_msg *mp_res;
+ struct rte_mp_reply mp_rep;
+ struct mlx5_mp_param *res;
+ struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+ int i;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (!mlx5_shared_data->secondary_cnt)
+ return;
+ if (type != MLX5_MP_REQ_START_RXTX && type != MLX5_MP_REQ_STOP_RXTX) {
+ DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+ dev->data->port_id, type);
+ return;
+ }
+ mp_init_msg(&priv->mp_id, &mp_req, type);
+ ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+ if (ret) {
+ if (rte_errno != ENOTSUP)
+ DRV_LOG(ERR, "port %u failed to request stop/start Rx/Tx (%d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ if (mp_rep.nb_sent != mp_rep.nb_received) {
+ DRV_LOG(ERR,
+ "port %u not all secondaries responded (req_type %d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ for (i = 0; i < mp_rep.nb_received; i++) {
+ mp_res = &mp_rep.msgs[i];
+ res = (struct mlx5_mp_param *)mp_res->param;
+ if (res->result) {
+ DRV_LOG(ERR, "port %u request failed on secondary #%d",
+ dev->data->port_id, i);
+ goto exit;
+ }
+ }
+exit:
+ free(mp_rep.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX5_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c
new file mode 100644
index 000000000..2b4b3e289
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox Technologies, Ltd
+ */
+
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_eal_memconfig.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_rwlock.h>
+#include <rte_bus_pci.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+
+#include "mlx5.h"
+#include "mlx5_mr.h"
+#include "mlx5_rxtx.h"
+
+struct mr_find_contig_memsegs_data {
+ uintptr_t addr;
+ uintptr_t start;
+ uintptr_t end;
+ const struct rte_memseg_list *msl;
+};
+
+struct mr_update_mp_data {
+ struct rte_eth_dev *dev;
+ struct mlx5_mr_ctrl *mr_ctrl;
+ int ret;
+};
+
+/**
+ * Callback for memory free event. Iterate freed memsegs and check whether each
+ * of them belongs to an existing MR. If found, clear the corresponding bit in
+ * the MR bitmap. As a result, the MR becomes fragmented. If it becomes empty,
+ * the MR will be freed later by mlx5_mr_garbage_collect(). Even if this
+ * callback is called from a secondary process, the garbage collector runs in
+ * the primary process, as a secondary process can't call mlx5_mr_create().
+ *
+ * The global cache must be rebuilt if there's any change and this event has to
+ * be propagated to dataplane threads to flush the local caches.
+ *
+ * @param sh
+ * Pointer to the Ethernet device shared context.
+ * @param addr
+ * Address of freed memory.
+ * @param len
+ * Size of freed memory.
+ */
+static void
+mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh,
+ const void *addr, size_t len)
+{
+ const struct rte_memseg_list *msl;
+ struct mlx5_mr *mr;
+ int ms_n;
+ int i;
+ int rebuild = 0;
+
+ DEBUG("device %s free callback: addr=%p, len=%zu",
+ sh->ibdev_name, addr, len);
+ msl = rte_mem_virt2memseg_list(addr);
+ /* addr and len must be page-aligned. */
+ MLX5_ASSERT((uintptr_t)addr ==
+ RTE_ALIGN((uintptr_t)addr, msl->page_sz));
+ MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
+ ms_n = len / msl->page_sz;
+ rte_rwlock_write_lock(&sh->share_cache.rwlock);
+ /* Clear bits of freed memsegs from MR. */
+ for (i = 0; i < ms_n; ++i) {
+ const struct rte_memseg *ms;
+ struct mr_cache_entry entry;
+ uintptr_t start;
+ int ms_idx;
+ uint32_t pos;
+
+ /* Find MR having this memseg. */
+ start = (uintptr_t)addr + i * msl->page_sz;
+ mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, start);
+ if (mr == NULL)
+ continue;
+ MLX5_ASSERT(mr->msl); /* Can't be external memory. */
+ ms = rte_mem_virt2memseg((void *)start, msl);
+ MLX5_ASSERT(ms != NULL);
+ MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ pos = ms_idx - mr->ms_base_idx;
+ MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
+ MLX5_ASSERT(pos < mr->ms_bmp_n);
+ DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p",
+ sh->ibdev_name, (void *)mr, pos, (void *)start);
+ rte_bitmap_clear(mr->ms_bmp, pos);
+ if (--mr->ms_n == 0) {
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr);
+ DEBUG("device %s remove MR(%p) from list",
+ sh->ibdev_name, (void *)mr);
+ }
+ /*
+		 * MR is fragmented or will be freed. The global cache must be
+ * rebuilt.
+ */
+ rebuild = 1;
+ }
+ if (rebuild) {
+ mlx5_mr_rebuild_cache(&sh->share_cache);
+ /*
+ * Flush local caches by propagating invalidation across cores.
+ * rte_smp_wmb() is enough to synchronize this event. If one of
+ * freed memsegs is seen by other core, that means the memseg
+ * has been allocated by allocator, which will come after this
+ * free call. Therefore, this store instruction (incrementing
+ * generation below) will be guaranteed to be seen by other core
+ * before the core sees the newly allocated memory.
+ */
+ ++sh->share_cache.dev_gen;
+ DEBUG("broadcasting local cache flush, gen=%d",
+ sh->share_cache.dev_gen);
+ rte_smp_wmb();
+ }
+ rte_rwlock_write_unlock(&sh->share_cache.rwlock);
+}
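+
+/*
+ * Illustrative sketch (not part of the driver): the consumer side of the
+ * dev_gen bump above. Datapath code keeps a cached generation per queue and
+ * drops its local MR cache when the shared generation has moved on. The names
+ * below are made up; the real logic lives in the common MR code.
+ */
+static inline void
+doc_example_check_mr_cache(uint32_t *cached_gen, const uint32_t *shared_gen,
+			   void (*flush_local_cache)(void))
+{
+	if (*shared_gen != *cached_gen) {
+		/* Global cache changed: invalidate the per-queue lookup table. */
+		flush_local_cache();
+		*cached_gen = *shared_gen;
+	}
+}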
+
+/**
+ * Callback for memory event. It is expected to be called only from the
+ * primary process (see the assertion below).
+ *
+ * @param event_type
+ * Memory event type.
+ * @param addr
+ * Address of memory.
+ * @param len
+ * Size of memory.
+ */
+void
+mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg __rte_unused)
+{
+ struct mlx5_ibv_shared *sh;
+ struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;
+
+ /* Must be called from the primary process. */
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ switch (event_type) {
+ case RTE_MEM_EVENT_FREE:
+ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+ /* Iterate all the existing mlx5 devices. */
+ LIST_FOREACH(sh, dev_list, mem_event_cb)
+ mlx5_mr_mem_event_free_cb(sh, addr, len);
+ rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+ break;
+ case RTE_MEM_EVENT_ALLOC:
+ default:
+ break;
+ }
+}
+
+/**
+ * Bottom-half of LKey search on Rx.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
+{
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ struct mlx5_priv *priv = rxq_ctrl->priv;
+
+ return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
+ &priv->sh->share_cache, mr_ctrl, addr,
+ priv->config.mr_ext_memseg_en);
+}
+
+/**
+ * Bottom-half of LKey search on Tx.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
+{
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ struct mlx5_priv *priv = txq_ctrl->priv;
+
+ return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
+ &priv->sh->share_cache, mr_ctrl, addr,
+ priv->config.mr_ext_memseg_en);
+}
+
+/**
+ * Bottom-half of LKey search on Tx. If the address can't be found in the
+ * memseg list, register the mempool of the mbuf as externally allocated memory.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param mb
+ * Pointer to mbuf.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
+{
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
+ uint32_t lkey;
+
+ lkey = mlx5_tx_addr2mr_bh(txq, addr);
+ if (lkey == UINT32_MAX && rte_errno == ENXIO) {
+ /* Mempool may have externally allocated memory. */
+ return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
+ }
+ return lkey;
+}
+
+/**
+ * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp().
+ *
+ * The externally allocated chunk is registered and an MR is created for it.
+ * The MR object is added to the global list. If the memseg list of an MR
+ * object (mr->msl) is NULL, the MR object can be regarded as externally
+ * allocated memory.
+ *
+ * Once external memory is registered, it should be static. If the memory is
+ * freed and the virtual address range gets different physical memory mapped
+ * again, it may cause a crash on the device due to a stale translation entry.
+ * The PMD can't track free events of external memory for now.
+ */
+static void
+mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
+ struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx __rte_unused)
+{
+ struct mr_update_mp_data *data = opaque;
+ struct rte_eth_dev *dev = data->dev;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
+ struct mlx5_mr *mr = NULL;
+ uintptr_t addr = (uintptr_t)memhdr->addr;
+ size_t len = memhdr->len;
+ struct mr_cache_entry entry;
+ uint32_t lkey;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* If already registered, return immediately. */
+ rte_rwlock_read_lock(&sh->share_cache.rwlock);
+ lkey = mlx5_mr_lookup_cache(&sh->share_cache, &entry, addr);
+ rte_rwlock_read_unlock(&sh->share_cache.rwlock);
+ if (lkey != UINT32_MAX)
+ return;
+ DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
+ dev->data->port_id, mem_idx, mp->name);
+ mr = mlx5_create_mr_ext(sh->pd, addr, len, mp->socket_id);
+ if (!mr) {
+ DRV_LOG(WARNING,
+ "port %u unable to allocate a new MR of"
+ " mempool (%s).",
+ dev->data->port_id, mp->name);
+ data->ret = -1;
+ return;
+ }
+ rte_rwlock_write_lock(&sh->share_cache.rwlock);
+ LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
+ /* Insert to the global cache table. */
+ mlx5_mr_insert_cache(&sh->share_cache, mr);
+ rte_rwlock_write_unlock(&sh->share_cache.rwlock);
+ /* Insert to the local cache table */
+ mlx5_mr_addr2mr_bh(sh->pd, &priv->mp_id, &sh->share_cache,
+ mr_ctrl, addr, priv->config.mr_ext_memseg_en);
+}
+
+/**
+ * Find the first ethdev that matches the PCI device.
+ * Multiple ethdevs per PCI device exist only with representors.
+ * In that case, it is enough to get only one of the ports as they all share
+ * the same ibv context.
+ *
+ * @param pdev
+ * Pointer to the PCI device.
+ *
+ * @return
+ * Pointer to the ethdev if found, NULL otherwise.
+ */
+static struct rte_eth_dev *
+pci_dev_to_eth_dev(struct rte_pci_device *pdev)
+{
+ uint16_t port_id;
+
+ RTE_ETH_FOREACH_DEV_OF(port_id, &pdev->device)
+ return &rte_eth_devices[port_id];
+ return NULL;
+}
+
+/**
+ * DPDK callback to DMA map external memory to a PCI device.
+ *
+ * @param pdev
+ * Pointer to the PCI device.
+ * @param addr
+ * Starting virtual address of memory to be mapped.
+ * @param iova
+ * Starting IOVA address of memory to be mapped.
+ * @param len
+ * Length of memory segment being mapped.
+ *
+ * @return
+ * 0 on success, negative value on error.
+ */
+int
+mlx5_dma_map(struct rte_pci_device *pdev, void *addr,
+ uint64_t iova __rte_unused, size_t len)
+{
+ struct rte_eth_dev *dev;
+ struct mlx5_mr *mr;
+ struct mlx5_priv *priv;
+ struct mlx5_ibv_shared *sh;
+
+ dev = pci_dev_to_eth_dev(pdev);
+ if (!dev) {
+ DRV_LOG(WARNING, "unable to find matching ethdev "
+ "to PCI device %p", (void *)pdev);
+ rte_errno = ENODEV;
+ return -1;
+ }
+ priv = dev->data->dev_private;
+ sh = priv->sh;
+ mr = mlx5_create_mr_ext(sh->pd, (uintptr_t)addr, len, SOCKET_ID_ANY);
+ if (!mr) {
+ DRV_LOG(WARNING,
+ "port %u unable to dma map", dev->data->port_id);
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&sh->share_cache.rwlock);
+ LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
+ /* Insert to the global cache table. */
+ mlx5_mr_insert_cache(&sh->share_cache, mr);
+ rte_rwlock_write_unlock(&sh->share_cache.rwlock);
+ return 0;
+}
+
+/**
+ * DPDK callback to DMA unmap external memory from a PCI device.
+ *
+ * @param pdev
+ * Pointer to the PCI device.
+ * @param addr
+ * Starting virtual address of memory to be unmapped.
+ * @param iova
+ * Starting IOVA address of memory to be unmapped.
+ * @param len
+ * Length of memory segment being unmapped.
+ *
+ * @return
+ * 0 on success, negative value on error.
+ */
+int
+mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr,
+ uint64_t iova __rte_unused, size_t len __rte_unused)
+{
+ struct rte_eth_dev *dev;
+ struct mlx5_priv *priv;
+ struct mlx5_ibv_shared *sh;
+ struct mlx5_mr *mr;
+ struct mr_cache_entry entry;
+
+ dev = pci_dev_to_eth_dev(pdev);
+ if (!dev) {
+ DRV_LOG(WARNING, "unable to find matching ethdev "
+ "to PCI device %p", (void *)pdev);
+ rte_errno = ENODEV;
+ return -1;
+ }
+ priv = dev->data->dev_private;
+ sh = priv->sh;
+ rte_rwlock_read_lock(&sh->share_cache.rwlock);
+ mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, (uintptr_t)addr);
+ if (!mr) {
+ rte_rwlock_read_unlock(&sh->share_cache.rwlock);
+ DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
+ "to PCI device %p", (uintptr_t)addr,
+ (void *)pdev);
+ rte_errno = EINVAL;
+ return -1;
+ }
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr);
+ DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
+ (void *)mr);
+ mlx5_mr_rebuild_cache(&sh->share_cache);
+ /*
+ * Flush local caches by propagating invalidation across cores.
+ * rte_smp_wmb() is enough to synchronize this event. If one of
+ * freed memsegs is seen by other core, that means the memseg
+ * has been allocated by allocator, which will come after this
+ * free call. Therefore, this store instruction (incrementing
+ * generation below) will be guaranteed to be seen by other core
+ * before the core sees the newly allocated memory.
+ */
+ ++sh->share_cache.dev_gen;
+ DEBUG("broadcasting local cache flush, gen=%d",
+ sh->share_cache.dev_gen);
+ rte_smp_wmb();
+ rte_rwlock_read_unlock(&sh->share_cache.rwlock);
+ return 0;
+}
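+
+/*
+ * Illustrative sketch (not part of the driver): how an application might hand
+ * externally allocated memory to the device so that mlx5_dma_map() and
+ * mlx5_dma_unmap() above get invoked. It assumes IOVA-as-VA mode, a suitable
+ * rte_device pointer and that <rte_dev.h> and <rte_memory.h> are included;
+ * buffer, length and page size are application-provided.
+ */
+static int
+doc_example_dma_map_ext(struct rte_device *rdev, void *buf, size_t len,
+			size_t pg_sz)
+{
+	int ret;
+
+	/* Make the VA range known to EAL first. */
+	ret = rte_extmem_register(buf, len, NULL, 0, pg_sz);
+	if (ret)
+		return ret;
+	/* Dispatches to the PCI bus dma_map op, which calls mlx5_dma_map(). */
+	return rte_dev_dma_map(rdev, buf, (uint64_t)(uintptr_t)buf, len);
+}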
+
+/**
+ * Register an MR for all memory chunks of a mempool that has externally
+ * allocated memory, and fill in the local cache.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param mp
+ * Pointer to registering Mempool.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+static uint32_t
+mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp)
+{
+ struct mr_update_mp_data data = {
+ .dev = dev,
+ .mr_ctrl = mr_ctrl,
+ .ret = 0,
+ };
+
+ rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data);
+ return data.ret;
+}
+
+/**
+ * Register an MR for all memory chunks of a mempool that has externally
+ * allocated memory and search the LKey of the address to return.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param addr
+ *   Search key.
+ * @param mp
+ *   Pointer to the registering mempool where addr belongs.
+ *
+ * @return
+ * LKey for address on success, UINT32_MAX on failure.
+ */
+uint32_t
+mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
+ struct rte_mempool *mp)
+{
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ struct mlx5_priv *priv = txq_ctrl->priv;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ DRV_LOG(WARNING,
+ "port %u using address (%p) from unregistered mempool"
+ " having externally allocated memory"
+ " in secondary process, please create mempool"
+ " prior to rte_eth_dev_start()",
+ PORT_ID(priv), (void *)addr);
+ return UINT32_MAX;
+ }
+ mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
+ return mlx5_tx_addr2mr_bh(txq, addr);
+}
+
+/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */
+static void
+mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+ struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx __rte_unused)
+{
+ struct mr_update_mp_data *data = opaque;
+ struct rte_eth_dev *dev = data->dev;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ uint32_t lkey;
+
+ /* Stop iteration if failed in the previous walk. */
+ if (data->ret < 0)
+ return;
+ /* Register address of the chunk and update local caches. */
+ lkey = mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
+ &priv->sh->share_cache, data->mr_ctrl,
+ (uintptr_t)memhdr->addr,
+ priv->config.mr_ext_memseg_en);
+ if (lkey == UINT32_MAX)
+ data->ret = -1;
+}
+
+/**
+ * Register all memory chunks of a mempool.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param mp
+ * Pointer to registering Mempool.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+int
+mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp)
+{
+ struct mr_update_mp_data data = {
+ .dev = dev,
+ .mr_ctrl = mr_ctrl,
+ .ret = 0,
+ };
+
+ rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data);
+ if (data.ret < 0 && rte_errno == ENXIO) {
+ /* Mempool may have externally allocated memory. */
+ return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp);
+ }
+ return data.ret;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h
new file mode 100644
index 000000000..0c5877b3d
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_MR_H_
+#define RTE_PMD_MLX5_MR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_rwlock.h>
+#include <rte_bitmap.h>
+#include <rte_memory.h>
+
+#include <mlx5_common_mr.h>
+
+/* First entry must be NULL for comparison. */
+#define mlx5_mr_btree_len(bt) ((bt)->len - 1)
+
+void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg);
+int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp);
+
+#endif /* RTE_PMD_MLX5_MR_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c
new file mode 100644
index 000000000..653b06914
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_malloc.h>
+#include <rte_ethdev_driver.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+/**
+ * DPDK callback to update the RSS hash configuration.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[in] rss_conf
+ * RSS configuration data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rss_hash_update(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ unsigned int idx;
+
+ if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (rss_conf->rss_key && rss_conf->rss_key_len) {
+ if (rss_conf->rss_key_len != MLX5_RSS_HASH_KEY_LEN) {
+ DRV_LOG(ERR,
+ "port %u RSS key len must be %s Bytes long",
+ dev->data->port_id,
+ RTE_STR(MLX5_RSS_HASH_KEY_LEN));
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key,
+ rss_conf->rss_key_len, 0);
+ if (!priv->rss_conf.rss_key) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
+ rss_conf->rss_key_len);
+ priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
+ }
+ priv->rss_conf.rss_hf = rss_conf->rss_hf;
+ /* Enable the RSS hash in all Rx queues. */
+ for (i = 0, idx = 0; idx != priv->rxqs_n; ++i) {
+ if (!(*priv->rxqs)[i])
+ continue;
+ (*priv->rxqs)[i]->rss_hash = !!rss_conf->rss_hf &&
+ !!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS);
+ ++idx;
+ }
+ return 0;
+}
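+
+/*
+ * Illustrative sketch (not part of the driver): updating the RSS hash setup
+ * from an application, which ends up in mlx5_rss_hash_update() above. The key
+ * must be MLX5_RSS_HASH_KEY_LEN (40) bytes; the hash fields are examples.
+ */
+static int
+doc_example_update_rss(uint16_t port_id, uint8_t *key_40b)
+{
+	struct rte_eth_rss_conf conf = {
+		.rss_key = key_40b,
+		.rss_key_len = 40, /* MLX5_RSS_HASH_KEY_LEN */
+		.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
+	};
+
+	return rte_eth_dev_rss_hash_update(port_id, &conf);
+}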
+
+/**
+ * DPDK callback to get the RSS hash configuration.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[in, out] rss_conf
+ * RSS configuration data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rss_hash_conf_get(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!rss_conf) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (rss_conf->rss_key &&
+ (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) {
+ memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
+ priv->rss_conf.rss_key_len);
+ }
+ rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
+ rss_conf->rss_hf = priv->rss_conf.rss_hf;
+ return 0;
+}
+
+/**
+ * Allocate/reallocate RETA index table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param reta_size
+ * The size of the array to allocate.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ void *mem;
+ unsigned int old_size = priv->reta_idx_n;
+
+ if (priv->reta_idx_n == reta_size)
+ return 0;
+
+ mem = rte_realloc(priv->reta_idx,
+ reta_size * sizeof((*priv->reta_idx)[0]), 0);
+ if (!mem) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ priv->reta_idx = mem;
+ priv->reta_idx_n = reta_size;
+ if (old_size < reta_size)
+ memset(&(*priv->reta_idx)[old_size], 0,
+ (reta_size - old_size) *
+ sizeof((*priv->reta_idx)[0]));
+ return 0;
+}
+
+/**
+ * DPDK callback to get the RETA indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param reta_conf
+ * Pointer to RETA configuration structure array.
+ * @param reta_size
+ * Size of the RETA table.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_rss_reta_query(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int idx;
+ unsigned int i;
+
+ if (!reta_size || reta_size > priv->reta_idx_n) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ /* Fill each entry of the table even if its bit is not set. */
+ for (idx = 0, i = 0; (i != reta_size); ++i) {
+ idx = i / RTE_RETA_GROUP_SIZE;
+ reta_conf[idx].reta[i % RTE_RETA_GROUP_SIZE] =
+ (*priv->reta_idx)[i];
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to update the RETA indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param reta_conf
+ * Pointer to RETA configuration structure array.
+ * @param reta_size
+ * Size of the RETA table.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_rss_reta_update(struct rte_eth_dev *dev,
+ struct rte_eth_rss_reta_entry64 *reta_conf,
+ uint16_t reta_size)
+{
+ int ret;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int idx;
+ unsigned int i;
+ unsigned int pos;
+
+ if (!reta_size) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ ret = mlx5_rss_reta_index_resize(dev, reta_size);
+ if (ret)
+ return ret;
+ for (idx = 0, i = 0; (i != reta_size); ++i) {
+ idx = i / RTE_RETA_GROUP_SIZE;
+ pos = i % RTE_RETA_GROUP_SIZE;
+ if (((reta_conf[idx].mask >> i) & 0x1) == 0)
+ continue;
+ MLX5_ASSERT(reta_conf[idx].reta[pos] < priv->rxqs_n);
+ (*priv->reta_idx)[i] = reta_conf[idx].reta[pos];
+ }
+ if (dev->data->dev_started) {
+ mlx5_dev_stop(dev);
+ priv->skip_default_rss_reta = 1;
+ return mlx5_dev_start(dev);
+ }
+ return 0;
+}
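+
+/*
+ * Illustrative sketch (not part of the driver): filling a RETA update request
+ * from an application; it lands in mlx5_dev_rss_reta_update() above. A
+ * 128-entry table spread round-robin over nb_queues is assumed.
+ */
+static int
+doc_example_update_reta(uint16_t port_id, uint16_t nb_queues)
+{
+	struct rte_eth_rss_reta_entry64 reta[128 / RTE_RETA_GROUP_SIZE];
+	unsigned int i;
+
+	memset(reta, 0, sizeof(reta));
+	for (i = 0; i < 128; i++) {
+		reta[i / RTE_RETA_GROUP_SIZE].mask |=
+			1ULL << (i % RTE_RETA_GROUP_SIZE);
+		reta[i / RTE_RETA_GROUP_SIZE].reta[i % RTE_RETA_GROUP_SIZE] =
+			i % nb_queues;
+	}
+	return rte_eth_dev_rss_reta_update(port_id, reta, 128);
+}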
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c
new file mode 100644
index 000000000..84c8b0526
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev_driver.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+/**
+ * DPDK callback to enable promiscuous mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_promiscuous_enable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ dev->data->promiscuous = 1;
+ if (priv->isolated) {
+ DRV_LOG(WARNING,
+ "port %u cannot enable promiscuous mode"
+ " in flow isolation mode",
+ dev->data->port_id);
+ return 0;
+ }
+ if (priv->config.vf) {
+ ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev),
+ 1);
+ if (ret)
+ return ret;
+ }
+ ret = mlx5_traffic_restart(dev);
+ if (ret)
+ DRV_LOG(ERR, "port %u cannot enable promiscuous mode: %s",
+ dev->data->port_id, strerror(rte_errno));
+
+ /*
+	 * rte_eth_dev_promiscuous_enable() rolls back
+	 * dev->data->promiscuous in case of failure.
+ */
+ return ret;
+}
+
+/**
+ * DPDK callback to disable promiscuous mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_promiscuous_disable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ dev->data->promiscuous = 0;
+ if (priv->config.vf) {
+ ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev),
+ 0);
+ if (ret)
+ return ret;
+ }
+ ret = mlx5_traffic_restart(dev);
+ if (ret)
+ DRV_LOG(ERR, "port %u cannot disable promiscuous mode: %s",
+ dev->data->port_id, strerror(rte_errno));
+
+ /*
+	 * rte_eth_dev_promiscuous_disable() rolls back
+	 * dev->data->promiscuous in case of failure.
+ */
+ return ret;
+}
+
+/**
+ * DPDK callback to enable allmulti mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_allmulticast_enable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ dev->data->all_multicast = 1;
+ if (priv->isolated) {
+ DRV_LOG(WARNING,
+ "port %u cannot enable allmulticast mode"
+ " in flow isolation mode",
+ dev->data->port_id);
+ return 0;
+ }
+ if (priv->config.vf) {
+ ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev),
+ 1);
+ if (ret)
+ goto error;
+ }
+ ret = mlx5_traffic_restart(dev);
+ if (ret)
+		DRV_LOG(ERR, "port %u cannot enable allmulticast mode: %s",
+ dev->data->port_id, strerror(rte_errno));
+error:
+ /*
+	 * rte_eth_allmulticast_enable() rolls back
+	 * dev->data->all_multicast in case of failure.
+ */
+ return ret;
+}
+
+/**
+ * DPDK callback to disable allmulti mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_allmulticast_disable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ dev->data->all_multicast = 0;
+ if (priv->config.vf) {
+ ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev),
+ 0);
+ if (ret)
+ goto error;
+ }
+ ret = mlx5_traffic_restart(dev);
+ if (ret)
+		DRV_LOG(ERR, "port %u cannot disable allmulticast mode: %s",
+ dev->data->port_id, strerror(rte_errno));
+error:
+ /*
+	 * rte_eth_allmulticast_disable() rolls back
+	 * dev->data->all_multicast in case of failure.
+ */
+ return ret;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c
new file mode 100644
index 000000000..7a50ec6f1
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c
@@ -0,0 +1,2976 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <rte_ethdev_driver.h>
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_debug.h>
+#include <rte_io.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_flow.h"
+
+
+/* Default RSS hash key also used for ConnectX-3. */
+uint8_t rss_hash_default_key[] = {
+ 0x2c, 0xc6, 0x81, 0xd1,
+ 0x5b, 0xdb, 0xf4, 0xf7,
+ 0xfc, 0xa2, 0x83, 0x19,
+ 0xdb, 0x1a, 0x3e, 0x94,
+ 0x6b, 0x9e, 0x38, 0xd9,
+ 0x2c, 0x9c, 0x03, 0xd1,
+ 0xad, 0x99, 0x44, 0xa7,
+ 0xd9, 0x56, 0x3d, 0x59,
+ 0x06, 0x3c, 0x25, 0xf3,
+ 0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+/* Length of the default RSS hash key. */
+static_assert(MLX5_RSS_HASH_KEY_LEN ==
+ (unsigned int)sizeof(rss_hash_default_key),
+ "wrong RSS default key size.");
+
+/**
+ * Check whether Multi-Packet RQ can be enabled for the device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+inline int
+mlx5_check_mprq_support(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->config.mprq.enabled &&
+ priv->rxqs_n >= priv->config.mprq.min_rxqs_num)
+ return 1;
+ return -ENOTSUP;
+}
+
+/**
+ * Check whether Multi-Packet RQ is enabled for the Rx queue.
+ *
+ * @param rxq
+ * Pointer to receive queue structure.
+ *
+ * @return
+ * 0 if disabled, otherwise enabled.
+ */
+inline int
+mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq)
+{
+ return rxq->strd_num_n > 0;
+}
+
+/**
+ * Check whether Multi-Packet RQ is enabled for the device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 if disabled, otherwise enabled.
+ */
+inline int
+mlx5_mprq_enabled(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint16_t i;
+ uint16_t n = 0;
+ uint16_t n_ibv = 0;
+
+ if (mlx5_check_mprq_support(dev) < 0)
+ return 0;
+ /* All the configured queues should be enabled. */
+ for (i = 0; i < priv->rxqs_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+ struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+ (rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
+ continue;
+ n_ibv++;
+ if (mlx5_rxq_mprq_enabled(rxq))
+ ++n;
+ }
+ /* Multi-Packet RQ can't be partially configured. */
+ MLX5_ASSERT(n == 0 || n == n_ibv);
+ return n == n_ibv;
+}
+
+/**
+ * Allocate RX queue elements for Multi-Packet RQ.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+ unsigned int wqe_n = 1 << rxq->elts_n;
+ unsigned int i;
+ int err;
+
+ /* Iterate on segments. */
+ for (i = 0; i <= wqe_n; ++i) {
+ struct mlx5_mprq_buf *buf;
+
+ if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) {
+ DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ if (i < wqe_n)
+ (*rxq->mprq_bufs)[i] = buf;
+ else
+ rxq->mprq_repl = buf;
+ }
+ DRV_LOG(DEBUG,
+ "port %u Rx queue %u allocated and configured %u segments",
+ rxq->port_id, rxq->idx, wqe_n);
+ return 0;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ wqe_n = i;
+ for (i = 0; (i != wqe_n); ++i) {
+ if ((*rxq->mprq_bufs)[i] != NULL)
+ rte_mempool_put(rxq->mprq_mp,
+ (*rxq->mprq_bufs)[i]);
+ (*rxq->mprq_bufs)[i] = NULL;
+ }
+ DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+ rxq->port_id, rxq->idx);
+ rte_errno = err; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Allocate RX queue elements for Single-Packet RQ.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
+ unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
+ unsigned int i;
+ int err;
+
+ /* Iterate on segments. */
+ for (i = 0; (i != elts_n); ++i) {
+ struct rte_mbuf *buf;
+
+ buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
+ if (buf == NULL) {
+ DRV_LOG(ERR, "port %u empty mbuf pool",
+ PORT_ID(rxq_ctrl->priv));
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ /* Headroom is reserved by rte_pktmbuf_alloc(). */
+ MLX5_ASSERT(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
+ /* Buffer is supposed to be empty. */
+ MLX5_ASSERT(rte_pktmbuf_data_len(buf) == 0);
+ MLX5_ASSERT(rte_pktmbuf_pkt_len(buf) == 0);
+ MLX5_ASSERT(!buf->next);
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ SET_DATA_OFF(buf, 0);
+ PORT(buf) = rxq_ctrl->rxq.port_id;
+ DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
+ PKT_LEN(buf) = DATA_LEN(buf);
+ NB_SEGS(buf) = 1;
+ (*rxq_ctrl->rxq.elts)[i] = buf;
+ }
+ /* If Rx vector is activated. */
+ if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
+ struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+ struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+ struct rte_pktmbuf_pool_private *priv =
+ (struct rte_pktmbuf_pool_private *)
+ rte_mempool_get_priv(rxq_ctrl->rxq.mp);
+ int j;
+
+ /* Initialize default rearm_data for vPMD. */
+ mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
+ rte_mbuf_refcnt_set(mbuf_init, 1);
+ mbuf_init->nb_segs = 1;
+ mbuf_init->port = rxq->port_id;
+ if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF)
+ mbuf_init->ol_flags = EXT_ATTACHED_MBUF;
+ /*
+ * prevent compiler reordering:
+ * rearm_data covers previous fields.
+ */
+ rte_compiler_barrier();
+ rxq->mbuf_initializer =
+ *(rte_xmm_t *)&mbuf_init->rearm_data;
+ /* Padding with a fake mbuf for vectorized Rx. */
+ for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
+ (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
+ }
+ DRV_LOG(DEBUG,
+ "port %u Rx queue %u allocated and configured %u segments"
+ " (max %u packets)",
+ PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,
+ elts_n / (1 << rxq_ctrl->rxq.sges_n));
+ return 0;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ elts_n = i;
+ for (i = 0; (i != elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
+ }
+ DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+ PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);
+ rte_errno = err; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Allocate RX queue elements.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+int
+rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+ rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
+}
+
+/**
+ * Free RX queue elements for Multi-Packet RQ.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ */
+static void
+rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+ uint16_t i;
+
+ DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
+ rxq->port_id, rxq->idx);
+ if (rxq->mprq_bufs == NULL)
+ return;
+ MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
+ for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+ if ((*rxq->mprq_bufs)[i] != NULL)
+ mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
+ (*rxq->mprq_bufs)[i] = NULL;
+ }
+ if (rxq->mprq_repl != NULL) {
+ mlx5_mprq_buf_free(rxq->mprq_repl);
+ rxq->mprq_repl = NULL;
+ }
+}
+
+/**
+ * Free RX queue elements for Single-Packet RQ.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ */
+static void
+rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+ const uint16_t q_n = (1 << rxq->elts_n);
+ const uint16_t q_mask = q_n - 1;
+ uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
+ uint16_t i;
+
+ DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
+ PORT_ID(rxq_ctrl->priv), rxq->idx);
+ if (rxq->elts == NULL)
+ return;
+	/*
+	 * Some mbufs in the ring belong to the application.
+	 * They cannot be freed.
+	 */
+ if (mlx5_rxq_check_vec_support(rxq) > 0) {
+ for (i = 0; i < used; ++i)
+ (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
+ rxq->rq_pi = rxq->rq_ci;
+ }
+ for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+ if ((*rxq->elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq->elts)[i]);
+ (*rxq->elts)[i] = NULL;
+ }
+}
+
+/**
+ * Free RX queue elements.
+ *
+ * @param rxq_ctrl
+ * Pointer to RX queue structure.
+ */
+static void
+rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+ rxq_free_elts_mprq(rxq_ctrl);
+ else
+ rxq_free_elts_sprq(rxq_ctrl);
+}
+
+/**
+ * Returns the per-queue supported offloads.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Supported Rx offloads.
+ */
+uint64_t
+mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
+ DEV_RX_OFFLOAD_TIMESTAMP |
+ DEV_RX_OFFLOAD_JUMBO_FRAME |
+ DEV_RX_OFFLOAD_RSS_HASH);
+
+ if (config->hw_fcs_strip)
+ offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
+
+ if (config->hw_csum)
+ offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
+ DEV_RX_OFFLOAD_UDP_CKSUM |
+ DEV_RX_OFFLOAD_TCP_CKSUM);
+ if (config->hw_vlan_strip)
+ offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
+ if (MLX5_LRO_SUPPORTED(dev))
+ offloads |= DEV_RX_OFFLOAD_TCP_LRO;
+ return offloads;
+}
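+
+/*
+ * Illustrative sketch (not part of the driver): how the offload capabilities
+ * assembled above are consumed by an application through
+ * rte_eth_dev_info_get() before enabling, e.g., LRO on its Rx queues.
+ */
+static int
+doc_example_check_lro(uint16_t port_id)
+{
+	struct rte_eth_dev_info info;
+	int ret;
+
+	ret = rte_eth_dev_info_get(port_id, &info);
+	if (ret)
+		return ret;
+	/* rx_queue_offload_capa is filled from mlx5_get_rx_queue_offloads(). */
+	return !!(info.rx_queue_offload_capa & DEV_RX_OFFLOAD_TCP_LRO);
+}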
+
+
+/**
+ * Returns the per-port supported offloads.
+ *
+ * @return
+ * Supported Rx offloads.
+ */
+uint64_t
+mlx5_get_rx_port_offloads(void)
+{
+ uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
+
+ return offloads;
+}
+
+/**
+ * Verify if the queue can be released.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ *
+ * @return
+ *   1 if the queue can be released,
+ *   0 if the queue cannot be released because there are still references
+ *   to it, a negative errno value and rte_errno is set if the queue
+ *   doesn't exist.
+ */
+static int
+mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+
+ if (!(*priv->rxqs)[idx]) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
+ return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
+}
+
+/**
+ * Rx queue presetup checks.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * RX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ DRV_LOG(WARNING,
+ "port %u increased number of descriptors in Rx queue %u"
+ " to the next power of two (%d)",
+ dev->data->port_id, idx, desc);
+ }
+ DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors",
+ dev->data->port_id, idx, desc);
+ if (idx >= priv->rxqs_n) {
+ DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)",
+ dev->data->port_id, idx, priv->rxqs_n);
+ rte_errno = EOVERFLOW;
+ return -rte_errno;
+ }
+ if (!mlx5_rxq_releasable(dev, idx)) {
+ DRV_LOG(ERR, "port %u unable to release queue index %u",
+ dev->data->port_id, idx);
+ rte_errno = EBUSY;
+ return -rte_errno;
+ }
+ mlx5_rxq_release(dev, idx);
+ return 0;
+}
+
+/**
+ * DPDK callback to configure a RX queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * RX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters.
+ * @param mp
+ * Memory pool for buffer allocations.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ int res;
+
+ res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+ if (res)
+ return res;
+ rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp);
+ if (!rxq_ctrl) {
+ DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
+ dev->data->port_id, idx);
+ (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+ return 0;
+}
+
+/**
+ * DPDK callback to configure a Rx hairpin queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * RX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param hairpin_conf
+ * Hairpin configuration parameters.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ int res;
+
+ res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+ if (res)
+ return res;
+ if (hairpin_conf->peer_count != 1 ||
+ hairpin_conf->peers[0].port != dev->data->port_id ||
+ hairpin_conf->peers[0].queue >= priv->txqs_n) {
+		DRV_LOG(ERR, "port %u unable to setup hairpin queue index %u:"
+			" invalid hairpin configuration", dev->data->port_id,
+			idx);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ rxq_ctrl = mlx5_rxq_hairpin_new(dev, idx, desc, hairpin_conf);
+ if (!rxq_ctrl) {
+ DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
+ dev->data->port_id, idx);
+ (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+ return 0;
+}
+
+/**
+ * DPDK callback to release a RX queue.
+ *
+ * @param dpdk_rxq
+ * Generic RX queue pointer.
+ */
+void
+mlx5_rx_queue_release(void *dpdk_rxq)
+{
+ struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ struct mlx5_priv *priv;
+
+ if (rxq == NULL)
+ return;
+ rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ priv = rxq_ctrl->priv;
+ if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.idx))
+ rte_panic("port %u Rx queue %u is still used by a flow and"
+ " cannot be removed\n",
+ PORT_ID(priv), rxq->idx);
+ mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.idx);
+}
+
+/**
+ * Get an Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array
+ *
+ * @return
+ * The Verbs/DevX object if it exists.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+
+ if (idx >= priv->rxqs_n)
+ return NULL;
+ if (!rxq_data)
+ return NULL;
+ rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ if (rxq_ctrl->obj)
+ rte_atomic32_inc(&rxq_ctrl->obj->refcnt);
+ return rxq_ctrl->obj;
+}
+
+/**
+ * Release the resources allocated for an RQ DevX object.
+ *
+ * @param rxq_ctrl
+ * DevX Rx queue object.
+ */
+static void
+rxq_release_rq_resources(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+ if (rxq_ctrl->rxq.wqes) {
+ rte_free((void *)(uintptr_t)rxq_ctrl->rxq.wqes);
+ rxq_ctrl->rxq.wqes = NULL;
+ }
+ if (rxq_ctrl->wq_umem) {
+ mlx5_glue->devx_umem_dereg(rxq_ctrl->wq_umem);
+ rxq_ctrl->wq_umem = NULL;
+ }
+}
+
+/**
+ * Release Rx hairpin related resources.
+ *
+ * @param rxq_obj
+ * Hairpin Rx queue object.
+ */
+static void
+rxq_obj_hairpin_release(struct mlx5_rxq_obj *rxq_obj)
+{
+ struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
+
+ MLX5_ASSERT(rxq_obj);
+ rq_attr.state = MLX5_RQC_STATE_RST;
+ rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+ mlx5_devx_cmd_modify_rq(rxq_obj->rq, &rq_attr);
+ claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
+}
+
+/**
+ * Release an Rx verbs/DevX queue object.
+ *
+ * @param rxq_obj
+ * Verbs/DevX Rx queue object.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj)
+{
+ MLX5_ASSERT(rxq_obj);
+ if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) {
+ switch (rxq_obj->type) {
+ case MLX5_RXQ_OBJ_TYPE_IBV:
+ MLX5_ASSERT(rxq_obj->wq);
+ MLX5_ASSERT(rxq_obj->cq);
+ rxq_free_elts(rxq_obj->rxq_ctrl);
+ claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+ claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
+ break;
+ case MLX5_RXQ_OBJ_TYPE_DEVX_RQ:
+ MLX5_ASSERT(rxq_obj->cq);
+ MLX5_ASSERT(rxq_obj->rq);
+ rxq_free_elts(rxq_obj->rxq_ctrl);
+ claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
+ rxq_release_rq_resources(rxq_obj->rxq_ctrl);
+ claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
+ break;
+ case MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN:
+ MLX5_ASSERT(rxq_obj->rq);
+ rxq_obj_hairpin_release(rxq_obj);
+ break;
+ }
+ if (rxq_obj->channel)
+ claim_zero(mlx5_glue->destroy_comp_channel
+ (rxq_obj->channel));
+ LIST_REMOVE(rxq_obj, next);
+ rte_free(rxq_obj);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Allocate queue vector and fill epoll fd list for Rx interrupts.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ unsigned int rxqs_n = priv->rxqs_n;
+ unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+ unsigned int count = 0;
+ struct rte_intr_handle *intr_handle = dev->intr_handle;
+
+ if (!dev->data->dev_conf.intr_conf.rxq)
+ return 0;
+ mlx5_rx_intr_vec_disable(dev);
+ intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
+ if (intr_handle->intr_vec == NULL) {
+ DRV_LOG(ERR,
+ "port %u failed to allocate memory for interrupt"
+ " vector, Rx interrupts will not be supported",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ intr_handle->type = RTE_INTR_HANDLE_EXT;
+ for (i = 0; i != n; ++i) {
+ /* This rxq obj must not be released in this function. */
+ struct mlx5_rxq_obj *rxq_obj = mlx5_rxq_obj_get(dev, i);
+ int fd;
+ int flags;
+ int rc;
+
+ /* Skip queues that cannot request interrupts. */
+ if (!rxq_obj || !rxq_obj->channel) {
+ /* Use invalid intr_vec[] index to disable entry. */
+ intr_handle->intr_vec[i] =
+ RTE_INTR_VEC_RXTX_OFFSET +
+ RTE_MAX_RXTX_INTR_VEC_ID;
+ continue;
+ }
+ if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
+ DRV_LOG(ERR,
+ "port %u too many Rx queues for interrupt"
+ " vector size (%d), Rx interrupts cannot be"
+ " enabled",
+ dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID);
+ mlx5_rx_intr_vec_disable(dev);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ fd = rxq_obj->channel->fd;
+ flags = fcntl(fd, F_GETFL);
+ rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+ if (rc < 0) {
+ rte_errno = errno;
+ DRV_LOG(ERR,
+ "port %u failed to make Rx interrupt file"
+ " descriptor %d non-blocking for queue index"
+ " %d",
+ dev->data->port_id, fd, i);
+ mlx5_rx_intr_vec_disable(dev);
+ return -rte_errno;
+ }
+ intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
+ intr_handle->efds[count] = fd;
+ count++;
+ }
+ if (!count)
+ mlx5_rx_intr_vec_disable(dev);
+ else
+ intr_handle->nb_efd = count;
+ return 0;
+}
+
+/**
+ * Clean up Rx interrupts handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_intr_handle *intr_handle = dev->intr_handle;
+ unsigned int i;
+ unsigned int rxqs_n = priv->rxqs_n;
+ unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+ if (!dev->data->dev_conf.intr_conf.rxq)
+ return;
+ if (!intr_handle->intr_vec)
+ goto free;
+ for (i = 0; i != n; ++i) {
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ struct mlx5_rxq_data *rxq_data;
+
+ if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
+ RTE_MAX_RXTX_INTR_VEC_ID)
+ continue;
+ /**
+ * Need to access directly the queue to release the reference
+ * kept in mlx5_rx_intr_vec_enable().
+ */
+ rxq_data = (*priv->rxqs)[i];
+ rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ if (rxq_ctrl->obj)
+ mlx5_rxq_obj_release(rxq_ctrl->obj);
+ }
+free:
+ rte_intr_free_epoll_fd(intr_handle);
+ if (intr_handle->intr_vec)
+ free(intr_handle->intr_vec);
+ intr_handle->nb_efd = 0;
+ intr_handle->intr_vec = NULL;
+}
+
+/**
+ * MLX5 CQ notification.
+ *
+ * @param rxq
+ * Pointer to receive queue structure.
+ * @param sq_n_rxq
+ *   Sequence number per receive queue.
+ */
+static inline void
+mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
+{
+ int sq_n = 0;
+ uint32_t doorbell_hi;
+ uint64_t doorbell;
+ void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
+
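+	/*
+	 * Compose the 64-bit CQ doorbell: the upper 32 bits carry the arming
+	 * sequence number and the current CQ consumer index, the lower 32
+	 * bits carry the CQ number, so a single UAR write below both arms the
+	 * CQ and reports how far it has been polled.
+	 */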
+ sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
+ doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
+ doorbell = (uint64_t)doorbell_hi << 32;
+ doorbell |= rxq->cqn;
+ rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
+ mlx5_uar_write64(rte_cpu_to_be_64(doorbell),
+ cq_db_reg, rxq->uar_lock_cq);
+}
+
+/**
+ * DPDK callback for Rx queue interrupt enable.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param rx_queue_id
+ * Rx queue number.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+
+ rxq_data = (*priv->rxqs)[rx_queue_id];
+ if (!rxq_data) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ if (rxq_ctrl->irq) {
+ struct mlx5_rxq_obj *rxq_obj;
+
+ rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id);
+ if (!rxq_obj) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
+ mlx5_rxq_obj_release(rxq_obj);
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback for Rx queue interrupt disable.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param rx_queue_id
+ * Rx queue number.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ struct mlx5_rxq_obj *rxq_obj = NULL;
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ int ret;
+
+ rxq_data = (*priv->rxqs)[rx_queue_id];
+ if (!rxq_data) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ if (!rxq_ctrl->irq)
+ return 0;
+ rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id);
+ if (!rxq_obj) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ ret = mlx5_glue->get_cq_event(rxq_obj->channel, &ev_cq, &ev_ctx);
+ if (ret || ev_cq != rxq_obj->cq) {
+ rte_errno = EINVAL;
+ goto exit;
+ }
+ rxq_data->cq_arm_sn++;
+ mlx5_glue->ack_cq_events(rxq_obj->cq, 1);
+ mlx5_rxq_obj_release(rxq_obj);
+ return 0;
+exit:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (rxq_obj)
+ mlx5_rxq_obj_release(rxq_obj);
+ DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d",
+ dev->data->port_id, rx_queue_id);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Create a CQ Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param priv
+ * Pointer to device private data.
+ * @param rxq_data
+ * Pointer to Rx queue data.
+ * @param cqe_n
+ * Number of CQEs in CQ.
+ * @param rxq_obj
+ * Pointer to Rx queue object data.
+ *
+ * @return
+ * The Verbs object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_cq *
+mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
+ struct mlx5_rxq_data *rxq_data,
+ unsigned int cqe_n, struct mlx5_rxq_obj *rxq_obj)
+{
+ struct {
+ struct ibv_cq_init_attr_ex ibv;
+ struct mlx5dv_cq_init_attr mlx5;
+ } cq_attr;
+
+ cq_attr.ibv = (struct ibv_cq_init_attr_ex){
+ .cqe = cqe_n,
+ .channel = rxq_obj->channel,
+ .comp_mask = 0,
+ };
+ cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
+ .comp_mask = 0,
+ };
+ if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
+ !rxq_data->lro) {
+ cq_attr.mlx5.comp_mask |=
+ MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ cq_attr.mlx5.cqe_comp_res_format =
+ mlx5_rxq_mprq_enabled(rxq_data) ?
+ MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX :
+ MLX5DV_CQE_RES_FORMAT_HASH;
+#else
+ cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
+#endif
+ /*
+ * For vectorized Rx, it must not be doubled in order to
+ * make cq_ci and rq_ci aligned.
+ */
+ if (mlx5_rxq_check_vec_support(rxq_data) < 0)
+ cq_attr.ibv.cqe *= 2;
+ } else if (priv->config.cqe_comp && rxq_data->hw_timestamp) {
+ DRV_LOG(DEBUG,
+ "port %u Rx CQE compression is disabled for HW"
+ " timestamp",
+ dev->data->port_id);
+ } else if (priv->config.cqe_comp && rxq_data->lro) {
+ DRV_LOG(DEBUG,
+ "port %u Rx CQE compression is disabled for LRO",
+ dev->data->port_id);
+ }
+#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
+ if (priv->config.cqe_pad) {
+ cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS;
+ cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD;
+ }
+#endif
+ return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx,
+ &cq_attr.ibv,
+ &cq_attr.mlx5));
+}
+
+/**
+ * Create a WQ Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param priv
+ * Pointer to device private data.
+ * @param rxq_data
+ * Pointer to Rx queue data.
+ * @param idx
+ * Queue index in DPDK Rx queue array
+ * @param wqe_n
+ * Number of WQEs in WQ.
+ * @param rxq_obj
+ * Pointer to Rx queue object data.
+ *
+ * @return
+ * The Verbs object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_wq *
+mlx5_ibv_wq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
+ struct mlx5_rxq_data *rxq_data, uint16_t idx,
+ unsigned int wqe_n, struct mlx5_rxq_obj *rxq_obj)
+{
+ struct {
+ struct ibv_wq_init_attr ibv;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ struct mlx5dv_wq_init_attr mlx5;
+#endif
+ } wq_attr;
+
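+	/*
+	 * The "desc" budget is split between work requests and scatter/gather
+	 * entries below: for illustration only, 512 elements with sges_n = 2
+	 * become 128 WRs of 4 SGEs each.
+	 */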
+ wq_attr.ibv = (struct ibv_wq_init_attr){
+ .wq_context = NULL, /* Could be useful in the future. */
+ .wq_type = IBV_WQT_RQ,
+ /* Max number of outstanding WRs. */
+ .max_wr = wqe_n >> rxq_data->sges_n,
+ /* Max number of scatter/gather elements in a WR. */
+ .max_sge = 1 << rxq_data->sges_n,
+ .pd = priv->sh->pd,
+ .cq = rxq_obj->cq,
+ .comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0,
+ .create_flags = (rxq_data->vlan_strip ?
+ IBV_WQ_FLAGS_CVLAN_STRIPPING : 0),
+ };
+ /* By default, FCS (CRC) is stripped by hardware. */
+ if (rxq_data->crc_present) {
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+ }
+ if (priv->config.hw_padding) {
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#endif
+ }
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){
+ .comp_mask = 0,
+ };
+ if (mlx5_rxq_mprq_enabled(rxq_data)) {
+ struct mlx5dv_striding_rq_init_attr *mprq_attr =
+ &wq_attr.mlx5.striding_rq_attrs;
+
+ wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
+ *mprq_attr = (struct mlx5dv_striding_rq_init_attr){
+ .single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
+ .single_wqe_log_num_of_strides = rxq_data->strd_num_n,
+ .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
+ };
+ }
+ rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv,
+ &wq_attr.mlx5);
+#else
+ rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv);
+#endif
+ if (rxq_obj->wq) {
+ /*
+ * Make sure number of WRs*SGEs match expectations since a queue
+ * cannot allocate more than "desc" buffers.
+ */
+ if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
+ wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) {
+ DRV_LOG(ERR,
+ "port %u Rx queue %u requested %u*%u but got"
+ " %u*%u WRs*SGEs",
+ dev->data->port_id, idx,
+ wqe_n >> rxq_data->sges_n,
+ (1 << rxq_data->sges_n),
+ wq_attr.ibv.max_wr, wq_attr.ibv.max_sge);
+ claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+ rxq_obj->wq = NULL;
+ rte_errno = EINVAL;
+ }
+ }
+ return rxq_obj->wq;
+}
+
+/**
+ * Fill common fields of create RQ attributes structure.
+ *
+ * @param rxq_data
+ * Pointer to Rx queue data.
+ * @param cqn
+ * CQ number to use with this RQ.
+ * @param rq_attr
+ *   RQ attributes structure to fill.
+ */
+static void
+mlx5_devx_create_rq_attr_fill(struct mlx5_rxq_data *rxq_data, uint32_t cqn,
+ struct mlx5_devx_create_rq_attr *rq_attr)
+{
+ rq_attr->state = MLX5_RQC_STATE_RST;
+ rq_attr->vsd = (rxq_data->vlan_strip) ? 0 : 1;
+ rq_attr->cqn = cqn;
+ rq_attr->scatter_fcs = (rxq_data->crc_present) ? 1 : 0;
+}
+
+/**
+ * Fill common fields of DevX WQ attributes structure.
+ *
+ * @param priv
+ * Pointer to device private data.
+ * @param rxq_ctrl
+ * Pointer to Rx queue control structure.
+ * @param wq_attr
+ *   WQ attributes structure to fill.
+ */
+static void
+mlx5_devx_wq_attr_fill(struct mlx5_priv *priv, struct mlx5_rxq_ctrl *rxq_ctrl,
+ struct mlx5_devx_wq_attr *wq_attr)
+{
+ wq_attr->end_padding_mode = priv->config.cqe_pad ?
+ MLX5_WQ_END_PAD_MODE_ALIGN :
+ MLX5_WQ_END_PAD_MODE_NONE;
+ wq_attr->pd = priv->sh->pdn;
+ wq_attr->dbr_addr = rxq_ctrl->dbr_offset;
+ wq_attr->dbr_umem_id = rxq_ctrl->dbr_umem_id;
+ wq_attr->dbr_umem_valid = 1;
+ wq_attr->wq_umem_id = rxq_ctrl->wq_umem->umem_id;
+ wq_attr->wq_umem_valid = 1;
+}
+
+/**
+ * Create a RQ object using DevX.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array
+ * @param cqn
+ * CQ number to use with this RQ.
+ *
+ * @return
+ * The DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_devx_obj *
+mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_devx_create_rq_attr rq_attr;
+ uint32_t wqe_n = 1 << (rxq_data->elts_n - rxq_data->sges_n);
+ uint32_t wq_size = 0;
+ uint32_t wqe_size = 0;
+ uint32_t log_wqe_size = 0;
+ void *buf = NULL;
+ struct mlx5_devx_obj *rq;
+
+ memset(&rq_attr, 0, sizeof(rq_attr));
+ /* Fill RQ attributes. */
+ rq_attr.mem_rq_type = MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE;
+ rq_attr.flush_in_error_en = 1;
+ mlx5_devx_create_rq_attr_fill(rxq_data, cqn, &rq_attr);
+ /* Fill WQ attributes for this RQ. */
+ if (mlx5_rxq_mprq_enabled(rxq_data)) {
+ rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ;
+ /*
+ * Number of strides in each WQE:
+ * 512*2^single_wqe_log_num_of_strides.
+ */
+ rq_attr.wq_attr.single_wqe_log_num_of_strides =
+ rxq_data->strd_num_n -
+ MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
+ /* Stride size = (2^single_stride_log_num_of_bytes)*64B. */
+ rq_attr.wq_attr.single_stride_log_num_of_bytes =
+ rxq_data->strd_sz_n -
+ MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
+ wqe_size = sizeof(struct mlx5_wqe_mprq);
+ } else {
+ rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
+ wqe_size = sizeof(struct mlx5_wqe_data_seg);
+ }
+ log_wqe_size = log2above(wqe_size) + rxq_data->sges_n;
+ rq_attr.wq_attr.log_wq_stride = log_wqe_size;
+ rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n - rxq_data->sges_n;
+ /* Calculate and allocate WQ memory space. */
+	wqe_size = 1 << log_wqe_size; /* Round up to a power of two. */
+ wq_size = wqe_n * wqe_size;
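+	/*
+	 * Illustrative sizing (not a fixed configuration): a non-MPRQ queue
+	 * with 1024 elements and a single 16-byte data segment per WQE gives
+	 * log_wqe_size = 4 and a 16 KiB work queue buffer.
+	 */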
+ buf = rte_calloc_socket(__func__, 1, wq_size, MLX5_WQE_BUF_ALIGNMENT,
+ rxq_ctrl->socket);
+ if (!buf)
+ return NULL;
+ rxq_data->wqes = buf;
+ rxq_ctrl->wq_umem = mlx5_glue->devx_umem_reg(priv->sh->ctx,
+ buf, wq_size, 0);
+ if (!rxq_ctrl->wq_umem) {
+ rte_free(buf);
+ return NULL;
+ }
+ mlx5_devx_wq_attr_fill(priv, rxq_ctrl, &rq_attr.wq_attr);
+ rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &rq_attr, rxq_ctrl->socket);
+ if (!rq)
+ rxq_release_rq_resources(rxq_ctrl);
+ return rq;
+}
+
+/**
+ * Create the Rx hairpin queue object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array
+ *
+ * @return
+ * The hairpin DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_devx_create_rq_attr attr = { 0 };
+ struct mlx5_rxq_obj *tmpl = NULL;
+ int ret = 0;
+ uint32_t max_wq_data;
+
+ MLX5_ASSERT(rxq_data);
+ MLX5_ASSERT(!rxq_ctrl->obj);
+ tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+ rxq_ctrl->socket);
+ if (!tmpl) {
+ DRV_LOG(ERR,
+ "port %u Rx queue %u cannot allocate verbs resources",
+ dev->data->port_id, rxq_data->idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ tmpl->type = MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN;
+ tmpl->rxq_ctrl = rxq_ctrl;
+ attr.hairpin = 1;
+ max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz;
+ /* Jumbo frames > 9KB should be supported, and more packets. */
+ if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) {
+ if (priv->config.log_hp_size > max_wq_data) {
+			DRV_LOG(ERR, "total hairpin data size 2^%u is"
+				" too large",
+ priv->config.log_hp_size);
+ rte_errno = ERANGE;
+ return NULL;
+ }
+ attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size;
+ } else {
+ attr.wq_attr.log_hairpin_data_sz =
+ (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ?
+ max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE;
+ }
+ /* Set the packets number to the maximum value for performance. */
+ attr.wq_attr.log_hairpin_num_packets =
+ attr.wq_attr.log_hairpin_data_sz -
+ MLX5_HAIRPIN_QUEUE_STRIDE;
+ tmpl->rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &attr,
+ rxq_ctrl->socket);
+ if (!tmpl->rq) {
+ DRV_LOG(ERR,
+ "port %u Rx hairpin queue %u can't create rq object",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
+ idx, (void *)&tmpl);
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+ return tmpl;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (tmpl->rq)
+ mlx5_devx_cmd_destroy(tmpl->rq);
+ rte_errno = ret; /* Restore rte_errno. */
+ return NULL;
+}
+
+/**
+ * Create the Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array
+ * @param type
+ * Type of Rx queue object to create.
+ *
+ * @return
+ * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_rxq_obj *
+mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
+ enum mlx5_rxq_obj_type type)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct ibv_wq_attr mod;
+ unsigned int cqe_n;
+ unsigned int wqe_n = 1 << rxq_data->elts_n;
+ struct mlx5_rxq_obj *tmpl = NULL;
+ struct mlx5dv_cq cq_info;
+ struct mlx5dv_rwq rwq;
+ int ret = 0;
+ struct mlx5dv_obj obj;
+
+ MLX5_ASSERT(rxq_data);
+ MLX5_ASSERT(!rxq_ctrl->obj);
+ if (type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN)
+ return mlx5_rxq_obj_hairpin_new(dev, idx);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
+ priv->verbs_alloc_ctx.obj = rxq_ctrl;
+ tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+ rxq_ctrl->socket);
+ if (!tmpl) {
+ DRV_LOG(ERR,
+ "port %u Rx queue %u cannot allocate verbs resources",
+ dev->data->port_id, rxq_data->idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ tmpl->type = type;
+ tmpl->rxq_ctrl = rxq_ctrl;
+ if (rxq_ctrl->irq) {
+ tmpl->channel = mlx5_glue->create_comp_channel(priv->sh->ctx);
+ if (!tmpl->channel) {
+ DRV_LOG(ERR, "port %u: comp channel creation failure",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ }
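+	/*
+	 * Size the CQ for one completion per expected packet: one per stride
+	 * when Multi-Packet RQ is enabled, one per WQE otherwise.
+	 */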
+ if (mlx5_rxq_mprq_enabled(rxq_data))
+ cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1;
+ else
+ cqe_n = wqe_n - 1;
+ tmpl->cq = mlx5_ibv_cq_new(dev, priv, rxq_data, cqe_n, tmpl);
+ if (!tmpl->cq) {
+ DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ obj.cq.in = tmpl->cq;
+ obj.cq.out = &cq_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ);
+ if (ret) {
+ rte_errno = ret;
+ goto error;
+ }
+ if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+ DRV_LOG(ERR,
+ "port %u wrong MLX5_CQE_SIZE environment variable"
+ " value: it should be set to %u",
+ dev->data->port_id, RTE_CACHE_LINE_SIZE);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d",
+ dev->data->port_id, priv->sh->device_attr.orig_attr.max_qp_wr);
+ DRV_LOG(DEBUG, "port %u device_attr.max_sge is %d",
+ dev->data->port_id, priv->sh->device_attr.orig_attr.max_sge);
+ /* Allocate door-bell for types created with DevX. */
+ if (tmpl->type != MLX5_RXQ_OBJ_TYPE_IBV) {
+ struct mlx5_devx_dbr_page *dbr_page;
+ int64_t dbr_offset;
+
+ dbr_offset = mlx5_get_dbr(dev, &dbr_page);
+ if (dbr_offset < 0)
+ goto error;
+ rxq_ctrl->dbr_offset = dbr_offset;
+ rxq_ctrl->dbr_umem_id = dbr_page->umem->umem_id;
+ rxq_ctrl->dbr_umem_id_valid = 1;
+ rxq_data->rq_db = (uint32_t *)((uintptr_t)dbr_page->dbrs +
+ (uintptr_t)rxq_ctrl->dbr_offset);
+ }
+ if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+ tmpl->wq = mlx5_ibv_wq_new(dev, priv, rxq_data, idx, wqe_n,
+ tmpl);
+ if (!tmpl->wq) {
+ DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ /* Change queue state to ready. */
+ mod = (struct ibv_wq_attr){
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ .wq_state = IBV_WQS_RDY,
+ };
+ ret = mlx5_glue->modify_wq(tmpl->wq, &mod);
+ if (ret) {
+ DRV_LOG(ERR,
+ "port %u Rx queue %u WQ state to IBV_WQS_RDY"
+ " failed", dev->data->port_id, idx);
+ rte_errno = ret;
+ goto error;
+ }
+ obj.rwq.in = tmpl->wq;
+ obj.rwq.out = &rwq;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ);
+ if (ret) {
+ rte_errno = ret;
+ goto error;
+ }
+ rxq_data->wqes = rwq.buf;
+ rxq_data->rq_db = rwq.dbrec;
+ } else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) {
+ struct mlx5_devx_modify_rq_attr rq_attr;
+
+ memset(&rq_attr, 0, sizeof(rq_attr));
+ tmpl->rq = mlx5_devx_rq_new(dev, idx, cq_info.cqn);
+ if (!tmpl->rq) {
+ DRV_LOG(ERR, "port %u Rx queue %u RQ creation failure",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ /* Change queue state to ready. */
+ rq_attr.rq_state = MLX5_RQC_STATE_RST;
+ rq_attr.state = MLX5_RQC_STATE_RDY;
+ ret = mlx5_devx_cmd_modify_rq(tmpl->rq, &rq_attr);
+ if (ret)
+ goto error;
+ }
+ /* Fill the rings. */
+ rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
+ rxq_data->cq_db = cq_info.dbrec;
+ rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+ rxq_data->cq_uar = cq_info.cq_uar;
+ rxq_data->cqn = cq_info.cqn;
+ rxq_data->cq_arm_sn = 0;
+ mlx5_rxq_initialize(rxq_data);
+ rxq_data->cq_ci = 0;
+ DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
+ idx, (void *)&tmpl);
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+ return tmpl;
+error:
+ if (tmpl) {
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV && tmpl->wq)
+ claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
+ else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ && tmpl->rq)
+ claim_zero(mlx5_devx_cmd_destroy(tmpl->rq));
+ if (tmpl->cq)
+ claim_zero(mlx5_glue->destroy_cq(tmpl->cq));
+ if (tmpl->channel)
+ claim_zero(mlx5_glue->destroy_comp_channel
+ (tmpl->channel));
+ rte_free(tmpl);
+ rte_errno = ret; /* Restore rte_errno. */
+ }
+ if (type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ)
+ rxq_release_rq_resources(rxq_ctrl);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+ return NULL;
+}
+
+/**
+ * Verify the Rx queue objects list is empty
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The number of objects not released.
+ */
+int
+mlx5_rxq_obj_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret = 0;
+ struct mlx5_rxq_obj *rxq_obj;
+
+ LIST_FOREACH(rxq_obj, &priv->rxqsobj, next) {
+ DRV_LOG(DEBUG, "port %u Rx queue %u still referenced",
+ dev->data->port_id, rxq_obj->rxq_ctrl->rxq.idx);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Callback function to initialize mbufs for Multi-Packet RQ.
+ */
+static inline void
+mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg,
+ void *_m, unsigned int i __rte_unused)
+{
+ struct mlx5_mprq_buf *buf = _m;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ unsigned int strd_n = (unsigned int)(uintptr_t)opaque_arg;
+ unsigned int j;
+
+ memset(_m, 0, sizeof(*buf));
+ buf->mp = mp;
+ rte_atomic16_set(&buf->refcnt, 1);
+ for (j = 0; j != strd_n; ++j) {
+ shinfo = &buf->shinfos[j];
+ shinfo->free_cb = mlx5_mprq_buf_free_cb;
+ shinfo->fcb_opaque = buf;
+ }
+}
+
+/**
+ * Free mempool of Multi-Packet RQ.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, negative errno value on failure.
+ */
+int
+mlx5_mprq_free_mp(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_mempool *mp = priv->mprq_mp;
+ unsigned int i;
+
+ if (mp == NULL)
+ return 0;
+ DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ",
+ dev->data->port_id, mp->name);
+ /*
+	 * If a buffer in the pool has been externally attached to a mbuf and
+	 * is still in use by the application, destroying the Rx queue can
+	 * corrupt the packet. This is unlikely, but it can happen if the
+	 * application dynamically creates and destroys queues while holding
+	 * Rx packets.
+ *
+ * TODO: It is unavoidable for now because the mempool for Multi-Packet
+ * RQ isn't provided by application but managed by PMD.
+ */
+ if (!rte_mempool_full(mp)) {
+ DRV_LOG(ERR,
+ "port %u mempool for Multi-Packet RQ is still in use",
+ dev->data->port_id);
+ rte_errno = EBUSY;
+ return -rte_errno;
+ }
+ rte_mempool_free(mp);
+ /* Unset mempool for each Rx queue. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+
+ if (rxq == NULL)
+ continue;
+ rxq->mprq_mp = NULL;
+ }
+ priv->mprq_mp = NULL;
+ return 0;
+}
+
+/**
+ * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the
+ * mempool. If already allocated, reuse it when there are enough elements.
+ * Otherwise, resize it.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, negative errno value on failure.
+ */
+int
+mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_mempool *mp = priv->mprq_mp;
+ char name[RTE_MEMPOOL_NAMESIZE];
+ unsigned int desc = 0;
+ unsigned int buf_len;
+ unsigned int obj_num;
+ unsigned int obj_size;
+ unsigned int strd_num_n = 0;
+ unsigned int strd_sz_n = 0;
+ unsigned int i;
+ unsigned int n_ibv = 0;
+
+ if (!mlx5_mprq_enabled(dev))
+ return 0;
+ /* Count the total number of descriptors configured. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+ struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+ (rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
+ continue;
+ n_ibv++;
+ desc += 1 << rxq->elts_n;
+ /* Get the max number of strides. */
+ if (strd_num_n < rxq->strd_num_n)
+ strd_num_n = rxq->strd_num_n;
+ /* Get the max size of a stride. */
+ if (strd_sz_n < rxq->strd_sz_n)
+ strd_sz_n = rxq->strd_sz_n;
+ }
+ MLX5_ASSERT(strd_num_n && strd_sz_n);
+ buf_len = (1 << strd_num_n) * (1 << strd_sz_n);
+ obj_size = sizeof(struct mlx5_mprq_buf) + buf_len + (1 << strd_num_n) *
+ sizeof(struct rte_mbuf_ext_shared_info) + RTE_PKTMBUF_HEADROOM;
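+	/*
+	 * Example only: with 64 strides of 2048 bytes each, buf_len is 128 KiB
+	 * and every mempool object additionally carries 64 shared-info
+	 * descriptors plus the mbuf headroom.
+	 */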
+ /*
+	 * Received packets can be either memcpy'd or externally referenced.
+	 * When a packet is attached to an mbuf as an external buffer, it is
+	 * impossible to predict how the buffers will be queued by the
+	 * application, so the exact number of needed buffers cannot be
+	 * pre-allocated; instead, enough buffers are prepared speculatively.
+	 *
+	 * In the data path, if this mempool is depleted, the PMD will memcpy
+	 * received packets into buffers provided by the application (rxq->mp)
+	 * until this mempool becomes available again.
+ */
+ desc *= 4;
+ obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * n_ibv;
+ /*
+ * rte_mempool_create_empty() has sanity check to refuse large cache
+ * size compared to the number of elements.
+ * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a
+ * constant number 2 instead.
+ */
+ obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2);
+	/* Check whether a mempool is already allocated and can be reused. */
+ if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) {
+ DRV_LOG(DEBUG, "port %u mempool %s is being reused",
+ dev->data->port_id, mp->name);
+ /* Reuse. */
+ goto exit;
+ } else if (mp != NULL) {
+ DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it",
+ dev->data->port_id, mp->name);
+ /*
+		 * If freeing fails, the mempool may still be in use and there
+		 * is no choice but to keep using the existing one. On buffer
+		 * underrun, packets will be memcpy'd instead of attached as
+		 * external buffers.
+ */
+ if (mlx5_mprq_free_mp(dev)) {
+ if (mp->elt_size >= obj_size)
+ goto exit;
+ else
+ return -rte_errno;
+ }
+ }
+ snprintf(name, sizeof(name), "port-%u-mprq", dev->data->port_id);
+ mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ,
+ 0, NULL, NULL, mlx5_mprq_buf_init,
+ (void *)(uintptr_t)(1 << strd_num_n),
+ dev->device->numa_node, 0);
+ if (mp == NULL) {
+ DRV_LOG(ERR,
+ "port %u failed to allocate a mempool for"
+ " Multi-Packet RQ, count=%u, size=%u",
+ dev->data->port_id, obj_num, obj_size);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ priv->mprq_mp = mp;
+exit:
+ /* Set mempool for each Rx queue. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+ struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+ (rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
+ continue;
+ rxq->mprq_mp = mp;
+ }
+ DRV_LOG(INFO, "port %u Multi-Packet RQ is configured",
+ dev->data->port_id);
+ return 0;
+}
+
+#define MLX5_MAX_TCP_HDR_OFFSET ((unsigned int)(sizeof(struct rte_ether_hdr) + \
+ sizeof(struct rte_vlan_hdr) * 2 + \
+ sizeof(struct rte_ipv6_hdr)))
+#define MAX_TCP_OPTION_SIZE 40u
+#define MLX5_MAX_LRO_HEADER_FIX ((unsigned int)(MLX5_MAX_TCP_HDR_OFFSET + \
+ sizeof(struct rte_tcp_hdr) + \
+ MAX_TCP_OPTION_SIZE))
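+/*
+ * With the usual header sizes (14B Ethernet, two 4B VLAN tags, 40B IPv6,
+ * 20B TCP and up to 40B of TCP options) these bounds evaluate to 62 and
+ * 122 bytes respectively.
+ */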
+
+/**
+ * Adjust the maximum LRO message size.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ * @param max_lro_size
+ * The maximum size for LRO packet.
+ */
+static void
+mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint16_t idx,
+ uint32_t max_lro_size)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->config.hca_attr.lro_max_msg_sz_mode ==
+ MLX5_LRO_MAX_MSG_SIZE_START_FROM_L4 && max_lro_size >
+ MLX5_MAX_TCP_HDR_OFFSET)
+ max_lro_size -= MLX5_MAX_TCP_HDR_OFFSET;
+ max_lro_size = RTE_MIN(max_lro_size, MLX5_MAX_LRO_SIZE);
+ MLX5_ASSERT(max_lro_size >= MLX5_LRO_SEG_CHUNK_SIZE);
+ max_lro_size /= MLX5_LRO_SEG_CHUNK_SIZE;
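+	/*
+	 * The limit is kept in chunk units and later programmed into the TIR
+	 * as lro_max_msg_sz; e.g. assuming a 256-byte chunk size for
+	 * illustration, a 65280-byte cap becomes 255 units.
+	 */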
+ if (priv->max_lro_msg_size)
+ priv->max_lro_msg_size =
+ RTE_MIN((uint32_t)priv->max_lro_msg_size, max_lro_size);
+ else
+ priv->max_lro_msg_size = max_lro_size;
+ DRV_LOG(DEBUG,
+ "port %u Rx Queue %u max LRO message size adjusted to %u bytes",
+ dev->data->port_id, idx,
+ priv->max_lro_msg_size * MLX5_LRO_SEG_CHUNK_SIZE);
+}
+
+/**
+ * Create a DPDK Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ *
+ * @return
+ * A DPDK queue object on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_rxq_ctrl *
+mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *tmpl;
+ unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ unsigned int mprq_stride_nums;
+ unsigned int mprq_stride_size;
+ unsigned int mprq_stride_cap;
+ struct mlx5_dev_config *config = &priv->config;
+ /*
+ * Always allocate extra slots, even if eventually
+ * the vector Rx will not be used.
+ */
+ uint16_t desc_n =
+ desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
+ uint64_t offloads = conf->offloads |
+ dev->data->dev_conf.rxmode.offloads;
+ unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
+ const int mprq_en = mlx5_check_mprq_support(dev) > 0;
+ unsigned int max_rx_pkt_len = lro_on_queue ?
+ dev->data->dev_conf.rxmode.max_lro_pkt_size :
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ unsigned int non_scatter_min_mbuf_size = max_rx_pkt_len +
+ RTE_PKTMBUF_HEADROOM;
+ unsigned int max_lro_size = 0;
+ unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+
+ if (non_scatter_min_mbuf_size > mb_len && !(offloads &
+ DEV_RX_OFFLOAD_SCATTER)) {
+ DRV_LOG(ERR, "port %u Rx queue %u: Scatter offload is not"
+			" configured and not enough mbuf space (%u) to contain"
+			" the maximum Rx packet length (%u) with head-room (%u)",
+ dev->data->port_id, idx, mb_len, max_rx_pkt_len,
+ RTE_PKTMBUF_HEADROOM);
+ rte_errno = ENOSPC;
+ return NULL;
+ }
+ tmpl = rte_calloc_socket("RXQ", 1,
+ sizeof(*tmpl) +
+ desc_n * sizeof(struct rte_mbuf *),
+ 0, socket);
+ if (!tmpl) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ tmpl->type = MLX5_RXQ_TYPE_STANDARD;
+ if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh,
+ MLX5_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
+ goto error;
+ }
+ tmpl->socket = socket;
+ if (dev->data->dev_conf.intr_conf.rxq)
+ tmpl->irq = 1;
+ mprq_stride_nums = config->mprq.stride_num_n ?
+ config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
+ mprq_stride_size = non_scatter_min_mbuf_size <=
+ (1U << config->mprq.max_stride_size_n) ?
+ log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
+ mprq_stride_cap = (config->mprq.stride_num_n ?
+ (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
+ (config->mprq.stride_size_n ?
+ (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+ /*
+ * This Rx queue can be configured as a Multi-Packet RQ if all of the
+ * following conditions are met:
+ * - MPRQ is enabled.
+ * - The number of descs is more than the number of strides.
+ * - max_rx_pkt_len plus overhead is less than the max size
+ * of a stride or mprq_stride_size is specified by a user.
+	 * Make sure that there are enough strides to encapsulate the
+	 * maximum packet size in case mprq_stride_size is set.
+ * Otherwise, enable Rx scatter if necessary.
+ */
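+	/*
+	 * Illustrative case: assuming 64 strides per WQE and 1024 descriptors,
+	 * MPRQ is taken only while the maximum packet plus headroom still fits
+	 * the stride budget, and the descriptor count is then trimmed to
+	 * 1024 / 64 = 16 WQEs below.
+	 */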
+ if (mprq_en && desc > (1U << mprq_stride_nums) &&
+ (non_scatter_min_mbuf_size <=
+ (1U << config->mprq.max_stride_size_n) ||
+ (config->mprq.stride_size_n &&
+ non_scatter_min_mbuf_size <= mprq_stride_cap))) {
+ /* TODO: Rx scatter isn't supported yet. */
+ tmpl->rxq.sges_n = 0;
+ /* Trim the number of descs needed. */
+ desc >>= mprq_stride_nums;
+ tmpl->rxq.strd_num_n = config->mprq.stride_num_n ?
+ config->mprq.stride_num_n : mprq_stride_nums;
+ tmpl->rxq.strd_sz_n = config->mprq.stride_size_n ?
+ config->mprq.stride_size_n : mprq_stride_size;
+ tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
+ tmpl->rxq.strd_scatter_en =
+ !!(offloads & DEV_RX_OFFLOAD_SCATTER);
+ tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size,
+ config->mprq.max_memcpy_len);
+ max_lro_size = RTE_MIN(max_rx_pkt_len,
+ (1u << tmpl->rxq.strd_num_n) *
+ (1u << tmpl->rxq.strd_sz_n));
+ DRV_LOG(DEBUG,
+ "port %u Rx queue %u: Multi-Packet RQ is enabled"
+ " strd_num_n = %u, strd_sz_n = %u",
+ dev->data->port_id, idx,
+ tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n);
+ } else if (max_rx_pkt_len <= first_mb_free_size) {
+ tmpl->rxq.sges_n = 0;
+ max_lro_size = max_rx_pkt_len;
+ } else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
+ unsigned int size = non_scatter_min_mbuf_size;
+ unsigned int sges_n;
+
+ if (lro_on_queue && first_mb_free_size <
+ MLX5_MAX_LRO_HEADER_FIX) {
+ DRV_LOG(ERR, "Not enough space in the first segment(%u)"
+ " to include the max header size(%u) for LRO",
+ first_mb_free_size, MLX5_MAX_LRO_HEADER_FIX);
+ rte_errno = ENOTSUP;
+ goto error;
+ }
+ /*
+ * Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two.
+ */
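+		/*
+		 * For example (sizes are illustrative): a 9000-byte frame plus
+		 * headroom over 2048-byte mbuf data rooms needs 5 segments,
+		 * rounded up to 8 SGEs (sges_n = 3).
+		 */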
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ if (sges_n > MLX5_MAX_LOG_RQ_SEGS) {
+ DRV_LOG(ERR,
+ "port %u too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u, the maximum"
+ " supported are %u", dev->data->port_id,
+ 1 << sges_n, max_rx_pkt_len,
+ 1u << MLX5_MAX_LOG_RQ_SEGS);
+ rte_errno = ENOTSUP;
+ goto error;
+ }
+ tmpl->rxq.sges_n = sges_n;
+ max_lro_size = max_rx_pkt_len;
+ }
+ if (config->mprq.enabled && !mlx5_rxq_mprq_enabled(&tmpl->rxq))
+ DRV_LOG(WARNING,
+ "port %u MPRQ is requested but cannot be enabled\n"
+ " (requested: pkt_sz = %u, desc_num = %u,"
+ " rxq_num = %u, stride_sz = %u, stride_num = %u\n"
+ " supported: min_rxqs_num = %u,"
+ " min_stride_sz = %u, max_stride_sz = %u).",
+ dev->data->port_id, non_scatter_min_mbuf_size,
+ desc, priv->rxqs_n,
+ config->mprq.stride_size_n ?
+ (1U << config->mprq.stride_size_n) :
+ (1U << mprq_stride_size),
+ config->mprq.stride_num_n ?
+ (1U << config->mprq.stride_num_n) :
+ (1U << mprq_stride_nums),
+ config->mprq.min_rxqs_num,
+ (1U << config->mprq.min_stride_size_n),
+ (1U << config->mprq.max_stride_size_n));
+ DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u",
+ dev->data->port_id, 1 << tmpl->rxq.sges_n);
+ if (desc % (1 << tmpl->rxq.sges_n)) {
+ DRV_LOG(ERR,
+ "port %u number of Rx queue descriptors (%u) is not a"
+ " multiple of SGEs per packet (%u)",
+ dev->data->port_id,
+ desc,
+ 1 << tmpl->rxq.sges_n);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ mlx5_max_lro_msg_size_adjust(dev, idx, max_lro_size);
+ /* Toggle RX checksum offload if hardware supports it. */
+ tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM);
+ tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP);
+ /* Configure VLAN stripping. */
+ tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
+ /* By default, FCS (CRC) is stripped by hardware. */
+ tmpl->rxq.crc_present = 0;
+ tmpl->rxq.lro = lro_on_queue;
+ if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
+ if (config->hw_fcs_strip) {
+ /*
+ * RQs used for LRO-enabled TIRs should not be
+ * configured to scatter the FCS.
+ */
+ if (lro_on_queue)
+ DRV_LOG(WARNING,
+ "port %u CRC stripping has been "
+ "disabled but will still be performed "
+ "by hardware, because LRO is enabled",
+ dev->data->port_id);
+ else
+ tmpl->rxq.crc_present = 1;
+ } else {
+ DRV_LOG(WARNING,
+ "port %u CRC stripping has been disabled but will"
+ " still be performed by hardware, make sure MLNX_OFED"
+ " and firmware are up to date",
+ dev->data->port_id);
+ }
+ }
+ DRV_LOG(DEBUG,
+ "port %u CRC stripping is %s, %u bytes will be subtracted from"
+ " incoming frames to hide it",
+ dev->data->port_id,
+ tmpl->rxq.crc_present ? "disabled" : "enabled",
+ tmpl->rxq.crc_present << 2);
+ /* Save port ID. */
+ tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf &&
+ (!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS));
+ tmpl->rxq.port_id = dev->data->port_id;
+ tmpl->priv = priv;
+ tmpl->rxq.mp = mp;
+ tmpl->rxq.elts_n = log2above(desc);
+ tmpl->rxq.rq_repl_thresh =
+ MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
+ tmpl->rxq.elts =
+ (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+#ifndef RTE_ARCH_64
+ tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
+#endif
+ tmpl->rxq.idx = idx;
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
+ return tmpl;
+error:
+ rte_free(tmpl);
+ return NULL;
+}
+
+/**
+ * Create a DPDK Rx hairpin queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param hairpin_conf
+ * The hairpin binding configuration.
+ *
+ * @return
+ * A DPDK queue object on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_rxq_ctrl *
+mlx5_rxq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *tmpl;
+
+ tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl), 0, SOCKET_ID_ANY);
+ if (!tmpl) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ tmpl->type = MLX5_RXQ_TYPE_HAIRPIN;
+ tmpl->socket = SOCKET_ID_ANY;
+ tmpl->rxq.rss_hash = 0;
+ tmpl->rxq.port_id = dev->data->port_id;
+ tmpl->priv = priv;
+ tmpl->rxq.mp = NULL;
+ tmpl->rxq.elts_n = log2above(desc);
+ tmpl->rxq.elts = NULL;
+ tmpl->rxq.mr_ctrl.cache_bh = (struct mlx5_mr_btree) { 0 };
+ tmpl->hairpin_conf = *hairpin_conf;
+ tmpl->rxq.idx = idx;
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
+ return tmpl;
+}
+
+/**
+ * Get a Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ *
+ * @return
+ * A pointer to the queue if it exists, NULL otherwise.
+ */
+struct mlx5_rxq_ctrl *
+mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
+
+ if ((*priv->rxqs)[idx]) {
+ rxq_ctrl = container_of((*priv->rxqs)[idx],
+ struct mlx5_rxq_ctrl,
+ rxq);
+ mlx5_rxq_obj_get(dev, idx);
+ rte_atomic32_inc(&rxq_ctrl->refcnt);
+ }
+ return rxq_ctrl;
+}
+
+/**
+ * Release a Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * RX queue index.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+int
+mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+
+ if (!(*priv->rxqs)[idx])
+ return 0;
+ rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
+ MLX5_ASSERT(rxq_ctrl->priv);
+ if (rxq_ctrl->obj && !mlx5_rxq_obj_release(rxq_ctrl->obj))
+ rxq_ctrl->obj = NULL;
+ if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
+ if (rxq_ctrl->dbr_umem_id_valid)
+ claim_zero(mlx5_release_dbr(dev, rxq_ctrl->dbr_umem_id,
+ rxq_ctrl->dbr_offset));
+ if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD)
+ mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
+ LIST_REMOVE(rxq_ctrl, next);
+ rte_free(rxq_ctrl);
+ (*priv->rxqs)[idx] = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Verify the Rx Queue list is empty
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ *   The number of objects not released.
+ */
+int
+mlx5_rxq_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ int ret = 0;
+
+ LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
+ DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced",
+ dev->data->port_id, rxq_ctrl->rxq.idx);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Get a Rx queue type.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Rx queue index.
+ *
+ * @return
+ * The Rx queue type.
+ */
+enum mlx5_rxq_type
+mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
+
+ if (idx < priv->rxqs_n && (*priv->rxqs)[idx]) {
+ rxq_ctrl = container_of((*priv->rxqs)[idx],
+ struct mlx5_rxq_ctrl,
+ rxq);
+ return rxq_ctrl->type;
+ }
+ return MLX5_RXQ_TYPE_UNDEFINED;
+}
+
+/**
+ * Create an indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param queues
+ *   Queues entering the indirection table.
+ * @param queues_n
+ * Number of queues in the array.
+ *
+ * @return
+ * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_ind_table_obj *
+mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues,
+ uint32_t queues_n, enum mlx5_ind_tbl_type type)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl;
+ unsigned int i = 0, j = 0, k = 0;
+
+ ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
+ queues_n * sizeof(uint16_t), 0);
+ if (!ind_tbl) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ ind_tbl->type = type;
+ if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
+ const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
+ log2above(queues_n) :
+ log2above(priv->config.ind_table_max_size);
+ struct ibv_wq *wq[1 << wq_n];
+
+ for (i = 0; i != queues_n; ++i) {
+ struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev,
+ queues[i]);
+ if (!rxq)
+ goto error;
+ wq[i] = rxq->obj->wq;
+ ind_tbl->queues[i] = queues[i];
+ }
+ ind_tbl->queues_n = queues_n;
+ /* Finalise indirection table. */
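+		/*
+		 * Pad the remaining entries by wrapping over the configured
+		 * queues, e.g. 6 queues fill the rest of the table as
+		 * q0..q5, q0, q1, q2, ... until the power-of-two size is
+		 * reached.
+		 */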
+ k = i; /* Retain value of i for use in error case. */
+ for (j = 0; k != (unsigned int)(1 << wq_n); ++k, ++j)
+ wq[k] = wq[j];
+ ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table
+ (priv->sh->ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = wq_n,
+ .ind_tbl = wq,
+ .comp_mask = 0,
+ });
+ if (!ind_tbl->ind_table) {
+ rte_errno = errno;
+ goto error;
+ }
+ } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
+ struct mlx5_devx_rqt_attr *rqt_attr = NULL;
+ const unsigned int rqt_n =
+ 1 << (rte_is_power_of_2(queues_n) ?
+ log2above(queues_n) :
+ log2above(priv->config.ind_table_max_size));
+
+ rqt_attr = rte_calloc(__func__, 1, sizeof(*rqt_attr) +
+ rqt_n * sizeof(uint32_t), 0);
+ if (!rqt_attr) {
+ DRV_LOG(ERR, "port %u cannot allocate RQT resources",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ rqt_attr->rqt_max_size = priv->config.ind_table_max_size;
+ rqt_attr->rqt_actual_size = rqt_n;
+ for (i = 0; i != queues_n; ++i) {
+ struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev,
+ queues[i]);
+ if (!rxq)
+ goto error;
+ rqt_attr->rq_list[i] = rxq->obj->rq->id;
+ ind_tbl->queues[i] = queues[i];
+ }
+ k = i; /* Retain value of i for use in error case. */
+ for (j = 0; k != rqt_n; ++k, ++j)
+ rqt_attr->rq_list[k] = rqt_attr->rq_list[j];
+ ind_tbl->rqt = mlx5_devx_cmd_create_rqt(priv->sh->ctx,
+ rqt_attr);
+ rte_free(rqt_attr);
+ if (!ind_tbl->rqt) {
+ DRV_LOG(ERR, "port %u cannot create DevX RQT",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ ind_tbl->queues_n = queues_n;
+ }
+ rte_atomic32_inc(&ind_tbl->refcnt);
+ LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
+ return ind_tbl;
+error:
+ for (j = 0; j < i; j++)
+ mlx5_rxq_release(dev, ind_tbl->queues[j]);
+ rte_free(ind_tbl);
+ DEBUG("port %u cannot create indirection table", dev->data->port_id);
+ return NULL;
+}
+
+/**
+ * Get an indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param queues
+ *   Queues entering the indirection table.
+ * @param queues_n
+ * Number of queues in the array.
+ *
+ * @return
+ * An indirection table if found.
+ */
+static struct mlx5_ind_table_obj *
+mlx5_ind_table_obj_get(struct rte_eth_dev *dev, const uint16_t *queues,
+ uint32_t queues_n)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl;
+
+ LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
+ if ((ind_tbl->queues_n == queues_n) &&
+ (memcmp(ind_tbl->queues, queues,
+ ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
+ == 0))
+ break;
+ }
+ if (ind_tbl) {
+ unsigned int i;
+
+ rte_atomic32_inc(&ind_tbl->refcnt);
+ for (i = 0; i != ind_tbl->queues_n; ++i)
+ mlx5_rxq_get(dev, ind_tbl->queues[i]);
+ }
+ return ind_tbl;
+}
+
+/**
+ * Release an indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param ind_table
+ * Indirection table to release.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+static int
+mlx5_ind_table_obj_release(struct rte_eth_dev *dev,
+ struct mlx5_ind_table_obj *ind_tbl)
+{
+ unsigned int i;
+
+ if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) {
+ if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV)
+ claim_zero(mlx5_glue->destroy_rwq_ind_table
+ (ind_tbl->ind_table));
+ else if (ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX)
+ claim_zero(mlx5_devx_cmd_destroy(ind_tbl->rqt));
+ }
+ for (i = 0; i != ind_tbl->queues_n; ++i)
+ claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i]));
+ if (!rte_atomic32_read(&ind_tbl->refcnt)) {
+ LIST_REMOVE(ind_tbl, next);
+ rte_free(ind_tbl);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Verify the indirection table object list is empty.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ *   The number of objects not released.
+ */
+int
+mlx5_ind_table_obj_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl;
+ int ret = 0;
+
+ LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
+ DRV_LOG(DEBUG,
+ "port %u indirection table obj %p still referenced",
+ dev->data->port_id, (void *)ind_tbl);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Create an Rx Hash queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param rss_key
+ * RSS key for the Rx hash queue.
+ * @param rss_key_len
+ * RSS key length.
+ * @param hash_fields
+ * Verbs protocol hash field to make the RSS on.
+ * @param queues
+ *   Queues entering the hash queue. In case of empty hash_fields only the
+ * first queue index will be taken for the indirection table.
+ * @param queues_n
+ * Number of queues.
+ * @param tunnel
+ * Tunnel type.
+ *
+ * @return
+ * The Verbs/DevX object initialised index, 0 otherwise and rte_errno is set.
+ */
+uint32_t
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+ const uint8_t *rss_key, uint32_t rss_key_len,
+ uint64_t hash_fields,
+ const uint16_t *queues, uint32_t queues_n,
+ int tunnel __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq;
+ uint32_t hrxq_idx = 0;
+ struct ibv_qp *qp = NULL;
+ struct mlx5_ind_table_obj *ind_tbl;
+ int err;
+ struct mlx5_devx_obj *tir = NULL;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+
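+	/*
+	 * Without any hash field there is nothing to spread on, so the
+	 * indirection table is collapsed to the first queue only.
+	 */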
+ queues_n = hash_fields ? queues_n : 1;
+ ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n);
+ if (!ind_tbl) {
+ enum mlx5_ind_tbl_type type;
+
+ type = rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV ?
+ MLX5_IND_TBL_TYPE_IBV : MLX5_IND_TBL_TYPE_DEVX;
+ ind_tbl = mlx5_ind_table_obj_new(dev, queues, queues_n, type);
+ }
+ if (!ind_tbl) {
+ rte_errno = ENOMEM;
+ return 0;
+ }
+ if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ struct mlx5dv_qp_init_attr qp_init_attr;
+
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ if (tunnel) {
+ qp_init_attr.comp_mask =
+ MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+ qp_init_attr.create_flags =
+ MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
+ }
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (dev->data->dev_conf.lpbk_mode) {
+ /*
+ * Allow packet sent from NIC loop back
+ * w/o source MAC check.
+ */
+ qp_init_attr.comp_mask |=
+ MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+ qp_init_attr.create_flags |=
+ MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
+ }
+#endif
+ qp = mlx5_glue->dv_create_qp
+ (priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask =
+ IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function =
+ IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = rss_key_len,
+ .rx_hash_key =
+ (void *)(uintptr_t)rss_key,
+ .rx_hash_fields_mask = hash_fields,
+ },
+ .rwq_ind_tbl = ind_tbl->ind_table,
+ .pd = priv->sh->pd,
+ },
+ &qp_init_attr);
+#else
+ qp = mlx5_glue->create_qp_ex
+ (priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask =
+ IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function =
+ IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = rss_key_len,
+ .rx_hash_key =
+ (void *)(uintptr_t)rss_key,
+ .rx_hash_fields_mask = hash_fields,
+ },
+ .rwq_ind_tbl = ind_tbl->ind_table,
+ .pd = priv->sh->pd,
+ });
+#endif
+ if (!qp) {
+ rte_errno = errno;
+ goto error;
+ }
+ } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
+ struct mlx5_devx_tir_attr tir_attr;
+ uint32_t i;
+ uint32_t lro = 1;
+
+		/* Enable TIR LRO only if all the queues were configured for LRO. */
+ for (i = 0; i < queues_n; ++i) {
+ if (!(*priv->rxqs)[queues[i]]->lro) {
+ lro = 0;
+ break;
+ }
+ }
+ memset(&tir_attr, 0, sizeof(tir_attr));
+ tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT;
+ tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ;
+ tir_attr.tunneled_offload_en = !!tunnel;
+ /* If needed, translate hash_fields bitmap to PRM format. */
+ if (hash_fields) {
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ struct mlx5_rx_hash_field_select *rx_hash_field_select =
+ hash_fields & IBV_RX_HASH_INNER ?
+ &tir_attr.rx_hash_field_selector_inner :
+ &tir_attr.rx_hash_field_selector_outer;
+#else
+ struct mlx5_rx_hash_field_select *rx_hash_field_select =
+ &tir_attr.rx_hash_field_selector_outer;
+#endif
+
+ /* 1 bit: 0: IPv4, 1: IPv6. */
+ rx_hash_field_select->l3_prot_type =
+ !!(hash_fields & MLX5_IPV6_IBV_RX_HASH);
+ /* 1 bit: 0: TCP, 1: UDP. */
+ rx_hash_field_select->l4_prot_type =
+ !!(hash_fields & MLX5_UDP_IBV_RX_HASH);
+ /* Bitmask which sets which fields to use in RX Hash. */
+ rx_hash_field_select->selected_fields =
+ ((!!(hash_fields & MLX5_L3_SRC_IBV_RX_HASH)) <<
+ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) |
+ (!!(hash_fields & MLX5_L3_DST_IBV_RX_HASH)) <<
+ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP |
+ (!!(hash_fields & MLX5_L4_SRC_IBV_RX_HASH)) <<
+ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT |
+ (!!(hash_fields & MLX5_L4_DST_IBV_RX_HASH)) <<
+ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT;
+ }
+ if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN)
+ tir_attr.transport_domain = priv->sh->td->id;
+ else
+ tir_attr.transport_domain = priv->sh->tdn;
+ memcpy(tir_attr.rx_hash_toeplitz_key, rss_key,
+ MLX5_RSS_HASH_KEY_LEN);
+ tir_attr.indirect_table = ind_tbl->rqt->id;
+ if (dev->data->dev_conf.lpbk_mode)
+ tir_attr.self_lb_block =
+ MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+ if (lro) {
+ tir_attr.lro_timeout_period_usecs =
+ priv->config.lro.timeout;
+ tir_attr.lro_max_msg_sz = priv->max_lro_msg_size;
+ tir_attr.lro_enable_mask =
+ MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+ MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO;
+ }
+ tir = mlx5_devx_cmd_create_tir(priv->sh->ctx, &tir_attr);
+ if (!tir) {
+ DRV_LOG(ERR, "port %u cannot create DevX TIR",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ }
+ hrxq = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_HRXQ], &hrxq_idx);
+ if (!hrxq)
+ goto error;
+ hrxq->ind_table = ind_tbl;
+ if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) {
+ hrxq->qp = qp;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ hrxq->action =
+ mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+ if (!hrxq->action) {
+ rte_errno = errno;
+ goto error;
+ }
+#endif
+ } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
+ hrxq->tir = tir;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ hrxq->action = mlx5_glue->dv_create_flow_action_dest_devx_tir
+ (hrxq->tir->obj);
+ if (!hrxq->action) {
+ rte_errno = errno;
+ goto error;
+ }
+#endif
+ }
+ hrxq->rss_key_len = rss_key_len;
+ hrxq->hash_fields = hash_fields;
+ memcpy(hrxq->rss_key, rss_key, rss_key_len);
+ rte_atomic32_inc(&hrxq->refcnt);
+ ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs, hrxq_idx,
+ hrxq, next);
+ return hrxq_idx;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ mlx5_ind_table_obj_release(dev, ind_tbl);
+ if (qp)
+ claim_zero(mlx5_glue->destroy_qp(qp));
+ else if (tir)
+ claim_zero(mlx5_devx_cmd_destroy(tir));
+ rte_errno = err; /* Restore rte_errno. */
+ return 0;
+}
+
+/**
+ * Get an Rx Hash queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param rss_key
+ *   RSS key for the Rx hash queue.
+ * @param rss_key_len
+ *   RSS key length.
+ * @param hash_fields
+ *   Verbs protocol hash field to make the RSS on.
+ * @param queues
+ *   Queues entering the hash Rx queue. When hash_fields is empty, only the
+ *   first queue index is used for the indirection table.
+ * @param queues_n
+ * Number of queues.
+ *
+ * @return
+ *   A hash Rx queue index on success, 0 if no matching queue is found.
+ */
+uint32_t
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+ const uint8_t *rss_key, uint32_t rss_key_len,
+ uint64_t hash_fields,
+ const uint16_t *queues, uint32_t queues_n)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq;
+ uint32_t idx;
+
+ queues_n = hash_fields ? queues_n : 1;
+ ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx,
+ hrxq, next) {
+ struct mlx5_ind_table_obj *ind_tbl;
+
+ if (hrxq->rss_key_len != rss_key_len)
+ continue;
+ if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
+ continue;
+ if (hrxq->hash_fields != hash_fields)
+ continue;
+ ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n);
+ if (!ind_tbl)
+ continue;
+ if (ind_tbl != hrxq->ind_table) {
+ mlx5_ind_table_obj_release(dev, ind_tbl);
+ continue;
+ }
+ rte_atomic32_inc(&hrxq->refcnt);
+ return idx;
+ }
+ return 0;
+}
+
+/**
+ * Release the hash Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param hrxq_idx
+ *   Index of the hash Rx queue to release.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+int
+mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hrxq_idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq;
+
+ hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx);
+ if (!hrxq)
+ return 0;
+ if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ mlx5_glue->destroy_flow_action(hrxq->action);
+#endif
+ if (hrxq->ind_table->type == MLX5_IND_TBL_TYPE_IBV)
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+ else /* hrxq->ind_table->type == MLX5_IND_TBL_TYPE_DEVX */
+ claim_zero(mlx5_devx_cmd_destroy(hrxq->tir));
+ mlx5_ind_table_obj_release(dev, hrxq->ind_table);
+ ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs,
+ hrxq_idx, hrxq, next);
+ mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx);
+ return 0;
+ }
+ claim_nonzero(mlx5_ind_table_obj_release(dev, hrxq->ind_table));
+ return 1;
+}
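+
+/*
+ * Illustrative lookup-or-create sketch (not part of the driver): the flow
+ * layer typically tries mlx5_hrxq_get() first and falls back to
+ * mlx5_hrxq_new() on a miss. Variable names below are placeholders.
+ *
+ *	hrxq_idx = mlx5_hrxq_get(dev, rss_key, MLX5_RSS_HASH_KEY_LEN,
+ *				 hash_fields, queues, queues_n);
+ *	if (!hrxq_idx)
+ *		hrxq_idx = mlx5_hrxq_new(dev, rss_key, MLX5_RSS_HASH_KEY_LEN,
+ *					 hash_fields, queues, queues_n,
+ *					 tunnel);
+ *	...
+ *	mlx5_hrxq_release(dev, hrxq_idx);
+ */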
+
+/**
+ * Verify the hash Rx queue list is empty.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ *   The number of objects not released.
+ */
+int
+mlx5_hrxq_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq;
+ uint32_t idx;
+ int ret = 0;
+
+ ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx,
+ hrxq, next) {
+ DRV_LOG(DEBUG,
+ "port %u hash Rx queue %p still referenced",
+ dev->data->port_id, (void *)hrxq);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Create a drop Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_drop_new(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct ibv_cq *cq;
+ struct ibv_wq *wq = NULL;
+ struct mlx5_rxq_obj *rxq;
+
+ if (priv->drop_queue.rxq)
+ return priv->drop_queue.rxq;
+ cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
+ if (!cq) {
+ DEBUG("port %u cannot allocate CQ for drop queue",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ wq = mlx5_glue->create_wq(ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = priv->sh->pd,
+ .cq = cq,
+ });
+ if (!wq) {
+ DEBUG("port %u cannot allocate WQ for drop queue",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0);
+ if (!rxq) {
+ DEBUG("port %u cannot allocate drop Rx queue memory",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ rxq->cq = cq;
+ rxq->wq = wq;
+ priv->drop_queue.rxq = rxq;
+ return rxq;
+error:
+ if (wq)
+ claim_zero(mlx5_glue->destroy_wq(wq));
+ if (cq)
+ claim_zero(mlx5_glue->destroy_cq(cq));
+ return NULL;
+}
+
+/**
+ * Release a drop Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_rxq_obj_drop_release(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq;
+
+ if (rxq->wq)
+ claim_zero(mlx5_glue->destroy_wq(rxq->wq));
+ if (rxq->cq)
+ claim_zero(mlx5_glue->destroy_cq(rxq->cq));
+ rte_free(rxq);
+ priv->drop_queue.rxq = NULL;
+}
+
+/**
+ * Create a drop indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_ind_table_obj *
+mlx5_ind_table_obj_drop_new(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl;
+ struct mlx5_rxq_obj *rxq;
+ struct mlx5_ind_table_obj tmpl;
+
+ rxq = mlx5_rxq_obj_drop_new(dev);
+ if (!rxq)
+ return NULL;
+ tmpl.ind_table = mlx5_glue->create_rwq_ind_table
+ (priv->sh->ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = 0,
+ .ind_tbl = &rxq->wq,
+ .comp_mask = 0,
+ });
+ if (!tmpl.ind_table) {
+ DEBUG("port %u cannot allocate indirection table for drop"
+ " queue",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0);
+ if (!ind_tbl) {
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ ind_tbl->ind_table = tmpl.ind_table;
+ return ind_tbl;
+error:
+ mlx5_rxq_obj_drop_release(dev);
+ return NULL;
+}
+
+/**
+ * Release a drop indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_ind_table_obj_drop_release(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl = priv->drop_queue.hrxq->ind_table;
+
+ claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table));
+ mlx5_rxq_obj_drop_release(dev);
+ rte_free(ind_tbl);
+ priv->drop_queue.hrxq->ind_table = NULL;
+}
+
+/**
+ * Create a drop Rx Hash queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_hrxq *
+mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ind_table_obj *ind_tbl = NULL;
+ struct ibv_qp *qp = NULL;
+ struct mlx5_hrxq *hrxq = NULL;
+
+ if (priv->drop_queue.hrxq) {
+ rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt);
+ return priv->drop_queue.hrxq;
+ }
+ hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
+ if (!hrxq) {
+ DRV_LOG(WARNING,
+ "port %u cannot allocate memory for drop queue",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ priv->drop_queue.hrxq = hrxq;
+ ind_tbl = mlx5_ind_table_obj_drop_new(dev);
+ if (!ind_tbl)
+ goto error;
+ hrxq->ind_table = ind_tbl;
+ qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask =
+ IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function =
+ IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
+ .rx_hash_key = rss_hash_default_key,
+ .rx_hash_fields_mask = 0,
+ },
+ .rwq_ind_tbl = ind_tbl->ind_table,
+ .pd = priv->sh->pd
+ });
+ if (!qp) {
+ DEBUG("port %u cannot allocate QP for drop queue",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ hrxq->qp = qp;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+ if (!hrxq->action) {
+ rte_errno = errno;
+ goto error;
+ }
+#endif
+ rte_atomic32_set(&hrxq->refcnt, 1);
+ return hrxq;
+error:
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (hrxq && hrxq->action)
+ mlx5_glue->destroy_flow_action(hrxq->action);
+#endif
+ if (qp)
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+ if (ind_tbl)
+ mlx5_ind_table_obj_drop_release(dev);
+ if (hrxq) {
+ priv->drop_queue.hrxq = NULL;
+ rte_free(hrxq);
+ }
+ return NULL;
+}
+
+/**
+ * Release a drop hash Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_hrxq_drop_release(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
+
+ if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ mlx5_glue->destroy_flow_action(hrxq->action);
+#endif
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+ mlx5_ind_table_obj_drop_release(dev);
+ rte_free(hrxq);
+ priv->drop_queue.hrxq = NULL;
+ }
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c
new file mode 100644
index 000000000..6a17a9a5d
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c
@@ -0,0 +1,5691 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015-2019 Mellanox Technologies, Ltd
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+#include <rte_ether.h>
+#include <rte_cycles.h>
+#include <rte_flow.h>
+
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+#include <mlx5_common.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_mr.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_autoconf.h"
+
+/* TX burst subroutines return codes. */
+enum mlx5_txcmp_code {
+ MLX5_TXCMP_CODE_EXIT = 0,
+ MLX5_TXCMP_CODE_ERROR,
+ MLX5_TXCMP_CODE_SINGLE,
+ MLX5_TXCMP_CODE_MULTI,
+ MLX5_TXCMP_CODE_TSO,
+ MLX5_TXCMP_CODE_EMPW,
+};
+
+/*
+ * These defines are used to configure the set of Tx burst routine options
+ * supported at compile time. Options that are not specified are optimized
+ * out, because the "if" conditions can be evaluated at compile time.
+ * The offloads with a bigger runtime check overhead (requiring more CPU
+ * cycles to skip) should have a bigger index - this is needed to select
+ * the best matching routine when there is no exact match and some
+ * offloads are not actually requested.
+ */
+#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/
+#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/
+#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/
+#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
+#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
+#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/
+#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
+#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
+#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/
+
+/* The most common offloads groups. */
+#define MLX5_TXOFF_CONFIG_NONE 0
+#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
+ MLX5_TXOFF_CONFIG_TSO | \
+ MLX5_TXOFF_CONFIG_SWP | \
+ MLX5_TXOFF_CONFIG_CSUM | \
+ MLX5_TXOFF_CONFIG_INLINE | \
+ MLX5_TXOFF_CONFIG_VLAN | \
+ MLX5_TXOFF_CONFIG_METADATA)
+
+#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
+
+#define MLX5_TXOFF_DECL(func, olx) \
+static uint16_t mlx5_tx_burst_##func(void *txq, \
+ struct rte_mbuf **pkts, \
+ uint16_t pkts_n) \
+{ \
+ return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
+ pkts, pkts_n, (olx)); \
+}
+
+#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
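+
+/*
+ * Expansion sketch (illustrative only, "full_empw" is a placeholder name):
+ * MLX5_TXOFF_DECL(full_empw, MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
+ * generates a thin wrapper whose constant "olx" argument lets the compiler
+ * strip the unused branches from the common template routine:
+ *
+ *	static uint16_t mlx5_tx_burst_full_empw(void *txq,
+ *						struct rte_mbuf **pkts,
+ *						uint16_t pkts_n)
+ *	{
+ *		return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq,
+ *					  pkts, pkts_n,
+ *					  (MLX5_TXOFF_CONFIG_FULL |
+ *					   MLX5_TXOFF_CONFIG_EMPW));
+ *	}
+ */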
+
+static __rte_always_inline uint32_t
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+
+static __rte_always_inline int
+mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+ uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
+
+static __rte_always_inline uint32_t
+rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
+
+static __rte_always_inline void
+rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
+ volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
+
+static __rte_always_inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n);
+
+static int
+mlx5_queue_state_modify(struct rte_eth_dev *dev,
+ struct mlx5_mp_arg_queue_state_modify *sm);
+
+static inline void
+mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t phcsum);
+
+static inline void
+mlx5_lro_update_hdr(uint8_t *restrict padd,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t len);
+
+uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
+ [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
+};
+
+uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
+uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
+
+uint64_t rte_net_mlx5_dynf_inline_mask;
+#define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
+
+/**
+ * Build a table to translate Rx completion flags to packet type.
+ *
+ * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
+ */
+void
+mlx5_set_ptype_table(void)
+{
+ unsigned int i;
+ uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
+
+ /* Last entry must not be overwritten, reserved for errored packet. */
+ for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
+ (*p)[i] = RTE_PTYPE_UNKNOWN;
+ /*
+ * The index to the array should have:
+ * bit[1:0] = l3_hdr_type
+ * bit[4:2] = l4_hdr_type
+ * bit[5] = ip_frag
+ * bit[6] = tunneled
+ * bit[7] = outer_l3_type
+ */
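+	/*
+	 * Worked example (derived from the layout above): index 0x46 has
+	 * bit[6] = 1 (tunneled), bit[4:2] = 1 (TCP) and bit[1:0] = 2 (IPv4),
+	 * so entry 0x46 below describes a tunneled packet with an outer IPv4
+	 * header and an inner IPv4/TCP header.
+	 */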
+ /* L2 */
+ (*p)[0x00] = RTE_PTYPE_L2_ETHER;
+ /* L3 */
+ (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG;
+ (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG;
+ /* Fragmented */
+ (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG;
+ (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG;
+ /* TCP */
+ (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ /* UDP */
+ (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP;
+ (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP;
+ /* Repeat with outer_l3_type being set. Just in case. */
+ (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG;
+ (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG;
+ (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG;
+ (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG;
+ (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP;
+ (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP;
+ (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP;
+ /* Tunneled - L3 */
+ (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
+ (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG;
+ (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG;
+ (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
+ (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG;
+ (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG;
+ /* Tunneled - Fragmented */
+ (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG;
+ (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG;
+ (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG;
+ (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG;
+ /* Tunneled - TCP */
+ (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP;
+ /* Tunneled - UDP */
+ (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP;
+ (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP;
+ (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP;
+ (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP;
+}
+
+/**
+ * Build a table to translate packet checksum offloads to the checksum type of Verbs.
+ */
+void
+mlx5_set_cksum_table(void)
+{
+ unsigned int i;
+ uint8_t v;
+
+ /*
+ * The index should have:
+ * bit[0] = PKT_TX_TCP_SEG
+ * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+ * bit[4] = PKT_TX_IP_CKSUM
+ * bit[8] = PKT_TX_OUTER_IP_CKSUM
+ * bit[9] = tunnel
+ */
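+	/*
+	 * Worked example (follows from the branches below): a non-tunneled
+	 * packet requesting IP and TCP checksum offload yields an index with
+	 * bit[4] and bit[2] set, so its table entry becomes
+	 * MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM.
+	 */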
+ for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
+ v = 0;
+ if (i & (1 << 9)) {
+ /* Tunneled packet. */
+ if (i & (1 << 8)) /* Outer IP. */
+ v |= MLX5_ETH_WQE_L3_CSUM;
+ if (i & (1 << 4)) /* Inner IP. */
+ v |= MLX5_ETH_WQE_L3_INNER_CSUM;
+ if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
+ v |= MLX5_ETH_WQE_L4_INNER_CSUM;
+ } else {
+ /* No tunnel. */
+ if (i & (1 << 4)) /* IP. */
+ v |= MLX5_ETH_WQE_L3_CSUM;
+ if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
+ v |= MLX5_ETH_WQE_L4_CSUM;
+ }
+ mlx5_cksum_table[i] = v;
+ }
+}
+
+/**
+ * Build a table to translate the packet type of the mbuf to the SWP type of Verbs.
+ */
+void
+mlx5_set_swp_types_table(void)
+{
+ unsigned int i;
+ uint8_t v;
+
+ /*
+ * The index should have:
+ * bit[0:1] = PKT_TX_L4_MASK
+ * bit[4] = PKT_TX_IPV6
+ * bit[8] = PKT_TX_OUTER_IPV6
+ * bit[9] = PKT_TX_OUTER_UDP
+ */
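+	/*
+	 * Worked example (follows from the branches below): an index with
+	 * bit[9] (outer UDP tunnel) and bit[4] (inner IPv6) set maps to
+	 * MLX5_ETH_WQE_L4_OUTER_UDP | MLX5_ETH_WQE_L3_INNER_IPV6.
+	 */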
+ for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
+ v = 0;
+ if (i & (1 << 8))
+ v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
+ if (i & (1 << 9))
+ v |= MLX5_ETH_WQE_L4_OUTER_UDP;
+ if (i & (1 << 4))
+ v |= MLX5_ETH_WQE_L3_INNER_IPV6;
+ if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
+ v |= MLX5_ETH_WQE_L4_INNER_UDP;
+ mlx5_swp_types_table[i] = v;
+ }
+}
+
+/**
+ * Set Software Parser flags and offsets in Ethernet Segment of WQE.
+ * Flags must be preliminary initialized to zero.
+ *
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param swp_flags
+ * Pointer to store Software Parser flags
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Software Parser offsets packed in dword.
+ * Software Parser flags are set by pointer.
+ */
+static __rte_always_inline uint32_t
+txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
+ uint8_t *swp_flags,
+ unsigned int olx)
+{
+ uint64_t ol, tunnel;
+ unsigned int idx, off;
+ uint32_t set;
+
+ if (!MLX5_TXOFF_CONFIG(SWP))
+ return 0;
+ ol = loc->mbuf->ol_flags;
+ tunnel = ol & PKT_TX_TUNNEL_MASK;
+ /*
+ * Check whether Software Parser is required.
+	 * Only customized tunnels may require it.
+ */
+ if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP))
+ return 0;
+ /*
+ * The index should have:
+ * bit[0:1] = PKT_TX_L4_MASK
+ * bit[4] = PKT_TX_IPV6
+ * bit[8] = PKT_TX_OUTER_IPV6
+ * bit[9] = PKT_TX_OUTER_UDP
+ */
+ idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52;
+ idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0;
+ *swp_flags = mlx5_swp_types_table[idx];
+ /*
+ * Set offsets for SW parser. Since ConnectX-5, SW parser just
+ * complements HW parser. SW parser starts to engage only if HW parser
+ * can't reach a header. For the older devices, HW parser will not kick
+	 * in if any of the SWP offsets is set. Therefore, all of the L3 offsets
+ * should be set regardless of HW offload.
+ */
+ off = loc->mbuf->outer_l2_len;
+ if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT)
+ off += sizeof(struct rte_vlan_hdr);
+ set = (off >> 1) << 8; /* Outer L3 offset. */
+ off += loc->mbuf->outer_l3_len;
+ if (tunnel == PKT_TX_TUNNEL_UDP)
+ set |= off >> 1; /* Outer L4 offset. */
+ if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */
+ const uint64_t csum = ol & PKT_TX_L4_MASK;
+ off += loc->mbuf->l2_len;
+ set |= (off >> 1) << 24; /* Inner L3 offset. */
+ if (csum == PKT_TX_TCP_CKSUM ||
+ csum == PKT_TX_UDP_CKSUM ||
+ (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) {
+ off += loc->mbuf->l3_len;
+ set |= (off >> 1) << 16; /* Inner L4 offset. */
+ }
+ }
+ set = rte_cpu_to_le_32(set);
+ return set;
+}
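+
+/*
+ * Layout note (derived from txq_mbuf_to_swp() above): the returned
+ * little-endian dword packs the Software Parser offsets, each expressed in
+ * units of two bytes, as
+ *	byte 0 - outer L4 offset / 2
+ *	byte 1 - outer L3 offset / 2
+ *	byte 2 - inner L4 offset / 2
+ *	byte 3 - inner L3 offset / 2
+ * with unused positions left as zero.
+ */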
+
+/**
+ * Convert the Checksum offloads to Verbs.
+ *
+ * @param buf
+ * Pointer to the mbuf.
+ *
+ * @return
+ * Converted checksum flags.
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+ uint32_t idx;
+ uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+ const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+ PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+ /*
+ * The index should have:
+ * bit[0] = PKT_TX_TCP_SEG
+ * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+ * bit[4] = PKT_TX_IP_CKSUM
+ * bit[8] = PKT_TX_OUTER_IP_CKSUM
+ * bit[9] = tunnel
+ */
+ idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+ return mlx5_cksum_table[idx];
+}
+
+/**
+ * Internal function to compute the number of used descriptors in an RX queue
+ *
+ * @param rxq
+ * The Rx queue.
+ *
+ * @return
+ *   The number of used Rx descriptors.
+ */
+static uint32_t
+rx_queue_count(struct mlx5_rxq_data *rxq)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ volatile struct mlx5_cqe *cqe;
+ const unsigned int cqe_n = (1 << rxq->cqe_n);
+ const unsigned int cqe_cnt = cqe_n - 1;
+ unsigned int cq_ci;
+ unsigned int used;
+
+ /* if we are processing a compressed cqe */
+ if (zip->ai) {
+ used = zip->cqe_cnt - zip->ca;
+ cq_ci = zip->cq_ci;
+ } else {
+ used = 0;
+ cq_ci = rxq->cq_ci;
+ }
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
+ int8_t op_own;
+ unsigned int n;
+
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
+ n = rte_be_to_cpu_32(cqe->byte_cnt);
+ else
+ n = 1;
+ cq_ci += n;
+ used += n;
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ }
+ used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
+ return used;
+}
+
+/**
+ * DPDK callback to check the status of an Rx descriptor.
+ *
+ * @param rx_queue
+ * The Rx queue.
+ * @param[in] offset
+ * The index of the descriptor in the ring.
+ *
+ * @return
+ *   The status of the Rx descriptor.
+ */
+int
+mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
+{
+ struct mlx5_rxq_data *rxq = rx_queue;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ if (offset >= (1 << rxq->elts_n)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (offset < rx_queue_count(rxq))
+ return RTE_ETH_RX_DESC_DONE;
+ return RTE_ETH_RX_DESC_AVAIL;
+}
+
+/**
+ * DPDK callback to get the RX queue information
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ *   Rx queue identifier.
+ *
+ * @param qinfo
+ * Pointer to the RX queue information structure.
+ *
+ * @return
+ * None.
+ */
+
+void
+mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+ struct rte_eth_rxq_info *qinfo)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (!rxq)
+ return;
+ qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+ rxq->mprq_mp : rxq->mp;
+ qinfo->conf.rx_thresh.pthresh = 0;
+ qinfo->conf.rx_thresh.hthresh = 0;
+ qinfo->conf.rx_thresh.wthresh = 0;
+ qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
+ qinfo->conf.rx_drop_en = 1;
+ qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
+ qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+ qinfo->scattered_rx = dev->data->scattered_rx;
+ qinfo->nb_desc = 1 << rxq->elts_n;
+}
+
+/**
+ * DPDK callback to get the RX packet burst mode information
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ *   Rx queue identifier.
+ *
+ * @param mode
+ *   Pointer to the burst mode information.
+ *
+ * @return
+ *   0 on success, -EINVAL on failure.
+ */
+
+int
+mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
+ uint16_t rx_queue_id __rte_unused,
+ struct rte_eth_burst_mode *mode)
+{
+ eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+
+ if (pkt_burst == mlx5_rx_burst) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
+ } else if (pkt_burst == mlx5_rx_burst_mprq) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
+ } else if (pkt_burst == mlx5_rx_burst_vec) {
+#if defined RTE_ARCH_X86_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
+#elif defined RTE_ARCH_ARM64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#else
+ return -EINVAL;
+#endif
+ } else {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to get the number of used descriptors in an RX queue
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ *   Rx queue identifier.
+ *
+ * @return
+ *   The number of used Rx descriptors,
+ *   -EINVAL if the queue is invalid.
+ */
+uint32_t
+mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq;
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ rxq = (*priv->rxqs)[rx_queue_id];
+ if (!rxq) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ return rx_queue_count(rxq);
+}
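+
+/*
+ * Application-side sketch (illustrative, outside the PMD): the descriptor
+ * status and queue count callbacks above are reached through the generic
+ * ethdev wrappers; port_id, queue_id and offset are placeholders.
+ *
+ *	int used = rte_eth_rx_queue_count(port_id, queue_id);
+ *	int status = rte_eth_rx_descriptor_status(port_id, queue_id, offset);
+ *
+ * RTE_ETH_RX_DESC_DONE then means the descriptor already holds a received
+ * packet, RTE_ETH_RX_DESC_AVAIL that it has not been filled yet.
+ */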
+
+#define MLX5_SYSTEM_LOG_DIR "/var/log"
+/**
+ * Dump debug information to log file.
+ *
+ * @param fname
+ * The file name.
+ * @param hex_title
+ * If not NULL this string is printed as a header to the output
+ * and the output will be in hexadecimal view.
+ * @param buf
+ * This is the buffer address to print out.
+ * @param hex_len
+ * The number of bytes to dump out.
+ */
+void
+mlx5_dump_debug_information(const char *fname, const char *hex_title,
+ const void *buf, unsigned int hex_len)
+{
+ FILE *fd;
+
+ MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
+ fd = fopen(path, "a+");
+ if (!fd) {
+ DRV_LOG(WARNING, "cannot open %s for debug dump", path);
+ MKSTR(path2, "./%s", fname);
+ fd = fopen(path2, "a+");
+ if (!fd) {
+ DRV_LOG(ERR, "cannot open %s for debug dump", path2);
+ return;
+ }
+ DRV_LOG(INFO, "New debug dump in file %s", path2);
+ } else {
+ DRV_LOG(INFO, "New debug dump in file %s", path);
+ }
+ if (hex_title)
+ rte_hexdump(fd, hex_title, buf, hex_len);
+ else
+ fprintf(fd, "%s", (const char *)buf);
+ fprintf(fd, "\n\n\n");
+ fclose(fd);
+}
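+
+/*
+ * Usage sketch (mirrors the error handlers further below, arguments are
+ * placeholders): a NULL hex_title dumps the buffer as plain text, a
+ * non-NULL title switches to a hex dump.
+ *
+ *	mlx5_dump_debug_information(name, NULL, err_str, 0);
+ *	mlx5_dump_debug_information(name, "MLX5 Error CQ:", cqes, cq_size);
+ */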
+
+/**
+ * Move QP from error state to running state and initialize indexes.
+ *
+ * @param txq_ctrl
+ * Pointer to TX queue control structure.
+ *
+ * @return
+ * 0 on success, else -1.
+ */
+static int
+tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ struct mlx5_mp_arg_queue_state_modify sm = {
+ .is_wq = 0,
+ .queue_id = txq_ctrl->txq.idx,
+ };
+
+ if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
+ return -1;
+ txq_ctrl->txq.wqe_ci = 0;
+ txq_ctrl->txq.wqe_pi = 0;
+ txq_ctrl->txq.elts_comp = 0;
+ return 0;
+}
+
+/* Return 1 if the error CQE is already signed, otherwise sign it and return 0. */
+static int
+check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
+{
+ static const uint8_t magic[] = "seen";
+ int ret = 1;
+ unsigned int i;
+
+ for (i = 0; i < sizeof(magic); ++i)
+ if (!ret || err_cqe->rsvd1[i] != magic[i]) {
+ ret = 0;
+ err_cqe->rsvd1[i] = magic[i];
+ }
+ return ret;
+}
+
+/**
+ * Handle error CQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param err_cqe
+ * Pointer to the error CQE.
+ *
+ * @return
+ *   Negative value if queue recovery failed, otherwise 0 when the error
+ *   completion entry is handled successfully.
+ */
+static int
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
+ volatile struct mlx5_err_cqe *err_cqe)
+{
+ if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
+ const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
+ int seen = check_err_cqe_seen(err_cqe);
+
+ if (!seen && txq_ctrl->dump_file_n <
+ txq_ctrl->priv->config.max_dump_files_num) {
+ MKSTR(err_str, "Unexpected CQE error syndrome "
+ "0x%02x CQN = %u SQN = %u wqe_counter = %u "
+ "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
+ txq->cqe_s, txq->qp_num_8s >> 8,
+ rte_be_to_cpu_16(err_cqe->wqe_counter),
+ txq->wqe_ci, txq->cq_ci);
+ MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
+ PORT_ID(txq_ctrl->priv), txq->idx,
+ txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
+ mlx5_dump_debug_information(name, NULL, err_str, 0);
+ mlx5_dump_debug_information(name, "MLX5 Error CQ:",
+ (const void *)((uintptr_t)
+ txq->cqes),
+ sizeof(*err_cqe) *
+ (1 << txq->cqe_n));
+ mlx5_dump_debug_information(name, "MLX5 Error SQ:",
+ (const void *)((uintptr_t)
+ txq->wqes),
+ MLX5_WQE_SIZE *
+ (1 << txq->wqe_n));
+ txq_ctrl->dump_file_n++;
+ }
+ if (!seen)
+ /*
+ * Count errors in WQEs units.
+ * Later it can be improved to count error packets,
+			 * for example, by parsing the SQ to find how many packets
+ * should be counted for each WQE.
+ */
+ txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
+ new_wqe_pi) & wqe_m;
+ if (tx_recover_qp(txq_ctrl)) {
+ /* Recovering failed - retry later on the same WQE. */
+ return -1;
+ }
+ /* Release all the remaining buffers. */
+ txq_free_elts(txq_ctrl);
+ }
+ return 0;
+}
+
+/**
+ * Translate RX completion flags to packet type.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ * @param[in] cqe
+ * Pointer to CQE.
+ *
+ * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
+ *
+ * @return
+ * Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+{
+ uint8_t idx;
+ uint8_t pinfo = cqe->pkt_info;
+ uint16_t ptype = cqe->hdr_type_etc;
+
+ /*
+ * The index to the array should have:
+ * bit[1:0] = l3_hdr_type
+ * bit[4:2] = l4_hdr_type
+ * bit[5] = ip_frag
+ * bit[6] = tunneled
+ * bit[7] = outer_l3_type
+ */
+ idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+ return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
+}
+
+/**
+ * Initialize Rx WQ and indexes.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ */
+void
+mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
+{
+ const unsigned int wqe_n = 1 << rxq->elts_n;
+ unsigned int i;
+
+ for (i = 0; (i != wqe_n); ++i) {
+ volatile struct mlx5_wqe_data_seg *scat;
+ uintptr_t addr;
+ uint32_t byte_count;
+
+ if (mlx5_rxq_mprq_enabled(rxq)) {
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
+
+ scat = &((volatile struct mlx5_wqe_mprq *)
+ rxq->wqes)[i].dseg;
+ addr = (uintptr_t)mlx5_mprq_buf_addr(buf,
+ 1 << rxq->strd_num_n);
+ byte_count = (1 << rxq->strd_sz_n) *
+ (1 << rxq->strd_num_n);
+ } else {
+ struct rte_mbuf *buf = (*rxq->elts)[i];
+
+ scat = &((volatile struct mlx5_wqe_data_seg *)
+ rxq->wqes)[i];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ byte_count = DATA_LEN(buf);
+ }
+ /* scat->addr must be able to store a pointer. */
+ MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx5_wqe_data_seg){
+ .addr = rte_cpu_to_be_64(addr),
+ .byte_count = rte_cpu_to_be_32(byte_count),
+ .lkey = mlx5_rx_addr2mr(rxq, addr),
+ };
+ }
+ rxq->consumed_strd = 0;
+ rxq->decompressed = 0;
+ rxq->rq_pi = 0;
+ rxq->zip = (struct rxq_zip){
+ .ai = 0,
+ };
+ /* Update doorbell counter. */
+ rxq->rq_ci = wqe_n >> rxq->sges_n;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+}
+
+/**
+ * Modify a Verbs/DevX queue state.
+ * This must be called from the primary process.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param sm
+ * State modify request parameters.
+ *
+ * @return
+ * 0 in case of success else non-zero value and rte_errno is set.
+ */
+int
+mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
+ const struct mlx5_mp_arg_queue_state_modify *sm)
+{
+ int ret;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (sm->is_wq) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+ struct ibv_wq_attr mod = {
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ .wq_state = sm->state,
+ };
+
+ ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
+ } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */
+ struct mlx5_devx_modify_rq_attr rq_attr;
+
+ memset(&rq_attr, 0, sizeof(rq_attr));
+ if (sm->state == IBV_WQS_RESET) {
+ rq_attr.rq_state = MLX5_RQC_STATE_ERR;
+ rq_attr.state = MLX5_RQC_STATE_RST;
+ } else if (sm->state == IBV_WQS_RDY) {
+ rq_attr.rq_state = MLX5_RQC_STATE_RST;
+ rq_attr.state = MLX5_RQC_STATE_RDY;
+ } else if (sm->state == IBV_WQS_ERR) {
+ rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+ rq_attr.state = MLX5_RQC_STATE_ERR;
+ }
+ ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq,
+ &rq_attr);
+ }
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s",
+ sm->state, strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ } else {
+ struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ struct ibv_qp_attr mod = {
+ .qp_state = IBV_QPS_RESET,
+ .port_num = (uint8_t)priv->ibv_port,
+ };
+ struct ibv_qp *qp = txq_ctrl->obj->qp;
+
+ ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
+ "%s", strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ mod.qp_state = IBV_QPS_INIT;
+ ret = mlx5_glue->modify_qp(qp, &mod,
+ (IBV_QP_STATE | IBV_QP_PORT));
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ mod.qp_state = IBV_QPS_RTR;
+ ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ mod.qp_state = IBV_QPS_RTS;
+ ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Modify a Verbs queue state.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param sm
+ * State modify request parameters.
+ *
+ * @return
+ * 0 in case of success else non-zero value.
+ */
+static int
+mlx5_queue_state_modify(struct rte_eth_dev *dev,
+ struct mlx5_mp_arg_queue_state_modify *sm)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret = 0;
+
+ switch (rte_eal_process_type()) {
+ case RTE_PROC_PRIMARY:
+ ret = mlx5_queue_state_modify_primary(dev, sm);
+ break;
+ case RTE_PROC_SECONDARY:
+ ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
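+
+/*
+ * Request sketch (illustrative, mirrors the Rx error recovery below):
+ * resetting an Rx WQ through the wrapper above.
+ *
+ *	struct mlx5_mp_arg_queue_state_modify sm = {
+ *		.is_wq = 1,
+ *		.queue_id = rxq->idx,
+ *		.state = IBV_WQS_RESET,
+ *	};
+ *
+ *	if (mlx5_queue_state_modify(dev, &sm))
+ *		return -1;
+ */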
+
+/**
+ * Handle an Rx error.
+ * The function moves the RQ state to RESET when the first error CQE is
+ * seen; the CQ is then drained by the caller's loop. When the CQ is empty,
+ * the function moves the RQ state back to ready and reinitializes the RQ.
+ * Identifying the next CQE and counting errors remain the caller's
+ * responsibility.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ * @param[in] vec
+ * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ.
+ * 0 when called from non-vectorized Rx burst.
+ *
+ * @return
+ * -1 in case of recovery error, otherwise the CQE status.
+ */
+int
+mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
+{
+ const uint16_t cqe_n = 1 << rxq->cqe_n;
+ const uint16_t cqe_mask = cqe_n - 1;
+ const unsigned int wqe_n = 1 << rxq->elts_n;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ union {
+ volatile struct mlx5_cqe *cqe;
+ volatile struct mlx5_err_cqe *err_cqe;
+ } u = {
+ .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
+ };
+ struct mlx5_mp_arg_queue_state_modify sm;
+ int ret;
+
+ switch (rxq->err_state) {
+ case MLX5_RXQ_ERR_STATE_NO_ERROR:
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_RESET:
+ sm.is_wq = 1;
+ sm.queue_id = rxq->idx;
+ sm.state = IBV_WQS_RESET;
+ if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
+ return -1;
+ if (rxq_ctrl->dump_file_n <
+ rxq_ctrl->priv->config.max_dump_files_num) {
+ MKSTR(err_str, "Unexpected CQE error syndrome "
+ "0x%02x CQN = %u RQN = %u wqe_counter = %u"
+ " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
+ rxq->cqn, rxq_ctrl->wqn,
+ rte_be_to_cpu_16(u.err_cqe->wqe_counter),
+ rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
+ MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
+ rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
+ mlx5_dump_debug_information(name, NULL, err_str, 0);
+ mlx5_dump_debug_information(name, "MLX5 Error CQ:",
+ (const void *)((uintptr_t)
+ rxq->cqes),
+ sizeof(*u.cqe) * cqe_n);
+ mlx5_dump_debug_information(name, "MLX5 Error RQ:",
+ (const void *)((uintptr_t)
+ rxq->wqes),
+ 16 * wqe_n);
+ rxq_ctrl->dump_file_n++;
+ }
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_READY:
+ ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
+ if (ret == MLX5_CQE_STATUS_HW_OWN) {
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ rte_cio_wmb();
+ /*
+ * The RQ consumer index must be zeroed while moving
+ * from RESET state to RDY state.
+ */
+ *rxq->rq_db = rte_cpu_to_be_32(0);
+ rte_cio_wmb();
+ sm.is_wq = 1;
+ sm.queue_id = rxq->idx;
+ sm.state = IBV_WQS_RDY;
+ if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv),
+ &sm))
+ return -1;
+ if (vec) {
+ const uint16_t q_mask = wqe_n - 1;
+ uint16_t elt_idx;
+ struct rte_mbuf **elt;
+ int i;
+ unsigned int n = wqe_n - (rxq->rq_ci -
+ rxq->rq_pi);
+
+ for (i = 0; i < (int)n; ++i) {
+ elt_idx = (rxq->rq_ci + i) & q_mask;
+ elt = &(*rxq->elts)[elt_idx];
+ *elt = rte_mbuf_raw_alloc(rxq->mp);
+ if (!*elt) {
+ for (i--; i >= 0; --i) {
+ elt_idx = (rxq->rq_ci +
+ i) & q_mask;
+ elt = &(*rxq->elts)
+ [elt_idx];
+ rte_pktmbuf_free_seg
+ (*elt);
+ }
+ return -1;
+ }
+ }
+ for (i = 0; i < (int)wqe_n; ++i) {
+ elt = &(*rxq->elts)[i];
+ DATA_LEN(*elt) =
+ (uint16_t)((*elt)->buf_len -
+ rte_pktmbuf_headroom(*elt));
+ }
+ /* Padding with a fake mbuf for vec Rx. */
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ (*rxq->elts)[wqe_n + i] =
+ &rxq->fake_mbuf;
+ }
+ mlx5_rxq_initialize(rxq);
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
+ }
+ return ret;
+ default:
+ return -1;
+ }
+}
+
+/**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param cqe
+ * CQE to process.
+ * @param[out] mcqe
+ * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
+ * written.
+ *
+ * @return
+ * 0 in case of empty CQE, otherwise the packet size in bytes.
+ */
+static inline int
+mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+ uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ uint16_t cqe_n = cqe_cnt + 1;
+ int len;
+ uint16_t idx, end;
+
+ do {
+ len = 0;
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)[zip->ca &
+ cqe_cnt].pkt_info);
+
+ len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+ *mcqe = &(*mc)[zip->ai & 7];
+ if ((++zip->ai & 7) == 0) {
+ /* Invalidate consumed CQEs */
+ idx = zip->ca;
+ end = zip->na;
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ /*
+ * Increment consumer index to skip the number
+ * of CQEs consumed. Hardware leaves holes in
+ * the CQ ring for software use.
+ */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ /* Invalidate the rest */
+ idx = zip->ca;
+ end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /*
+ * No compressed data, get next CQE and verify if it is
+ * compressed.
+ */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+ if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
+ rxq->err_state)) {
+ ret = mlx5_rx_err_handle(rxq, 0);
+ if (ret == MLX5_CQE_STATUS_HW_OWN ||
+ ret == -1)
+ return 0;
+ } else {
+ return 0;
+ }
+ }
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)
+ [rxq->cq_ci &
+ cqe_cnt].pkt_info);
+
+ /* Fix endianness. */
+ zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one
+ * returned by check_cqe64().
+ *
+ * If completion comprises several mini arrays,
+ * as a special case the second one is located
+ * 7 CQEs after the initial CQE instead of 8
+ * for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+ *mcqe = &(*mc)[0];
+ zip->ai = 1;
+ /* Prefetch all to be invalidated */
+ idx = zip->ca;
+ end = zip->cq_ci;
+ while (idx != end) {
+ rte_prefetch0(&(*rxq->cqes)[(idx) &
+ cqe_cnt]);
+ ++idx;
+ }
+ } else {
+ len = rte_be_to_cpu_32(cqe->byte_cnt);
+ }
+ }
+ if (unlikely(rxq->err_state)) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ ++rxq->stats.idropped;
+ } else {
+ return len;
+ }
+ } while (1);
+}
+
+/**
+ * Translate RX completion flags to offload flags.
+ *
+ * @param[in] cqe
+ * Pointer to CQE.
+ *
+ * @return
+ * Offload flags (ol_flags) for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
+{
+ uint32_t ol_flags = 0;
+ uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
+
+ ol_flags =
+ TRANSPOSE(flags,
+ MLX5_CQE_RX_L3_HDR_VALID,
+ PKT_RX_IP_CKSUM_GOOD) |
+ TRANSPOSE(flags,
+ MLX5_CQE_RX_L4_HDR_VALID,
+ PKT_RX_L4_CKSUM_GOOD);
+ return ol_flags;
+}
+
+/**
+ * Fill in mbuf fields from RX completion flags.
+ * Note that pkt->ol_flags should be initialized outside of this function.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param pkt
+ * mbuf to fill.
+ * @param cqe
+ * CQE to process.
+ * @param rss_hash_res
+ * Packet RSS Hash result.
+ */
+static inline void
+rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
+ volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+{
+ /* Update packet information. */
+ pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
+ if (rss_hash_res && rxq->rss_hash) {
+ pkt->hash.rss = rss_hash_res;
+ pkt->ol_flags |= PKT_RX_RSS_HASH;
+ }
+ if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
+ pkt->ol_flags |= PKT_RX_FDIR;
+ if (cqe->sop_drop_qpn !=
+ rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
+ uint32_t mark = cqe->sop_drop_qpn;
+
+ pkt->ol_flags |= PKT_RX_FDIR_ID;
+ pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+ }
+ }
+ if (rxq->dynf_meta && cqe->flow_table_metadata) {
+ pkt->ol_flags |= rxq->flow_meta_mask;
+ *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) =
+ cqe->flow_table_metadata;
+ }
+ if (rxq->csum)
+ pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
+ if (rxq->vlan_strip &&
+ (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
+ pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+ pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+ }
+ if (rxq->hw_timestamp) {
+ pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
+ pkt->ol_flags |= PKT_RX_TIMESTAMP;
+ }
+}
+
+/**
+ * DPDK callback for RX.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
+ const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
+ const unsigned int sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ volatile struct mlx5_cqe *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ unsigned int i = 0;
+ unsigned int rq_ci = rxq->rq_ci << sges_n;
+ int len = 0; /* keep its value across iterations. */
+
+ while (pkts_n) {
+ unsigned int idx = rq_ci & wqe_cnt;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
+ struct rte_mbuf *rep = (*rxq->elts)[idx];
+ volatile struct mlx5_mini_cqe8 *mcqe = NULL;
+ uint32_t rss_hash_res;
+
+ if (pkt)
+ NEXT(seg) = rep;
+ seg = rep;
+ rte_prefetch0(seg);
+ rte_prefetch0(cqe);
+ rte_prefetch0(wqe);
+ rep = rte_mbuf_raw_alloc(rxq->mp);
+ if (unlikely(rep == NULL)) {
+ ++rxq->stats.rx_nombuf;
+ if (!pkt) {
+ /*
+ * no buffers before we even started,
+ * bail out silently.
+ */
+ break;
+ }
+ while (pkt != seg) {
+ MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
+ rep = NEXT(pkt);
+ NEXT(pkt) = NULL;
+ NB_SEGS(pkt) = 1;
+ rte_mbuf_raw_free(pkt);
+ pkt = rep;
+ }
+ break;
+ }
+ if (!pkt) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
+ if (!len) {
+ rte_mbuf_raw_free(rep);
+ break;
+ }
+ pkt = seg;
+ MLX5_ASSERT(len >= (rxq->crc_present << 2));
+ pkt->ol_flags &= EXT_ATTACHED_MBUF;
+ /* If compressed, take hash result from mini-CQE. */
+ rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
+ cqe->rx_hash_res :
+ mcqe->rx_hash_result);
+ rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+ if (rxq->crc_present)
+ len -= RTE_ETHER_CRC_LEN;
+ PKT_LEN(pkt) = len;
+ if (cqe->lro_num_seg > 1) {
+ mlx5_lro_update_hdr
+ (rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
+ len);
+ pkt->ol_flags |= PKT_RX_LRO;
+ pkt->tso_segsz = len / cqe->lro_num_seg;
+ }
+ }
+ DATA_LEN(rep) = DATA_LEN(seg);
+ PKT_LEN(rep) = PKT_LEN(seg);
+ SET_DATA_OFF(rep, DATA_OFF(seg));
+ PORT(rep) = PORT(seg);
+ (*rxq->elts)[idx] = rep;
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
+ if (len > DATA_LEN(seg)) {
+ len -= DATA_LEN(seg);
+ ++NB_SEGS(pkt);
+ ++rq_ci;
+ continue;
+ }
+ DATA_LEN(seg) = len;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += PKT_LEN(pkt);
+#endif
+ /* Return packet. */
+ *(pkts++) = pkt;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
+ ++rq_ci;
+ rq_ci <<= sges_n;
+ }
+ if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
+ return 0;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci >> sges_n;
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+#endif
+ return i;
+}
+
+/**
+ * Update LRO packet TCP header.
+ * The HW LRO feature doesn't update the TCP header after coalescing the
+ * TCP segments, but supplies the information needed to fill it in SW via the CQE.
+ *
+ * @param tcp
+ * Pointer to the TCP header.
+ * @param cqe
+ *   Pointer to the completion entry.
+ * @param phcsum
+ * The L3 pseudo-header checksum.
+ */
+static inline void
+mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t phcsum)
+{
+ uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+ MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+ /*
+ * The HW calculates only the TCP payload checksum, need to complete
+ * the TCP header checksum and the L3 pseudo-header checksum.
+ */
+ uint32_t csum = phcsum + cqe->csum;
+
+ if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
+ l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
+ tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
+ tcp->recv_ack = cqe->lro_ack_seq_num;
+ tcp->rx_win = cqe->lro_tcp_win;
+ }
+ if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
+ tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
+ tcp->cksum = 0;
+ csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
+ csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
+ csum = (~csum) & 0xffff;
+ if (csum == 0)
+ csum = 0xffff;
+ tcp->cksum = csum;
+}
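+
+/*
+ * Worked example for the checksum folding above (values are arbitrary):
+ * with a 32-bit running sum of 0x1A0BC the fold gives 0x0001 + 0xA0BC =
+ * 0xA0BD, its one's complement is 0x5F42, which is non-zero and is stored
+ * as the TCP checksum as-is.
+ */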
+
+/**
+ * Update LRO packet headers.
+ * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
+ * TCP segments, but supplies the information needed to fill them in SW via the CQE.
+ *
+ * @param padd
+ * The packet address.
+ * @param cqe
+ *   Pointer to the completion entry.
+ * @param len
+ * The packet length.
+ */
+static inline void
+mlx5_lro_update_hdr(uint8_t *restrict padd,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t len)
+{
+ union {
+ struct rte_ether_hdr *eth;
+ struct rte_vlan_hdr *vlan;
+ struct rte_ipv4_hdr *ipv4;
+ struct rte_ipv6_hdr *ipv6;
+ struct rte_tcp_hdr *tcp;
+ uint8_t *hdr;
+ } h = {
+ .hdr = padd,
+ };
+ uint16_t proto = h.eth->ether_type;
+ uint32_t phcsum;
+
+ h.eth++;
+ while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
+ proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
+ proto = h.vlan->eth_proto;
+ h.vlan++;
+ }
+ if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
+ h.ipv4->time_to_live = cqe->lro_min_ttl;
+ h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
+ h.ipv4->hdr_checksum = 0;
+ h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
+ phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
+ h.ipv4++;
+ } else {
+ h.ipv6->hop_limits = cqe->lro_min_ttl;
+ h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
+ sizeof(*h.ipv6));
+ phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
+ h.ipv6++;
+ }
+ mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+}
+
+void
+mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
+{
+ struct mlx5_mprq_buf *buf = opaque;
+
+ if (rte_atomic16_read(&buf->refcnt) == 1) {
+ rte_mempool_put(buf->mp, buf);
+ } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
+ rte_atomic16_set(&buf->refcnt, 1);
+ rte_mempool_put(buf->mp, buf);
+ }
+}
+
+void
+mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
+{
+ mlx5_mprq_buf_free_cb(NULL, buf);
+}
+
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n)
+{
+ struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+ void *addr;
+
+ MLX5_ASSERT(rep != NULL);
+ /* Replace MPRQ buf. */
+ (*rxq->mprq_bufs)[rq_idx] = rep;
+ /* Replace WQE. */
+ addr = mlx5_mprq_buf_addr(rep, strd_n);
+ wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+ /* Stash a mbuf for next replacement. */
+ if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+ rxq->mprq_repl = rep;
+ else
+ rxq->mprq_repl = NULL;
+}
+
+/**
+ * DPDK callback for RX with Multi-Packet RQ support.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
+ const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
+ volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
+ unsigned int i = 0;
+ uint32_t rq_ci = rxq->rq_ci;
+ uint16_t consumed_strd = rxq->consumed_strd;
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
+
+ while (i < pkts_n) {
+ struct rte_mbuf *pkt;
+ void *addr;
+ int ret;
+ uint32_t len;
+ uint16_t strd_cnt;
+ uint16_t strd_idx;
+ uint32_t offset;
+ uint32_t byte_cnt;
+ int32_t hdrm_overlap;
+ volatile struct mlx5_mini_cqe8 *mcqe = NULL;
+ uint32_t rss_hash_res = 0;
+
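+		/*
+		 * All strides of the current MPRQ buffer have been
+		 * consumed, move on to the next buffer/WQE.
+		 */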
+ if (consumed_strd == strd_n) {
+ /* Replace WQE only if the buffer is still in use. */
+ if (rte_atomic16_read(&buf->refcnt) > 1) {
+ mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
+ /* Release the old buffer. */
+ mlx5_mprq_buf_free(buf);
+ } else if (unlikely(rxq->mprq_repl == NULL)) {
+ struct mlx5_mprq_buf *rep;
+
+ /*
+				 * The MPRQ mempool is currently out of buffers,
+				 * so memcpy is done regardless of the Rx packet
+				 * size. Retry the allocation to get back to
+				 * normal operation.
+ */
+ if (!rte_mempool_get(rxq->mprq_mp,
+ (void **)&rep))
+ rxq->mprq_repl = rep;
+ }
+ /* Advance to the next WQE. */
+ consumed_strd = 0;
+ ++rq_ci;
+ buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
+ }
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
+ ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
+ if (!ret)
+ break;
+ byte_cnt = ret;
+ strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+ MLX5_MPRQ_STRIDE_NUM_SHIFT;
+ MLX5_ASSERT(strd_cnt);
+ consumed_strd += strd_cnt;
+ if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
+ continue;
+ if (mcqe == NULL) {
+ rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+ strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
+ } else {
+ /* mini-CQE for MPRQ doesn't have hash result. */
+ strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
+ }
+ MLX5_ASSERT(strd_idx < strd_n);
+ MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
+ wq_mask));
+ pkt = rte_pktmbuf_alloc(rxq->mp);
+ if (unlikely(pkt == NULL)) {
+ ++rxq->stats.rx_nombuf;
+ break;
+ }
+ len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+ MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+ if (rxq->crc_present)
+ len -= RTE_ETHER_CRC_LEN;
+ offset = strd_idx * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
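+		/*
+		 * hdrm_overlap is positive when the packet data plus the
+		 * reserved mbuf headroom extends beyond the strides
+		 * consumed by this packet.
+		 */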
+ hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
+ /*
+ * Memcpy packets to the target mbuf if:
+ * - The size of packet is smaller than mprq_max_memcpy_len.
+ * - Out of buffer in the Mempool for Multi-Packet RQ.
+ * - The packet's stride overlaps a headroom and scatter is off.
+ */
+ if (len <= rxq->mprq_max_memcpy_len ||
+ rxq->mprq_repl == NULL ||
+ (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
+ if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
+ rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+ addr, len);
+ DATA_LEN(pkt) = len;
+ } else if (rxq->strd_scatter_en) {
+ struct rte_mbuf *prev = pkt;
+ uint32_t seg_len =
+ RTE_MIN(rte_pktmbuf_tailroom(pkt), len);
+ uint32_t rem_len = len - seg_len;
+
+ rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+ addr, seg_len);
+ DATA_LEN(pkt) = seg_len;
+ while (rem_len) {
+ struct rte_mbuf *next =
+ rte_pktmbuf_alloc(rxq->mp);
+
+ if (unlikely(next == NULL)) {
+ rte_pktmbuf_free(pkt);
+ ++rxq->stats.rx_nombuf;
+ goto out;
+ }
+ NEXT(prev) = next;
+ SET_DATA_OFF(next, 0);
+ addr = RTE_PTR_ADD(addr, seg_len);
+ seg_len = RTE_MIN
+ (rte_pktmbuf_tailroom(next),
+ rem_len);
+ rte_memcpy
+ (rte_pktmbuf_mtod(next, void *),
+ addr, seg_len);
+ DATA_LEN(next) = seg_len;
+ rem_len -= seg_len;
+ prev = next;
+ ++NB_SEGS(pkt);
+ }
+ } else {
+ rte_pktmbuf_free_seg(pkt);
+ ++rxq->stats.idropped;
+ continue;
+ }
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_cnt * strd_sz;
+ void *buf_addr;
+
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+ /*
+			 * The mlx5 device doesn't use the IOVA itself, but it
+			 * is necessary in case the Rx packet is transmitted
+			 * via a different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+ * attaching the stride to mbuf and more offload flags
+ * will be added below by calling rxq_cq_to_mbuf().
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room. */
+ SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
+ MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
+ MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=
+ len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
+ DATA_LEN(pkt) = len;
+ /*
+ * Copy the last fragment of a packet (up to headroom
+ * size bytes) in case there is a stride overlap with
+ * a next packet's headroom. Allocate a separate mbuf
+ * to store this fragment and link it. Scatter is on.
+ */
+ if (hdrm_overlap > 0) {
+ MLX5_ASSERT(rxq->strd_scatter_en);
+ struct rte_mbuf *seg =
+ rte_pktmbuf_alloc(rxq->mp);
+
+ if (unlikely(seg == NULL)) {
+ rte_pktmbuf_free_seg(pkt);
+ ++rxq->stats.rx_nombuf;
+ break;
+ }
+ SET_DATA_OFF(seg, 0);
+ rte_memcpy(rte_pktmbuf_mtod(seg, void *),
+ RTE_PTR_ADD(addr, len - hdrm_overlap),
+ hdrm_overlap);
+ DATA_LEN(seg) = hdrm_overlap;
+ DATA_LEN(pkt) = len - hdrm_overlap;
+ NEXT(pkt) = seg;
+ NB_SEGS(pkt) = 2;
+ }
+ }
+ rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+ if (cqe->lro_num_seg > 1) {
+ mlx5_lro_update_hdr(addr, cqe, len);
+ pkt->ol_flags |= PKT_RX_LRO;
+ pkt->tso_segsz = len / cqe->lro_num_seg;
+ }
+ PKT_LEN(pkt) = len;
+ PORT(pkt) = rxq->port_id;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += PKT_LEN(pkt);
+#endif
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++i;
+ }
+out:
+ /* Update the consumer indexes. */
+ rxq->consumed_strd = consumed_strd;
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+#endif
+ return i;
+}
+
+/**
+ * Dummy DPDK callback for TX.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+removed_tx_burst(void *dpdk_txq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+/**
+ * Dummy DPDK callback for RX.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+removed_rx_burst(void *dpdk_txq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+/*
+ * Vectorized Rx/Tx routines are not compiled in when the required vector
+ * instructions are not supported on the target architecture. The following
+ * weak null stubs are needed for linkage when the real implementations
+ * (e.g. mlx5_rxtx_vec_sse.c for x86) are not built.
+ */
+
+__rte_weak uint16_t
+mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ return 0;
+}
+
+__rte_weak int
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+__rte_weak int
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+/**
+ * Free the mbufs from the linear array of pointers.
+ *
+ * @param pkts
+ *   Pointer to the array of packets to be freed.
+ * @param pkts_n
+ * Number of packets to be freed.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ unsigned int olx __rte_unused)
+{
+ struct rte_mempool *pool = NULL;
+ struct rte_mbuf **p_free = NULL;
+ struct rte_mbuf *mbuf;
+ unsigned int n_free = 0;
+
+ /*
+ * The implemented algorithm eliminates
+ * copying pointers to temporary array
+ * for rte_mempool_put_bulk() calls.
+ */
+ MLX5_ASSERT(pkts);
+ MLX5_ASSERT(pkts_n);
+ for (;;) {
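+		/* Collect a run of mbufs belonging to the same mempool. */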
+ for (;;) {
+ /*
+ * Decrement mbuf reference counter, detach
+ * indirect and external buffers if needed.
+ */
+ mbuf = rte_pktmbuf_prefree_seg(*pkts);
+ if (likely(mbuf != NULL)) {
+ MLX5_ASSERT(mbuf == *pkts);
+ if (likely(n_free != 0)) {
+ if (unlikely(pool != mbuf->pool))
+ /* From different pool. */
+ break;
+ } else {
+ /* Start new scan array. */
+ pool = mbuf->pool;
+ p_free = pkts;
+ }
+ ++n_free;
+ ++pkts;
+ --pkts_n;
+ if (unlikely(pkts_n == 0)) {
+ mbuf = NULL;
+ break;
+ }
+ } else {
+ /*
+				 * This happens if the mbuf is still referenced.
+				 * We can't put it back to the pool, so skip it.
+ */
+ ++pkts;
+ --pkts_n;
+ if (unlikely(n_free != 0))
+					/* There is an accumulated array to free. */
+ break;
+ if (unlikely(pkts_n == 0))
+ /* Last mbuf, nothing to free. */
+ return;
+ }
+ }
+ for (;;) {
+ /*
+ * This loop is implemented to avoid multiple
+ * inlining of rte_mempool_put_bulk().
+ */
+ MLX5_ASSERT(pool);
+ MLX5_ASSERT(p_free);
+ MLX5_ASSERT(n_free);
+ /*
+ * Free the array of pre-freed mbufs
+ * belonging to the same memory pool.
+ */
+ rte_mempool_put_bulk(pool, (void *)p_free, n_free);
+ if (unlikely(mbuf != NULL)) {
+				/* There is a request to start a new scan. */
+ pool = mbuf->pool;
+ p_free = pkts++;
+ n_free = 1;
+ --pkts_n;
+ if (likely(pkts_n != 0))
+ break;
+ /*
+ * This is the last mbuf to be freed.
+ * Do one more loop iteration to complete.
+				 * This is a rare case of the last unique mbuf.
+ */
+ mbuf = NULL;
+ continue;
+ }
+ if (likely(pkts_n == 0))
+ return;
+ n_free = 0;
+ break;
+ }
+ }
+}
+
+/**
+ * Free the mbufs from the elts ring buffer up to the new tail.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tail
+ * Index in elts to free up to, becomes new elts tail.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+ uint16_t tail,
+ unsigned int olx __rte_unused)
+{
+ uint16_t n_elts = tail - txq->elts_tail;
+
+ MLX5_ASSERT(n_elts);
+ MLX5_ASSERT(n_elts <= txq->elts_s);
+ /*
+ * Implement a loop to support ring buffer wraparound
+ * with single inlining of mlx5_tx_free_mbuf().
+ */
+ do {
+ unsigned int part;
+
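+		/* Number of contiguous entries up to the ring wraparound. */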
+ part = txq->elts_s - (txq->elts_tail & txq->elts_m);
+ part = RTE_MIN(part, n_elts);
+ MLX5_ASSERT(part);
+ MLX5_ASSERT(part <= txq->elts_s);
+ mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
+ part, olx);
+ txq->elts_tail += part;
+ n_elts -= part;
+ } while (n_elts);
+}
+
+/**
+ * Store the mbuf being sent into elts ring buffer.
+ * On Tx completion these mbufs will be freed.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param pkts_n
+ * Number of packets to be stored.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ unsigned int olx __rte_unused)
+{
+ unsigned int part;
+ struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
+
+ MLX5_ASSERT(pkts);
+ MLX5_ASSERT(pkts_n);
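+	/* Contiguous space up to the end of the elts ring buffer. */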
+ part = txq->elts_s - (txq->elts_head & txq->elts_m);
+ MLX5_ASSERT(part);
+ MLX5_ASSERT(part <= txq->elts_s);
+ /* This code is a good candidate for vectorizing with SIMD. */
+ rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
+ (void *)pkts,
+ RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
+ txq->elts_head += pkts_n;
+ if (unlikely(part < pkts_n))
+ /* The copy is wrapping around the elts array. */
+ rte_memcpy((void *)elts, (void *)(pkts + part),
+ (pkts_n - part) * sizeof(struct rte_mbuf *));
+}
+
+/**
+ * Update completion queue consuming index via doorbell
+ * and flush the completed data buffers.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param last_cqe
+ *   Pointer to the last valid CQE; if not NULL, update txq->wqe_pi
+ *   and flush the completed data buffers.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
+ volatile struct mlx5_cqe *last_cqe,
+ unsigned int olx __rte_unused)
+{
+ if (likely(last_cqe != NULL)) {
+ uint16_t tail;
+
+ txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter);
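+		/*
+		 * Fetch the elts_head value that was saved in fcqs when
+		 * the completion was requested; it becomes the new tail.
+		 */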
+ tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m];
+ if (likely(tail != txq->elts_tail)) {
+ mlx5_tx_free_elts(txq, tail, olx);
+ MLX5_ASSERT(tail == txq->elts_tail);
+ }
+ }
+}
+
+/**
+ * Manage TX completions. This routine checks the CQ for
+ * arrived CQEs, deduces the last accomplished WQE in SQ,
+ * updates SQ producing index and frees all completed mbufs.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * NOTE: not inlined intentionally, it makes the tx_burst
+ * routine smaller, simpler and faster - per experiments.
+ */
+static void
+mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+ unsigned int olx __rte_unused)
+{
+ unsigned int count = MLX5_TX_COMP_MAX_CQE;
+ volatile struct mlx5_cqe *last_cqe = NULL;
+ bool ring_doorbell = false;
+ int ret;
+
+ static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value");
+ static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value");
+ do {
+ volatile struct mlx5_cqe *cqe;
+
+ cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+ ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
+ if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+ if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+ /* No new CQEs in completion queue. */
+ MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
+ break;
+ }
+ /*
+ * Some error occurred, try to restart.
+ * We have no barrier after WQE related Doorbell
+ * written, make sure all writes are completed
+ * here, before we might perform SQ reset.
+ */
+ rte_wmb();
+ ret = mlx5_tx_error_cqe_handle
+ (txq, (volatile struct mlx5_err_cqe *)cqe);
+ if (unlikely(ret < 0)) {
+ /*
+ * Some error occurred on queue error
+ * handling, we do not advance the index
+ * here, allowing to retry on next call.
+ */
+ return;
+ }
+ /*
+ * We are going to fetch all entries with
+ * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status.
+ * The send queue is supposed to be empty.
+ */
+ ring_doorbell = true;
+ ++txq->cq_ci;
+ txq->cq_pi = txq->cq_ci;
+ last_cqe = NULL;
+ continue;
+ }
+ /* Normal transmit completion. */
+ MLX5_ASSERT(txq->cq_ci != txq->cq_pi);
+ MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) ==
+ cqe->wqe_counter);
+ ring_doorbell = true;
+ ++txq->cq_ci;
+ last_cqe = cqe;
+ /*
+ * We have to restrict the amount of processed CQEs
+ * in one tx_burst routine call. The CQ may be large
+ * and many CQEs may be updated by the NIC in one
+ * transaction. Buffers freeing is time consuming,
+ * multiple iterations may introduce significant
+ * latency.
+ */
+ if (likely(--count == 0))
+ break;
+ } while (true);
+ if (likely(ring_doorbell)) {
+ /* Ring doorbell to notify hardware. */
+ rte_compiler_barrier();
+ *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
+ mlx5_tx_comp_flush(txq, last_cqe, olx);
+ }
+}
+
+/**
+ * Check if the completion request flag should be set in the last WQE.
+ * Both pushed mbufs and WQEs are monitored and the completion request
+ * flag is set if any of thresholds is reached.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ uint16_t head = txq->elts_head;
+ unsigned int part;
+
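+	/*
+	 * Account for packets already sent but not yet stored into elts
+	 * (the non-inline path copies mbufs to elts in bulk).
+	 */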
+ part = MLX5_TXOFF_CONFIG(INLINE) ?
+ 0 : loc->pkts_sent - loc->pkts_copy;
+ head += part;
+ if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
+ (MLX5_TXOFF_CONFIG(INLINE) &&
+ (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
+ volatile struct mlx5_wqe *last = loc->wqe_last;
+
+ MLX5_ASSERT(last);
+ txq->elts_comp = head;
+ if (MLX5_TXOFF_CONFIG(INLINE))
+ txq->wqe_comp = txq->wqe_ci;
+ /* Request unconditional completion on last WQE. */
+ last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+ MLX5_COMP_MODE_OFFSET);
+ /* Save elts_head in dedicated free on completion queue. */
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
+ (last->cseg.opcode >> 8) << 16;
+#else
+ txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
+#endif
+ /* A CQE slot must always be available. */
+ MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
+ }
+}
+
+/**
+ * DPDK callback to check the status of a tx descriptor.
+ *
+ * @param tx_queue
+ * The tx queue.
+ * @param[in] offset
+ * The index of the descriptor in the ring.
+ *
+ * @return
+ * The status of the tx descriptor.
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
+{
+ struct mlx5_txq_data *restrict txq = tx_queue;
+ uint16_t used;
+
+ mlx5_tx_handle_completion(txq, 0);
+ used = txq->elts_head - txq->elts_tail;
+ if (offset < used)
+ return RTE_ETH_TX_DESC_FULL;
+ return RTE_ETH_TX_DESC_DONE;
+}
+
+/**
+ * Build the Control Segment with specified opcode:
+ * - MLX5_OPCODE_SEND
+ * - MLX5_OPCODE_ENHANCED_MPSW
+ * - MLX5_OPCODE_TSO
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Control Segment.
+ * @param ds
+ * Supposed length of WQE in segments.
+ * @param opcode
+ * SQ WQE opcode to put into Control Segment.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc __rte_unused,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int ds,
+ unsigned int opcode,
+ unsigned int olx __rte_unused)
+{
+ struct mlx5_wqe_cseg *restrict cs = &wqe->cseg;
+
+ /* For legacy MPW replace the EMPW by TSO with modifier. */
+ if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
+ opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
+ cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
+ cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+ cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
+ MLX5_COMP_MODE_OFFSET);
+ cs->misc = RTE_BE32(0);
+}
+
+/**
+ * Build the Ethernet Segment without inlined data.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ /* Engage VLAN tag insertion feature if requested. */
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+ /*
+		 * We should get here only if the device supports
+		 * this feature correctly.
+ */
+ MLX5_ASSERT(txq->vlan_en);
+ es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+ loc->mbuf->vlan_tci);
+ } else {
+ es->inline_hdr = RTE_BE32(0);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with minimal inlined data
+ * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
+ * used to fill the gap in single WQEBB WQEs.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of the inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the remaining two bytes from the packet data. */
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+ *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+ } else {
+ /* Fill the gap in the title WQEBB with inline data. */
+ rte_mov16(pdst, psrc);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param inlen
+ * Length of data to inline (VLAN included, if any).
+ * @param tso
+ * TSO flag, set mss field from the packet.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment (aligned and wrapped around).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int inlen,
+ unsigned int tso,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+ unsigned int part;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ if (tso) {
+ csum <<= 24;
+ csum |= loc->mbuf->tso_segsz;
+ es->flags = rte_cpu_to_be_32(csum);
+ } else {
+ es->flags = rte_cpu_to_le_32(csum);
+ }
+ /*
+ * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of the inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+		/* Copy the remaining two bytes from the packet data. */
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+ *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ } else {
+ /* Fill the gap in the title WQEBB with inline data. */
+ rte_mov16(pdst, psrc);
+ psrc += sizeof(rte_v128u32_t);
+ }
+ pdst = (uint8_t *)(es + 2);
+ MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+ MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
+ inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+ if (!inlen) {
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+ return (struct mlx5_wqe_dseg *)pdst;
+ }
+ /*
+ * The WQEBB space availability is checked by caller.
+ * Here we should be aware of WQE ring buffer wraparound only.
+ */
+ part = (uint8_t *)txq->wqes_end - pdst;
+ part = RTE_MIN(part, inlen);
+ do {
+ rte_memcpy(pdst, psrc, part);
+ inlen -= part;
+ if (likely(!inlen)) {
+ /*
+ * If return value is not used by the caller
+ * the code below will be optimized out.
+ */
+ pdst += part;
+ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+ if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+ pdst = (uint8_t *)txq->wqes;
+ return (struct mlx5_wqe_dseg *)pdst;
+ }
+ pdst = (uint8_t *)txq->wqes;
+ psrc += part;
+ part = inlen;
+ } while (true);
+}
+
+/**
+ * Copy data from a chain of mbufs to the specified linear buffer.
+ * If the data from an mbuf is copied completely, this mbuf is freed.
+ * The local structure is used to keep the byte stream state.
+ *
+ * @param pdst
+ * Pointer to the destination linear buffer.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param len
+ * Length of data to be copied.
+ * @param must
+ * Length of data to be copied ignoring no inline hint.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ *   Number of actually copied data bytes. This is always greater than or
+ *   equal to the must parameter and might be less than len if the
+ *   no-inline hint flag is encountered.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int len,
+ unsigned int must,
+ unsigned int olx __rte_unused)
+{
+ struct rte_mbuf *mbuf;
+ unsigned int part, dlen, copy = 0;
+ uint8_t *psrc;
+
+ MLX5_ASSERT(len);
+ MLX5_ASSERT(must <= len);
+ do {
+ /* Allow zero length packets, must check first. */
+ dlen = rte_pktmbuf_data_len(loc->mbuf);
+ if (dlen <= loc->mbuf_off) {
+ /* Exhausted packet, just free. */
+ mbuf = loc->mbuf;
+ loc->mbuf = mbuf->next;
+ rte_pktmbuf_free_seg(mbuf);
+ loc->mbuf_off = 0;
+ MLX5_ASSERT(loc->mbuf_nseg > 1);
+ MLX5_ASSERT(loc->mbuf);
+ --loc->mbuf_nseg;
+ if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
+ unsigned int diff;
+
+ if (copy >= must) {
+ /*
+ * We already copied the minimal
+ * requested amount of data.
+ */
+ return copy;
+ }
+ diff = must - copy;
+ if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
+ /*
+ * Copy only the minimal required
+ * part of the data buffer.
+ */
+ len = diff;
+ }
+ }
+ continue;
+ }
+ dlen -= loc->mbuf_off;
+ psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
+ loc->mbuf_off);
+ part = RTE_MIN(len, dlen);
+ rte_memcpy(pdst, psrc, part);
+ copy += part;
+ loc->mbuf_off += part;
+ len -= part;
+ if (!len) {
+ if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+ loc->mbuf_off = 0;
+ /* Exhausted packet, just free. */
+ mbuf = loc->mbuf;
+ loc->mbuf = mbuf->next;
+ rte_pktmbuf_free_seg(mbuf);
+ loc->mbuf_off = 0;
+ MLX5_ASSERT(loc->mbuf_nseg >= 1);
+ --loc->mbuf_nseg;
+ }
+ return copy;
+ }
+ pdst += part;
+ } while (true);
+}
+
+/**
+ * Build the Ethernet Segment with inlined data from
+ * multi-segment packet. Checks the boundary of WQEBB
+ * and ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param inlen
+ * Length of data to inline (VLAN included, if any).
+ * @param tso
+ * TSO flag, set mss field from the packet.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment (aligned and
+ *   possibly NOT wrapped around - the caller should do
+ *   the wrapping check on its own).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int inlen,
+ unsigned int tso,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *pdst;
+ unsigned int part, tlen = 0;
+
+ /*
+ * Calculate and set check sum flags first, uint32_t field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ if (tso) {
+ csum <<= 24;
+ csum |= loc->mbuf->tso_segsz;
+ es->flags = rte_cpu_to_be_32(csum);
+ } else {
+ es->flags = rte_cpu_to_le_32(csum);
+ }
+ /*
+ * Calculate and set Software Parser offsets and flags.
+	 * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+ pdst = (uint8_t *)&es->inline_data;
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+		/* Implement VLAN tag insertion as part of the inline data. */
+ mlx5_tx_mseg_memcpy(pdst, loc,
+ 2 * RTE_ETHER_ADDR_LEN,
+ 2 * RTE_ETHER_ADDR_LEN, olx);
+ pdst += 2 * RTE_ETHER_ADDR_LEN;
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
+ }
+ MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
+ /*
+ * The WQEBB space availability is checked by caller.
+ * Here we should be aware of WQE ring buffer wraparound only.
+ */
+ part = (uint8_t *)txq->wqes_end - pdst;
+ part = RTE_MIN(part, inlen - tlen);
+ MLX5_ASSERT(part);
+ do {
+ unsigned int copy;
+
+ /*
+ * Copying may be interrupted inside the routine
+ * if run into no inline hint flag.
+ */
+ copy = tlen >= txq->inlen_mode ? 0 : (txq->inlen_mode - tlen);
+ copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
+ tlen += copy;
+ if (likely(inlen <= tlen) || copy < part) {
+ es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
+ pdst += copy;
+ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+ return (struct mlx5_wqe_dseg *)pdst;
+ }
+ pdst = (uint8_t *)txq->wqes;
+ part = inlen - tlen;
+ } while (true);
+}
+
+/**
+ * Build the Data Segment of pointer type.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param dseg
+ * Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ * Data buffer to point.
+ * @param len
+ * Data buffer length.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe_dseg *restrict dseg,
+ uint8_t *buf,
+ unsigned int len,
+ unsigned int olx __rte_unused)
+
+{
+ MLX5_ASSERT(len);
+ dseg->bcount = rte_cpu_to_be_32(len);
+ dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+ dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
+}
+
+/**
+ * Build the Data Segment of pointer type or inline
+ * if data length is less than buffer in minimal
+ * Data Segment size.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param dseg
+ * Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ * Data buffer to point.
+ * @param len
+ * Data buffer length.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe_dseg *restrict dseg,
+ uint8_t *buf,
+ unsigned int len,
+ unsigned int olx __rte_unused)
+
+{
+ uintptr_t dst, src;
+
+ MLX5_ASSERT(len);
+ if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
+ dseg->bcount = rte_cpu_to_be_32(len);
+ dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
+ dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
+
+ return;
+ }
+ dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+ /* Unrolled implementation of generic rte_memcpy. */
+ dst = (uintptr_t)&dseg->inline_data[0];
+ src = (uintptr_t)buf;
+ if (len & 0x08) {
+#ifdef RTE_ARCH_STRICT_ALIGN
+ MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
+ *(uint32_t *)dst = *(unaligned_uint32_t *)src;
+ dst += sizeof(uint32_t);
+ src += sizeof(uint32_t);
+ *(uint32_t *)dst = *(unaligned_uint32_t *)src;
+ dst += sizeof(uint32_t);
+ src += sizeof(uint32_t);
+#else
+ *(uint64_t *)dst = *(unaligned_uint64_t *)src;
+ dst += sizeof(uint64_t);
+ src += sizeof(uint64_t);
+#endif
+ }
+ if (len & 0x04) {
+ *(uint32_t *)dst = *(unaligned_uint32_t *)src;
+ dst += sizeof(uint32_t);
+ src += sizeof(uint32_t);
+ }
+ if (len & 0x02) {
+ *(uint16_t *)dst = *(unaligned_uint16_t *)src;
+ dst += sizeof(uint16_t);
+ src += sizeof(uint16_t);
+ }
+ if (len & 0x01)
+ *(uint8_t *)dst = *(uint8_t *)src;
+}
+
+/**
+ * Build the Data Segment of inlined data from single
+ * segment packet, no VLAN insertion.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param dseg
+ * Pointer to WQE to fill with built Data Segment.
+ * @param buf
+ * Data buffer to point.
+ * @param len
+ * Data buffer length.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment after inlined data.
+ * Ring buffer wraparound check is needed. We do not
+ * do it here because it may not be needed for the
+ * last packet in the eMPW session.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc __rte_unused,
+ struct mlx5_wqe_dseg *restrict dseg,
+ uint8_t *buf,
+ unsigned int len,
+ unsigned int olx __rte_unused)
+{
+ unsigned int part;
+ uint8_t *pdst;
+
+ if (!MLX5_TXOFF_CONFIG(MPW)) {
+ /* Store the descriptor byte counter for eMPW sessions. */
+ dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+ pdst = &dseg->inline_data[0];
+ } else {
+ /* The entire legacy MPW session counter is stored on close. */
+ pdst = (uint8_t *)dseg;
+ }
+ /*
+ * The WQEBB space availability is checked by caller.
+ * Here we should be aware of WQE ring buffer wraparound only.
+ */
+ part = (uint8_t *)txq->wqes_end - pdst;
+ part = RTE_MIN(part, len);
+ do {
+ rte_memcpy(pdst, buf, part);
+ len -= part;
+ if (likely(!len)) {
+ pdst += part;
+ if (!MLX5_TXOFF_CONFIG(MPW))
+ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+ /* Note: no final wraparound check here. */
+ return (struct mlx5_wqe_dseg *)pdst;
+ }
+ pdst = (uint8_t *)txq->wqes;
+ buf += part;
+ part = len;
+ } while (true);
+}
+
+/**
+ * Build the Data Segment of inlined data from single
+ * segment packet with VLAN insertion.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param dseg
+ *   Pointer to the dseg to fill with the built Data Segment.
+ * @param buf
+ * Data buffer to point.
+ * @param len
+ * Data buffer length.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment after inlined data.
+ * Ring buffer wraparound check is needed.
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc __rte_unused,
+ struct mlx5_wqe_dseg *restrict dseg,
+ uint8_t *buf,
+ unsigned int len,
+ unsigned int olx __rte_unused)
+
+{
+ unsigned int part;
+ uint8_t *pdst;
+
+ MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
+ static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
+ (2 * RTE_ETHER_ADDR_LEN),
+ "invalid Data Segment data size");
+ if (!MLX5_TXOFF_CONFIG(MPW)) {
+ /* Store the descriptor byte counter for eMPW sessions. */
+ dseg->bcount = rte_cpu_to_be_32
+ ((len + sizeof(struct rte_vlan_hdr)) |
+ MLX5_ETH_WQE_DATA_INLINE);
+ pdst = &dseg->inline_data[0];
+ } else {
+ /* The entire legacy MPW session counter is stored on close. */
+ pdst = (uint8_t *)dseg;
+ }
+ memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
+ buf += MLX5_DSEG_MIN_INLINE_SIZE;
+ pdst += MLX5_DSEG_MIN_INLINE_SIZE;
+ len -= MLX5_DSEG_MIN_INLINE_SIZE;
+ /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+ if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+ pdst = (uint8_t *)txq->wqes;
+ *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ /*
+ * The WQEBB space availability is checked by caller.
+ * Here we should be aware of WQE ring buffer wraparound only.
+ */
+ part = (uint8_t *)txq->wqes_end - pdst;
+ part = RTE_MIN(part, len);
+ do {
+ rte_memcpy(pdst, buf, part);
+ len -= part;
+ if (likely(!len)) {
+ pdst += part;
+ if (!MLX5_TXOFF_CONFIG(MPW))
+ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+ /* Note: no final wraparound check here. */
+ return (struct mlx5_wqe_dseg *)pdst;
+ }
+ pdst = (uint8_t *)txq->wqes;
+ buf += part;
+ part = len;
+ } while (true);
+}
+
+/**
+ * Build the Ethernet Segment with optionally inlined data with
+ * VLAN insertion and following Data Segments (if any) from
+ * multi-segment packet. Used by ordinary send and TSO.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet/Data Segments.
+ * @param vlan
+ * Length of VLAN header to insert, 0 means no VLAN insertion.
+ * @param inlen
+ * Data length to inline. For TSO this parameter specifies
+ *   Data length to inline. For TSO this parameter specifies the
+ *   exact value; for the ordinary send routine it can be aligned by
+ *   the caller to provide better WQE space saving and data buffer
+ *   start address alignment. This length includes the VLAN header
+ *   being inserted.
+ * @param tso
+ * Zero means ordinary send, inlined data can be extended,
+ * otherwise this is TSO, inlined data length is fixed.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Actual size of built WQE in segments.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int inlen,
+ unsigned int tso,
+ unsigned int olx __rte_unused)
+{
+ struct mlx5_wqe_dseg *restrict dseg;
+ unsigned int ds;
+
+ MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
+ loc->mbuf_nseg = NB_SEGS(loc->mbuf);
+ loc->mbuf_off = 0;
+
+ dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
+ if (!loc->mbuf_nseg)
+ goto dseg_done;
+ /*
+ * There are still some mbuf remaining, not inlined.
+ * The first mbuf may be partially inlined and we
+ * must process the possible non-zero data offset.
+ */
+ if (loc->mbuf_off) {
+ unsigned int dlen;
+ uint8_t *dptr;
+
+ /*
+		 * Exhausted packets must have been dropped before.
+		 * A non-zero offset means there is some data
+		 * remaining in the packet.
+ */
+ MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
+ MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
+ dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
+ loc->mbuf_off);
+ dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
+ /*
+ * Build the pointer/minimal data Data Segment.
+ * Do ring buffer wrapping check in advance.
+ */
+ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+ dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+ mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
+ /* Store the mbuf to be freed on completion. */
+ MLX5_ASSERT(loc->elts_free);
+ txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+ --loc->elts_free;
+ ++dseg;
+ if (--loc->mbuf_nseg == 0)
+ goto dseg_done;
+ loc->mbuf = loc->mbuf->next;
+ loc->mbuf_off = 0;
+ }
+ do {
+ if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+ struct rte_mbuf *mbuf;
+
+ /* Zero length segment found, just skip. */
+ mbuf = loc->mbuf;
+ loc->mbuf = loc->mbuf->next;
+ rte_pktmbuf_free_seg(mbuf);
+ if (--loc->mbuf_nseg == 0)
+ break;
+ } else {
+ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+ dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+ mlx5_tx_dseg_iptr
+ (txq, loc, dseg,
+ rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+ rte_pktmbuf_data_len(loc->mbuf), olx);
+ MLX5_ASSERT(loc->elts_free);
+ txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+ --loc->elts_free;
+ ++dseg;
+ if (--loc->mbuf_nseg == 0)
+ break;
+ loc->mbuf = loc->mbuf->next;
+ }
+ } while (true);
+
+dseg_done:
+ /* Calculate actual segments used from the dseg pointer. */
+ if ((uintptr_t)wqe < (uintptr_t)dseg)
+ ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
+ else
+ ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
+ txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
+ return ds;
+}
+
+/**
+ * Tx one packet function for multi-segment TSO. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
+ * sends one packet per WQE.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ struct mlx5_wqe *restrict wqe;
+ unsigned int ds, dlen, inlen, ntcp, vlan = 0;
+
+ /*
+ * Calculate data length to be inlined to estimate
+ * the required space in WQE ring buffer.
+ */
+ dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ vlan = sizeof(struct rte_vlan_hdr);
+ inlen = loc->mbuf->l2_len + vlan +
+ loc->mbuf->l3_len + loc->mbuf->l4_len;
+ if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
+ return MLX5_TXCMP_CODE_ERROR;
+ if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+ inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
+ /* Packet must contain all TSO headers. */
+ if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
+ inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+ inlen > (dlen + vlan)))
+ return MLX5_TXCMP_CODE_ERROR;
+ MLX5_ASSERT(inlen >= txq->inlen_mode);
+ /*
+ * Check whether there are enough free WQEBBs:
+ * - Control Segment
+ * - Ethernet Segment
+ * - First Segment of inlined Ethernet data
+ * - ... data continued ...
+ * - Data Segments of pointer/min inline type
+ */
+ ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+ MLX5_ESEG_MIN_INLINE_SIZE +
+ MLX5_WSEG_SIZE +
+ MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+ if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_EXIT;
+ /* Check for maximal WQE size. */
+ if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes/packets counters. */
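+	/* Number of TSO segments: payload length divided by MSS, rounded up. */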
+ ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
+ loc->mbuf->tso_segsz;
+ /*
+	 * One packet will be added for the mbuf itself at the
+	 * end of mlx5_tx_burst() from the loc->pkts_sent field.
+ */
+ --ntcp;
+ txq->stats.opackets += ntcp;
+ txq->stats.obytes += dlen + vlan + ntcp * inlen;
+#endif
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
+ ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
+ wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+ return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, without any data inlining in
+ * Ethernet Segment.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ struct mlx5_wqe_dseg *restrict dseg;
+ struct mlx5_wqe *restrict wqe;
+ unsigned int ds, nseg;
+
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+ /*
+	 * No inlining at all, which means saving CPU cycles was
+	 * prioritized at configuration time, so we should not
+	 * copy any packet data to the WQE.
+ */
+ nseg = NB_SEGS(loc->mbuf);
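+	/* Control and Ethernet Segments plus one Data Segment per mbuf segment. */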
+ ds = 2 + nseg;
+ if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_EXIT;
+ /* Check for maximal WQE size. */
+ if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_ERROR;
+ /*
+ * Some Tx offloads may cause an error if
+ * packet is not long enough, check against
+ * assumed minimal length.
+ */
+ if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
+ return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ txq->stats.obytes += sizeof(struct rte_vlan_hdr);
+#endif
+ /*
+ * SEND WQE, one WQEBB:
+ * - Control Segment, SEND opcode
+ * - Ethernet Segment, optional VLAN, no inline
+ * - Data Segments, pointer only type
+ */
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
+ mlx5_tx_eseg_none(txq, loc, wqe, olx);
+ dseg = &wqe->dseg[0];
+ do {
+ if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
+ struct rte_mbuf *mbuf;
+
+ /*
+			 * Zero length segment found, we have to
+			 * correct the total size of the WQE in segments.
+			 * It is supposed to be a rare occasion, so in
+			 * the normal case (no zero length segments)
+			 * we avoid the extra write to the Control
+			 * Segment.
+ */
+ --ds;
+ wqe->cseg.sq_ds -= RTE_BE32(1);
+ mbuf = loc->mbuf;
+ loc->mbuf = mbuf->next;
+ rte_pktmbuf_free_seg(mbuf);
+ if (--nseg == 0)
+ break;
+ } else {
+ mlx5_tx_dseg_ptr
+ (txq, loc, dseg,
+ rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+ rte_pktmbuf_data_len(loc->mbuf), olx);
+ txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+ --loc->elts_free;
+ if (--nseg == 0)
+ break;
+ ++dseg;
+ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+ dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+ loc->mbuf = loc->mbuf->next;
+ }
+ } while (true);
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+ return MLX5_TXCMP_CODE_MULTI;
+}
+
+/**
+ * Tx one packet function for multi-segment SEND. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
+ * sends one packet per WQE, with data inlining in
+ * Ethernet Segment and minimal Data Segments.
+ *
+ * This routine is responsible for storing processed mbuf
+ * into elts ring buffer and update elts_head.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ struct mlx5_wqe *restrict wqe;
+ unsigned int ds, inlen, dlen, vlan = 0;
+
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+ /*
+ * First calculate data length to be inlined
+ * to estimate the required space for WQE.
+ */
+ dlen = rte_pktmbuf_pkt_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ vlan = sizeof(struct rte_vlan_hdr);
+ inlen = dlen + vlan;
+ /* Check against minimal length. */
+ if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+ return MLX5_TXCMP_CODE_ERROR;
+ MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+ if (inlen > txq->inlen_send ||
+ loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
+ struct rte_mbuf *mbuf;
+ unsigned int nxlen;
+ uintptr_t start;
+
+ /*
+ * Packet length exceeds the allowed inline
+ * data length, check whether the minimal
+ * inlining is required.
+ */
+ if (txq->inlen_mode) {
+ MLX5_ASSERT(txq->inlen_mode >=
+ MLX5_ESEG_MIN_INLINE_SIZE);
+ MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
+ inlen = txq->inlen_mode;
+ } else {
+ if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE ||
+ !vlan || txq->vlan_en) {
+ /*
+				 * VLAN insertion will be done by the HW.
+				 * It is not the most efficient path - the VLAN
+				 * flag is checked twice - but we must handle
+				 * the inlining length correctly and take into
+				 * account the VLAN header being inserted.
+ */
+ return mlx5_tx_packet_multi_send
+ (txq, loc, olx);
+ }
+ inlen = MLX5_ESEG_MIN_INLINE_SIZE;
+ }
+ /*
+		 * Now we know the minimal amount of data requested
+		 * to inline. Check whether we should inline the buffers
+		 * from the beginning of the chain to eliminate some mbufs.
+ */
+ mbuf = loc->mbuf;
+ nxlen = rte_pktmbuf_data_len(mbuf);
+ if (unlikely(nxlen <= txq->inlen_send)) {
+ /* We can inline first mbuf at least. */
+ if (nxlen < inlen) {
+ unsigned int smlen;
+
+ /* Scan mbufs till inlen filled. */
+ do {
+ smlen = nxlen;
+ mbuf = NEXT(mbuf);
+ MLX5_ASSERT(mbuf);
+ nxlen = rte_pktmbuf_data_len(mbuf);
+ nxlen += smlen;
+ } while (unlikely(nxlen < inlen));
+ if (unlikely(nxlen > txq->inlen_send)) {
+ /* We cannot inline entire mbuf. */
+ smlen = inlen - smlen;
+ start = rte_pktmbuf_mtod_offset
+ (mbuf, uintptr_t, smlen);
+ goto do_align;
+ }
+ }
+ do {
+ inlen = nxlen;
+ mbuf = NEXT(mbuf);
+				/* This should not be the end of the packet. */
+ MLX5_ASSERT(mbuf);
+ nxlen = inlen + rte_pktmbuf_data_len(mbuf);
+ } while (unlikely(nxlen < txq->inlen_send));
+ }
+ start = rte_pktmbuf_mtod(mbuf, uintptr_t);
+ /*
+ * Check whether we can do inline to align start
+ * address of data buffer to cacheline.
+ */
+do_align:
+ start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
+ if (unlikely(start)) {
+ start += inlen;
+ if (start <= txq->inlen_send)
+ inlen = start;
+ }
+ }
+ /*
+ * Check whether there are enough free WQEBBs:
+ * - Control Segment
+ * - Ethernet Segment
+ * - First Segment of inlined Ethernet data
+ * - ... data continued ...
+ * - Data Segments of pointer/min inline type
+ *
+	 * Estimate the number of Data Segments conservatively,
+	 * supposing that no mbufs are freed during inlining.
+ */
+ MLX5_ASSERT(inlen <= txq->inlen_send);
+ ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
+ MLX5_ESEG_MIN_INLINE_SIZE +
+ MLX5_WSEG_SIZE +
+ MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+ if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_EXIT;
+ /* Check for maximal WQE size. */
+ if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
+ return MLX5_TXCMP_CODE_ERROR;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes/packets counters. */
+ txq->stats.obytes += dlen + vlan;
+#endif
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
+ ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
+ wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+ return MLX5_TXCMP_CODE_MULTI;
+}
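+
+/*
+ * Worked example for the conservative Data Segment estimate above,
+ * assuming MLX5_WSEG_SIZE = 16 and MLX5_ESEG_MIN_INLINE_SIZE = 18
+ * (Ethernet + VLAN header, see the static_assert in this file) and a
+ * hypothetical 3-segment packet with inlen = 128 bytes to inline:
+ *
+ *	ds = 3 + 2 + (128 - 18 + 16 + 15) / 16 = 5 + 8 = 13 segments
+ *	WQEBBs = (13 + 3) / 4 = 4
+ *
+ * The leading NB_SEGS term reserves a pointer Data Segment per mbuf
+ * and the "+ 2" covers the Control and Ethernet Segments.
+ */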
+
+/**
+ * Tx burst function for multi-segment packets. Supports all
+ * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
+ * sends one packet per WQE. The function stops sending if it
+ * encounters a single-segment packet.
+ *
+ * This routine is responsible for storing the processed mbuf
+ * into the elts ring buffer and updating elts_head.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ MLX5_ASSERT(pkts_n > loc->pkts_sent);
+ pkts += loc->pkts_sent + 1;
+ pkts_n -= loc->pkts_sent;
+ for (;;) {
+ enum mlx5_txcmp_code ret;
+
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+ /*
+ * Estimate the number of free elts quickly but
+ * conservatively. Some segment may be fully inlined
+ * and freed, ignore this here - precise estimation
+ * is costly.
+ */
+ if (loc->elts_free < NB_SEGS(loc->mbuf))
+ return MLX5_TXCMP_CODE_EXIT;
+ if (MLX5_TXOFF_CONFIG(TSO) &&
+ unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+ /* Proceed with multi-segment TSO. */
+ ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
+ } else if (MLX5_TXOFF_CONFIG(INLINE)) {
+ /* Proceed with multi-segment SEND with inlining. */
+ ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
+ } else {
+ /* Proceed with multi-segment SEND w/o inlining. */
+ ret = mlx5_tx_packet_multi_send(txq, loc, olx);
+ }
+ if (ret == MLX5_TXCMP_CODE_EXIT)
+ return MLX5_TXCMP_CODE_EXIT;
+ if (ret == MLX5_TXCMP_CODE_ERROR)
+ return MLX5_TXCMP_CODE_ERROR;
+ /* WQE is built, go to the next packet. */
+ ++loc->pkts_sent;
+ --pkts_n;
+ if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ loc->mbuf = *pkts++;
+ if (pkts_n > 1)
+ rte_prefetch0(*pkts);
+ if (likely(NB_SEGS(loc->mbuf) > 1))
+ continue;
+ /* Here ends the series of multi-segment packets. */
+ if (MLX5_TXOFF_CONFIG(TSO) &&
+ unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+ return MLX5_TXCMP_CODE_TSO;
+ return MLX5_TXCMP_CODE_SINGLE;
+ }
+ MLX5_ASSERT(false);
+}
+
+/**
+ * Tx burst function for single-segment packets with TSO.
+ * Supports all types of Tx offloads, except multi-packets.
+ * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
+ * The function stops sending if it encounters a multi-segment
+ * packet or a packet without TSO requested.
+ *
+ * The routine is responsible for storing the processed mbuf
+ * into the elts ring buffer and updating elts_head if the inline
+ * offload is requested, due to possible early freeing of the
+ * inlined mbufs (the pkts array cannot be stored in elts
+ * as a batch).
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
+ * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * Local context variables updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ MLX5_ASSERT(pkts_n > loc->pkts_sent);
+ pkts += loc->pkts_sent + 1;
+ pkts_n -= loc->pkts_sent;
+ for (;;) {
+ struct mlx5_wqe_dseg *restrict dseg;
+ struct mlx5_wqe *restrict wqe;
+ unsigned int ds, dlen, hlen, ntcp, vlan = 0;
+ uint8_t *dptr;
+
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ dlen = rte_pktmbuf_data_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+ vlan = sizeof(struct rte_vlan_hdr);
+ }
+ /*
+ * First calculate the WQE size to check
+ * whether we have enough space in ring buffer.
+ */
+ hlen = loc->mbuf->l2_len + vlan +
+ loc->mbuf->l3_len + loc->mbuf->l4_len;
+ if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
+ return MLX5_TXCMP_CODE_ERROR;
+ if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK)
+ hlen += loc->mbuf->outer_l2_len +
+ loc->mbuf->outer_l3_len;
+ /* Segment must contain all TSO headers. */
+ if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
+ hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
+ hlen > (dlen + vlan)))
+ return MLX5_TXCMP_CODE_ERROR;
+ /*
+ * Check whether there are enough free WQEBBs:
+ * - Control Segment
+ * - Ethernet Segment
+ * - First Segment of inlined Ethernet data
+ * - ... data continued ...
+ * - Finishing Data Segment of pointer type
+ */
+ ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
+ MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+ if (loc->wqe_free < ((ds + 3) / 4))
+ return MLX5_TXCMP_CODE_EXIT;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes/packets counters. */
+ ntcp = (dlen + vlan - hlen +
+ loc->mbuf->tso_segsz - 1) /
+ loc->mbuf->tso_segsz;
+		/*
+		 * One packet will be added for the mbuf itself at the
+		 * end of mlx5_tx_burst from the loc->pkts_sent field.
+		 */
+ --ntcp;
+ txq->stats.opackets += ntcp;
+ txq->stats.obytes += dlen + vlan + ntcp * hlen;
+#endif
+ /*
+ * Build the TSO WQE:
+ * - Control Segment
+ * - Ethernet Segment with hlen bytes inlined
+ * - Data Segment of pointer type
+ */
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, ds,
+ MLX5_OPCODE_TSO, olx);
+ dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
+ dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
+ dlen -= hlen - vlan;
+ mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+ /*
+ * WQE is built, update the loop parameters
+ * and go to the next packet.
+ */
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+ if (MLX5_TXOFF_CONFIG(INLINE))
+ txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+ --loc->elts_free;
+ ++loc->pkts_sent;
+ --pkts_n;
+ if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ loc->mbuf = *pkts++;
+ if (pkts_n > 1)
+ rte_prefetch0(*pkts);
+ if (MLX5_TXOFF_CONFIG(MULTI) &&
+ unlikely(NB_SEGS(loc->mbuf) > 1))
+ return MLX5_TXCMP_CODE_MULTI;
+ if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)))
+ return MLX5_TXCMP_CODE_SINGLE;
+ /* Continue with the next TSO packet. */
+ }
+ MLX5_ASSERT(false);
+}
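+
+/*
+ * Worked example for the TSO accounting above, with purely illustrative
+ * numbers: dlen = 3000 bytes, no VLAN, hlen = 54 bytes of headers,
+ * tso_segsz = 1460:
+ *
+ *	ntcp = (3000 - 54 + 1459) / 1460 = 3, minus one for the mbuf
+ *	       counted later via loc->pkts_sent, i.e. 2 extra segments
+ *	obytes += 3000 + 2 * 54 = 3108 (replicated headers included)
+ *
+ * With MLX5_ESEG_MIN_INLINE_SIZE = 18 and MLX5_WSEG_SIZE = 16 the WQE
+ * size is ds = 4 + (54 - 18 + 15) / 16 = 7 segments, i.e. 2 WQEBBs.
+ */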
+
+/**
+ * Analyze the packet and select the best method to send.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ * @param newp
+ *   The predefined flag specifying whether to do the complete check
+ *   for multi-segment packets and TSO.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx,
+ bool newp)
+{
+ /* Check for multi-segment packet. */
+ if (newp &&
+ MLX5_TXOFF_CONFIG(MULTI) &&
+ unlikely(NB_SEGS(loc->mbuf) > 1))
+ return MLX5_TXCMP_CODE_MULTI;
+ /* Check for TSO packet. */
+ if (newp &&
+ MLX5_TXOFF_CONFIG(TSO) &&
+ unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+ return MLX5_TXCMP_CODE_TSO;
+ /* Check if eMPW is enabled at all. */
+ if (!MLX5_TXOFF_CONFIG(EMPW))
+ return MLX5_TXCMP_CODE_SINGLE;
+ /* Check if eMPW can be engaged. */
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
+ (!MLX5_TXOFF_CONFIG(INLINE) ||
+ unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+ sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
+		/*
+		 * eMPW does not support VLAN insertion offload, so we
+		 * would have to inline the entire packet, but the packet
+		 * is too long for inlining.
+		 */
+ return MLX5_TXCMP_CODE_SINGLE;
+ }
+ return MLX5_TXCMP_CODE_EMPW;
+}
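+
+/*
+ * Illustrative outcome of the check above, with assumed numbers: a
+ * single-segment packet of 1400 bytes with PKT_TX_VLAN_PKT set and
+ * txq->inlen_empw = 256 cannot have its VLAN tag inserted by eMPW and
+ * is too long to be fully inlined, so MLX5_TXCMP_CODE_SINGLE is
+ * returned and the packet falls back to the ordinary SEND path.
+ */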
+
+/**
+ * Check the next packet attributes to match with the eMPW batch ones.
+ * In addition, for legacy MPW the packet length is checked as well.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param es
+ * Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param dlen
+ * Length of previous packet in MPW descriptor.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * true - packet match with eMPW batch attributes.
+ * false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_wqe_eseg *restrict es,
+ struct mlx5_txq_local *restrict loc,
+ uint32_t dlen,
+ unsigned int olx)
+{
+ uint8_t swp_flags = 0;
+
+ /* Compare the checksum flags, if any. */
+ if (MLX5_TXOFF_CONFIG(CSUM) &&
+ txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+ return false;
+ /* Compare the Software Parser offsets and flags. */
+ if (MLX5_TXOFF_CONFIG(SWP) &&
+ (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
+ es->swp_flags != swp_flags))
+ return false;
+ /* Fill metadata field if needed. */
+ if (MLX5_TXOFF_CONFIG(METADATA) &&
+ es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
+ return false;
+	/* Legacy MPW can send packets with the same length only. */
+ if (MLX5_TXOFF_CONFIG(MPW) &&
+ dlen != rte_pktmbuf_data_len(loc->mbuf))
+ return false;
+ /* There must be no VLAN packets in eMPW loop. */
+ if (MLX5_TXOFF_CONFIG(VLAN))
+ MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+ return true;
+}
+
+/**
+ * Update send loop variables and WQE for eMPW loop
+ * without data inlining. The number of Data Segments is
+ * equal to the number of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of packets/Data Segments sent.
+ * @param slen
+ *   Accumulated statistics, bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int ds,
+ unsigned int slen,
+ unsigned int olx __rte_unused)
+{
+ MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += slen;
+#else
+ (void)slen;
+#endif
+ loc->elts_free -= ds;
+ loc->pkts_sent += ds;
+ ds += 2;
+ loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+}
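+
+/*
+ * Worked example for the accounting above, with an assumed batch of
+ * ds = 6 packets: each packet contributes one pointer Data Segment,
+ * plus the Control and Ethernet Segments of the eMPW title, giving
+ * sq_ds = 6 + 2 = 8 segments and wqe_ci advancing by (8 + 3) / 4 = 2
+ * WQEBBs.
+ */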
+
+/**
+ * Update send loop variables and WQE for eMPW loop
+ * with data inlining. Takes the total size of the descriptors
+ * and data pushed to the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes.
+ * @param slen
+ *   Accumulated statistics, data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int len,
+ unsigned int slen,
+ struct mlx5_wqe *restrict wqem,
+ unsigned int olx __rte_unused)
+{
+ struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += slen;
+#else
+ (void)slen;
+#endif
+ if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+ /*
+ * If the legacy MPW session contains the inline packets
+ * we should set the only inline data segment length
+ * and align the total length to the segment size.
+ */
+ MLX5_ASSERT(len > sizeof(dseg->bcount));
+ dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+ MLX5_ETH_WQE_DATA_INLINE);
+ len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+ } else {
+ /*
+ * The session is not legacy MPW or contains the
+ * data buffer pointer segments.
+ */
+ MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+ len = len / MLX5_WSEG_SIZE + 2;
+ }
+ wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+ txq->wqe_ci += (len + 3) / 4;
+ loc->wqe_free -= (len + 3) / 4;
+ loc->wqe_last = wqem;
+}
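+
+/*
+ * Worked example for the legacy MPW inline branch above, with an
+ * assumed len = 100 bytes (4-byte bcount header plus 96 bytes of
+ * inlined data): bcount is set to 96 | MLX5_ETH_WQE_DATA_INLINE and
+ * the total is rounded to (100 + 15) / 16 + 2 = 9 segments, i.e.
+ * (9 + 3) / 4 = 3 WQEBBs consumed.
+ */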
+
+/**
+ * The set of Tx burst functions for single-segment packets
+ * without TSO and with Multi-Packet Writing feature support.
+ * Supports all types of Tx offloads, except multi-packets
+ * and TSO.
+ *
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packets per WQE as it can. If eMPW is not configured
+ * or a packet cannot be sent with eMPW (VLAN insertion), the
+ * ordinary SEND opcode is used and only one packet is placed
+ * in a WQE.
+ *
+ * The functions stop sending if they encounter a multi-segment
+ * packet or a packet with TSO requested.
+ *
+ * The routines are responsible for storing processed mbufs
+ * into the elts ring buffer and updating elts_head if inlining
+ * offload is requested. Otherwise copying mbufs to elts can
+ * be postponed and completed at the end of the burst routine.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
+ * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ * MLX5_TXCMP_CODE_TSO - TSO packet encountered.
+ * MLX5_TXCMP_CODE_SINGLE - used inside functions set.
+ * MLX5_TXCMP_CODE_EMPW - used inside functions set.
+ *
+ * Local context variables updated.
+ *
+ *
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * without inlining, this is dedicated optimized branch.
+ * No VLAN insertion is supported.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ /*
+ * Subroutine is the part of mlx5_tx_burst_single()
+ * and sends single-segment packet with eMPW opcode
+ * without data inlining.
+ */
+ MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ MLX5_ASSERT(pkts_n > loc->pkts_sent);
+ static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+ pkts += loc->pkts_sent + 1;
+ pkts_n -= loc->pkts_sent;
+ for (;;) {
+ struct mlx5_wqe_dseg *restrict dseg;
+ struct mlx5_wqe_eseg *restrict eseg;
+ enum mlx5_txcmp_code ret;
+ unsigned int part, loop;
+ unsigned int slen = 0;
+
+next_empw:
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
+ MLX5_MPW_MAX_PACKETS :
+ MLX5_EMPW_MAX_PACKETS);
+ if (unlikely(loc->elts_free < part)) {
+			/* We do not have enough elts to store all mbufs. */
+ if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
+ return MLX5_TXCMP_CODE_EXIT;
+			/* But we are still able to send at least a minimal eMPW. */
+ part = loc->elts_free;
+ }
+ /* Check whether we have enough WQEs */
+ if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
+ if (unlikely(loc->wqe_free <
+ ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+ return MLX5_TXCMP_CODE_EXIT;
+ part = (loc->wqe_free * 4) - 2;
+ }
+ if (likely(part > 1))
+ rte_prefetch0(*pkts);
+ loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ /*
+ * Build eMPW title WQEBB:
+ * - Control Segment, eMPW opcode
+ * - Ethernet Segment, no inline
+ */
+ mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
+ MLX5_OPCODE_ENHANCED_MPSW, olx);
+ mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+ olx & ~MLX5_TXOFF_CONFIG_VLAN);
+ eseg = &loc->wqe_last->eseg;
+ dseg = &loc->wqe_last->dseg[0];
+ loop = part;
+ /* Store the packet length for legacy MPW. */
+ if (MLX5_TXOFF_CONFIG(MPW))
+ eseg->mss = rte_cpu_to_be_16
+ (rte_pktmbuf_data_len(loc->mbuf));
+ for (;;) {
+ uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ slen += dlen;
+#endif
+ mlx5_tx_dseg_ptr
+ (txq, loc, dseg,
+ rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+ dlen, olx);
+ if (unlikely(--loop == 0))
+ break;
+ loc->mbuf = *pkts++;
+ if (likely(loop > 1))
+ rte_prefetch0(*pkts);
+ ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+ /*
+ * Unroll the completion code to avoid
+			 * returning a variable value - it results in
+			 * unoptimized subsequent checking in the caller.
+ */
+ if (ret == MLX5_TXCMP_CODE_MULTI) {
+ part -= loop;
+ mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_MULTI;
+ }
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ if (ret == MLX5_TXCMP_CODE_TSO) {
+ part -= loop;
+ mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_TSO;
+ }
+ if (ret == MLX5_TXCMP_CODE_SINGLE) {
+ part -= loop;
+ mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_SINGLE;
+ }
+ if (ret != MLX5_TXCMP_CODE_EMPW) {
+ MLX5_ASSERT(false);
+ part -= loop;
+ mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+ return MLX5_TXCMP_CODE_ERROR;
+ }
+ /*
+ * Check whether packet parameters coincide
+ * within assumed eMPW batch:
+ * - check sum settings
+ * - metadata value
+ * - software parser settings
+ * - packets length (legacy MPW only)
+ */
+ if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
+ MLX5_ASSERT(loop);
+ part -= loop;
+ mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ pkts_n -= part;
+ goto next_empw;
+ }
+ /* Packet attributes match, continue the same eMPW. */
+ ++dseg;
+ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+ dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+ }
+ /* eMPW is built successfully, update loop parameters. */
+ MLX5_ASSERT(!loop);
+ MLX5_ASSERT(pkts_n >= part);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += slen;
+#endif
+ loc->elts_free -= part;
+ loc->pkts_sent += part;
+ txq->wqe_ci += (2 + part + 3) / 4;
+ loc->wqe_free -= (2 + part + 3) / 4;
+ pkts_n -= part;
+ if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ loc->mbuf = *pkts++;
+ ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+ if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
+ return ret;
+ /* Continue sending eMPW batches. */
+ }
+ MLX5_ASSERT(false);
+}
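+
+/*
+ * Sizing example for the eMPW batch above, with an assumed part of
+ * 32 packets: the title WQEBB carries the Control and Ethernet
+ * Segments, each packet adds one pointer Data Segment, so the batch
+ * needs (2 + 32 + 3) / 4 = 9 free WQEBBs, which is exactly what the
+ * "loc->wqe_free < ((2 + part + 3) / 4)" check verifies.
+ */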
+
+/**
+ * The routine sends packets with MLX5_OPCODE_EMPW
+ * with inlining, optionally supports VLAN insertion.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ /*
+ * Subroutine is the part of mlx5_tx_burst_single()
+ * and sends single-segment packet with eMPW opcode
+ * with data inlining.
+ */
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ MLX5_ASSERT(pkts_n > loc->pkts_sent);
+ static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
+ pkts += loc->pkts_sent + 1;
+ pkts_n -= loc->pkts_sent;
+ for (;;) {
+ struct mlx5_wqe_dseg *restrict dseg;
+ struct mlx5_wqe *restrict wqem;
+ enum mlx5_txcmp_code ret;
+ unsigned int room, part, nlim;
+ unsigned int slen = 0;
+
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ /*
+		 * Limit the number of packets in one WQE
+		 * to improve CQE generation latency.
+ */
+ nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
+ MLX5_MPW_INLINE_MAX_PACKETS :
+ MLX5_EMPW_MAX_PACKETS);
+		/* Check whether we have the minimal amount of WQEs. */
+ if (unlikely(loc->wqe_free <
+ ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
+ return MLX5_TXCMP_CODE_EXIT;
+ if (likely(pkts_n > 1))
+ rte_prefetch0(*pkts);
+ wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ /*
+ * Build eMPW title WQEBB:
+ * - Control Segment, eMPW opcode, zero DS
+ * - Ethernet Segment, no inline
+ */
+ mlx5_tx_cseg_init(txq, loc, wqem, 0,
+ MLX5_OPCODE_ENHANCED_MPSW, olx);
+ mlx5_tx_eseg_none(txq, loc, wqem,
+ olx & ~MLX5_TXOFF_CONFIG_VLAN);
+ dseg = &wqem->dseg[0];
+ /* Store the packet length for legacy MPW. */
+ if (MLX5_TXOFF_CONFIG(MPW))
+ wqem->eseg.mss = rte_cpu_to_be_16
+ (rte_pktmbuf_data_len(loc->mbuf));
+ room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
+ loc->wqe_free) * MLX5_WQE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE;
+ /* Limit the room for legacy MPW sessions for performance. */
+ if (MLX5_TXOFF_CONFIG(MPW))
+ room = RTE_MIN(room,
+ RTE_MAX(txq->inlen_empw +
+ sizeof(dseg->bcount) +
+ (MLX5_TXOFF_CONFIG(VLAN) ?
+ sizeof(struct rte_vlan_hdr) : 0),
+ MLX5_MPW_INLINE_MAX_PACKETS *
+ MLX5_WQE_DSEG_SIZE));
+ /* Build WQE till we have space, packets and resources. */
+ part = room;
+ for (;;) {
+ uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
+ uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ unsigned int tlen;
+
+ MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
+ MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
+ MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
+ /*
+ * Some Tx offloads may cause an error if
+ * packet is not long enough, check against
+ * assumed minimal length.
+ */
+ if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
+ part -= room;
+ if (unlikely(!part))
+ return MLX5_TXCMP_CODE_ERROR;
+ /*
+ * We have some successfully built
+ * packet Data Segments to send.
+ */
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ return MLX5_TXCMP_CODE_ERROR;
+ }
+ /* Inline or not inline - that's the Question. */
+ if (dlen > txq->inlen_empw ||
+ loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE)
+ goto pointer_empw;
+ if (MLX5_TXOFF_CONFIG(MPW)) {
+ if (dlen > txq->inlen_send)
+ goto pointer_empw;
+ tlen = dlen;
+ if (part == room) {
+ /* Open new inline MPW session. */
+ tlen += sizeof(dseg->bcount);
+ dseg->bcount = RTE_BE32(0);
+ dseg = RTE_PTR_ADD
+ (dseg, sizeof(dseg->bcount));
+ } else {
+ /*
+ * No pointer and inline descriptor
+ * intermix for legacy MPW sessions.
+ */
+ if (wqem->dseg[0].bcount)
+ break;
+ }
+ } else {
+ tlen = sizeof(dseg->bcount) + dlen;
+ }
+ /* Inline entire packet, optional VLAN insertion. */
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+				/*
+				 * The packet length has been checked in
+				 * mlx5_tx_able_to_empw() and the packet is
+				 * guaranteed to fit into the inline length.
+				 */
+ MLX5_ASSERT((dlen +
+ sizeof(struct rte_vlan_hdr)) <=
+ txq->inlen_empw);
+ tlen += sizeof(struct rte_vlan_hdr);
+ if (room < tlen)
+ break;
+ dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
+ dptr, dlen, olx);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ slen += sizeof(struct rte_vlan_hdr);
+#endif
+ } else {
+ if (room < tlen)
+ break;
+ dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
+ dptr, dlen, olx);
+ }
+ if (!MLX5_TXOFF_CONFIG(MPW))
+ tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+ MLX5_ASSERT(room >= tlen);
+ room -= tlen;
+ /*
+ * Packet data are completely inlined,
+ * free the packet immediately.
+ */
+ rte_pktmbuf_free_seg(loc->mbuf);
+ goto next_mbuf;
+pointer_empw:
+ /*
+ * No pointer and inline descriptor
+ * intermix for legacy MPW sessions.
+ */
+ if (MLX5_TXOFF_CONFIG(MPW) &&
+ part != room &&
+ wqem->dseg[0].bcount == RTE_BE32(0))
+ break;
+			/*
+			 * Not inlinable VLAN packets are
+			 * processed outside of this routine.
+			 */
+ MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
+ if (MLX5_TXOFF_CONFIG(VLAN))
+ MLX5_ASSERT(!(loc->mbuf->ol_flags &
+ PKT_TX_VLAN_PKT));
+ mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
+ /* We have to store mbuf in elts.*/
+ txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
+ room -= MLX5_WQE_DSEG_SIZE;
+ /* Ring buffer wraparound is checked at the loop end.*/
+ ++dseg;
+next_mbuf:
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ slen += dlen;
+#endif
+ loc->pkts_sent++;
+ loc->elts_free--;
+ pkts_n--;
+ if (unlikely(!pkts_n || !loc->elts_free)) {
+ /*
+ * We have no resources/packets to
+ * continue build descriptors.
+ */
+ part -= room;
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ return MLX5_TXCMP_CODE_EXIT;
+ }
+ loc->mbuf = *pkts++;
+ if (likely(pkts_n > 1))
+ rte_prefetch0(*pkts);
+ ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+ /*
+ * Unroll the completion code to avoid
+			 * returning a variable value - it results in
+			 * unoptimized subsequent checking in the caller.
+ */
+ if (ret == MLX5_TXCMP_CODE_MULTI) {
+ part -= room;
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_MULTI;
+ }
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ if (ret == MLX5_TXCMP_CODE_TSO) {
+ part -= room;
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_TSO;
+ }
+ if (ret == MLX5_TXCMP_CODE_SINGLE) {
+ part -= room;
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ return MLX5_TXCMP_CODE_SINGLE;
+ }
+ if (ret != MLX5_TXCMP_CODE_EMPW) {
+ MLX5_ASSERT(false);
+ part -= room;
+ mlx5_tx_idone_empw(txq, loc, part,
+ slen, wqem, olx);
+ return MLX5_TXCMP_CODE_ERROR;
+ }
+ /* Check if we have minimal room left. */
+ nlim--;
+ if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
+ break;
+ /*
+ * Check whether packet parameters coincide
+ * within assumed eMPW batch:
+ * - check sum settings
+ * - metadata value
+ * - software parser settings
+ * - packets length (legacy MPW only)
+ */
+ if (!mlx5_tx_match_empw(txq, &wqem->eseg,
+ loc, dlen, olx))
+ break;
+ /* Packet attributes match, continue the same eMPW. */
+ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
+ dseg = (struct mlx5_wqe_dseg *)txq->wqes;
+ }
+ /*
+ * We get here to close an existing eMPW
+ * session and start the new one.
+ */
+ MLX5_ASSERT(pkts_n);
+ part -= room;
+ if (unlikely(!part))
+ return MLX5_TXCMP_CODE_EXIT;
+ mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
+ if (unlikely(!loc->elts_free ||
+ !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ /* Continue the loop with new eMPW session. */
+ }
+ MLX5_ASSERT(false);
+}
+
+/**
+ * The routine sends packets with ordinary MLX5_OPCODE_SEND.
+ * Data inlining and VLAN insertion are supported.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ /*
+ * Subroutine is the part of mlx5_tx_burst_single()
+ * and sends single-segment packet with SEND opcode.
+ */
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ MLX5_ASSERT(pkts_n > loc->pkts_sent);
+ pkts += loc->pkts_sent + 1;
+ pkts_n -= loc->pkts_sent;
+ for (;;) {
+ struct mlx5_wqe *restrict wqe;
+ enum mlx5_txcmp_code ret;
+
+ MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+ if (MLX5_TXOFF_CONFIG(INLINE)) {
+ unsigned int inlen, vlan = 0;
+
+ inlen = rte_pktmbuf_data_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+ vlan = sizeof(struct rte_vlan_hdr);
+ inlen += vlan;
+ static_assert((sizeof(struct rte_vlan_hdr) +
+ sizeof(struct rte_ether_hdr)) ==
+ MLX5_ESEG_MIN_INLINE_SIZE,
+ "invalid min inline data size");
+ }
+			/*
+			 * If inlining is enabled at configuration time
+			 * the limit must be no less than the minimal size.
+			 * Otherwise we would need an extra check for the
+			 * data size to avoid crashes due to length overflow.
+			 */
+ MLX5_ASSERT(txq->inlen_send >=
+ MLX5_ESEG_MIN_INLINE_SIZE);
+ if (inlen <= txq->inlen_send) {
+ unsigned int seg_n, wqe_n;
+
+ rte_prefetch0(rte_pktmbuf_mtod
+ (loc->mbuf, uint8_t *));
+ /* Check against minimal length. */
+ if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
+ return MLX5_TXCMP_CODE_ERROR;
+ if (loc->mbuf->ol_flags &
+ PKT_TX_DYNF_NOINLINE) {
+ /*
+ * The hint flag not to inline packet
+ * data is set. Check whether we can
+ * follow the hint.
+ */
+ if ((!MLX5_TXOFF_CONFIG(EMPW) &&
+ txq->inlen_mode) ||
+ (MLX5_TXOFF_CONFIG(MPW) &&
+ txq->inlen_mode)) {
+ /*
+ * The hardware requires the
+ * minimal inline data header.
+ */
+ goto single_min_inline;
+ }
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ vlan && !txq->vlan_en) {
+ /*
+ * We must insert VLAN tag
+ * by software means.
+ */
+ goto single_part_inline;
+ }
+ goto single_no_inline;
+ }
+ /*
+ * Completely inlined packet data WQE:
+ * - Control Segment, SEND opcode
+ * - Ethernet Segment, no VLAN insertion
+ * - Data inlined, VLAN optionally inserted
+ * - Alignment to MLX5_WSEG_SIZE
+ * Have to estimate amount of WQEBBs
+ */
+ seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
+ MLX5_ESEG_MIN_INLINE_SIZE +
+ MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+ /* Check if there are enough WQEBBs. */
+ wqe_n = (seg_n + 3) / 4;
+ if (wqe_n > loc->wqe_free)
+ return MLX5_TXCMP_CODE_EXIT;
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
+ MLX5_OPCODE_SEND, olx);
+ mlx5_tx_eseg_data(txq, loc, wqe,
+ vlan, inlen, 0, olx);
+ txq->wqe_ci += wqe_n;
+ loc->wqe_free -= wqe_n;
+ /*
+ * Packet data are completely inlined,
+ * free the packet immediately.
+ */
+ rte_pktmbuf_free_seg(loc->mbuf);
+ } else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
+ MLX5_TXOFF_CONFIG(MPW)) &&
+ txq->inlen_mode) {
+				/*
+				 * If minimal inlining is requested the eMPW
+				 * feature should be disabled because data is
+				 * inlined into the Ethernet Segment, which
+				 * cannot contain inlined data for eMPW since
+				 * the segment is shared by all packets.
+				 */
+ struct mlx5_wqe_dseg *restrict dseg;
+ unsigned int ds;
+ uint8_t *dptr;
+
+ /*
+ * The inline-mode settings require
+ * to inline the specified amount of
+ * data bytes to the Ethernet Segment.
+ * We should check the free space in
+ * WQE ring buffer to inline partially.
+ */
+single_min_inline:
+ MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
+ MLX5_ASSERT(inlen > txq->inlen_mode);
+ MLX5_ASSERT(txq->inlen_mode >=
+ MLX5_ESEG_MIN_INLINE_SIZE);
+ /*
+ * Check whether there are enough free WQEBBs:
+ * - Control Segment
+ * - Ethernet Segment
+ * - First Segment of inlined Ethernet data
+ * - ... data continued ...
+ * - Finishing Data Segment of pointer type
+ */
+ ds = (MLX5_WQE_CSEG_SIZE +
+ MLX5_WQE_ESEG_SIZE +
+ MLX5_WQE_DSEG_SIZE +
+ txq->inlen_mode -
+ MLX5_ESEG_MIN_INLINE_SIZE +
+ MLX5_WQE_DSEG_SIZE +
+ MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
+ if (loc->wqe_free < ((ds + 3) / 4))
+ return MLX5_TXCMP_CODE_EXIT;
+ /*
+ * Build the ordinary SEND WQE:
+ * - Control Segment
+ * - Ethernet Segment, inline inlen_mode bytes
+ * - Data Segment of pointer type
+ */
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, ds,
+ MLX5_OPCODE_SEND, olx);
+ dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
+ txq->inlen_mode,
+ 0, olx);
+ dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+ txq->inlen_mode - vlan;
+ inlen -= txq->inlen_mode;
+ mlx5_tx_dseg_ptr(txq, loc, dseg,
+ dptr, inlen, olx);
+				/*
+				 * WQE is built, update the loop parameters
+				 * and go to the next packet.
+				 */
+ txq->wqe_ci += (ds + 3) / 4;
+ loc->wqe_free -= (ds + 3) / 4;
+ /* We have to store mbuf in elts.*/
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+ txq->elts[txq->elts_head++ & txq->elts_m] =
+ loc->mbuf;
+ --loc->elts_free;
+ } else {
+ uint8_t *dptr;
+ unsigned int dlen;
+
+ /*
+ * Partially inlined packet data WQE, we have
+ * some space in title WQEBB, we can fill it
+ * with some packet data. It takes one WQEBB,
+ * it is available, no extra space check:
+ * - Control Segment, SEND opcode
+ * - Ethernet Segment, no VLAN insertion
+ * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
+ * - Data Segment, pointer type
+ *
+				 * We also get here if VLAN insertion is not
+				 * supported by HW and inlining is enabled.
+ */
+single_part_inline:
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, 4,
+ MLX5_OPCODE_SEND, olx);
+ mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
+ dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
+ MLX5_ESEG_MIN_INLINE_SIZE - vlan;
+ /*
+ * The length check is performed above, by
+ * comparing with txq->inlen_send. We should
+ * not get overflow here.
+ */
+ MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+ dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
+ mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
+ dptr, dlen, olx);
+ ++txq->wqe_ci;
+ --loc->wqe_free;
+ /* We have to store mbuf in elts.*/
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+ txq->elts[txq->elts_head++ & txq->elts_m] =
+ loc->mbuf;
+ --loc->elts_free;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += vlan +
+ rte_pktmbuf_data_len(loc->mbuf);
+#endif
+ } else {
+ /*
+			 * No inline at all - CPU cycle saving is
+			 * prioritized at configuration time, we should
+			 * not copy any packet data to the WQE.
+ *
+ * SEND WQE, one WQEBB:
+ * - Control Segment, SEND opcode
+ * - Ethernet Segment, optional VLAN, no inline
+ * - Data Segment, pointer type
+ */
+single_no_inline:
+ wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+ loc->wqe_last = wqe;
+ mlx5_tx_cseg_init(txq, loc, wqe, 3,
+ MLX5_OPCODE_SEND, olx);
+ mlx5_tx_eseg_none(txq, loc, wqe, olx);
+ mlx5_tx_dseg_ptr
+ (txq, loc, &wqe->dseg[0],
+ rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
+ rte_pktmbuf_data_len(loc->mbuf), olx);
+ ++txq->wqe_ci;
+ --loc->wqe_free;
+			/*
+			 * We should not store the mbuf pointer in elts
+			 * if no inlining is configured; this is done
+			 * by the calling routine in a batch copy.
+			 */
+ MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+ --loc->elts_free;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Update sent data bytes counter. */
+ txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ txq->stats.obytes +=
+ sizeof(struct rte_vlan_hdr);
+#endif
+ }
+ ++loc->pkts_sent;
+ --pkts_n;
+ if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
+ return MLX5_TXCMP_CODE_EXIT;
+ loc->mbuf = *pkts++;
+ if (pkts_n > 1)
+ rte_prefetch0(*pkts);
+ ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
+ if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
+ return ret;
+ }
+ MLX5_ASSERT(false);
+}
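+
+/*
+ * Worked example for the fully inlined SEND estimate above, assuming
+ * MLX5_WSEG_SIZE = 16, MLX5_ESEG_MIN_INLINE_SIZE = 18 and a
+ * hypothetical inlen = 100 bytes:
+ *
+ *	seg_n = (100 + 3 * 16 - 18 + 15) / 16 = 9 segments
+ *	wqe_n = (9 + 3) / 4 = 3 WQEBBs
+ *
+ * The 3 * MLX5_WSEG_SIZE term reserves room for the Control Segment,
+ * the Ethernet Segment and the alignment of the inlined data.
+ */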
+
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ enum mlx5_txcmp_code ret;
+
+ ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
+ if (ret == MLX5_TXCMP_CODE_SINGLE)
+ goto ordinary_send;
+ MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
+ for (;;) {
+ /* Optimize for inline/no inline eMPW send. */
+ ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
+ mlx5_tx_burst_empw_inline
+ (txq, pkts, pkts_n, loc, olx) :
+ mlx5_tx_burst_empw_simple
+ (txq, pkts, pkts_n, loc, olx);
+ if (ret != MLX5_TXCMP_CODE_SINGLE)
+ return ret;
+ /* The resources to send one packet should remain. */
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ordinary_send:
+ ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
+ MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
+ if (ret != MLX5_TXCMP_CODE_EMPW)
+ return ret;
+ /* The resources to send one packet should remain. */
+ MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+ }
+}
+
+/**
+ * DPDK Tx callback template. This configurable template is used
+ * to generate routines optimized for the specified offload setup.
+ * One of these generated functions is chosen at SQ configuration
+ * time.
+ *
+ * @param txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ * @param olx
+ * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx
+ * values. Should be static to take compile time static configuration
+ * advantages.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ uint16_t pkts_n,
+ unsigned int olx)
+{
+ struct mlx5_txq_local loc;
+ enum mlx5_txcmp_code ret;
+ unsigned int part;
+
+ MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+ MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+ if (unlikely(!pkts_n))
+ return 0;
+ loc.pkts_sent = 0;
+ loc.pkts_copy = 0;
+ loc.wqe_last = NULL;
+
+send_loop:
+ loc.pkts_loop = loc.pkts_sent;
+ /*
+ * Check if there are some CQEs, if any:
+	 * - process encountered errors
+ * - process the completed WQEs
+ * - free related mbufs
+ * - doorbell the NIC about processed CQEs
+ */
+ rte_prefetch0(*(pkts + loc.pkts_sent));
+ mlx5_tx_handle_completion(txq, olx);
+ /*
+ * Calculate the number of available resources - elts and WQEs.
+ * There are two possible different scenarios:
+	 * - no data inlining into WQEs, one WQEBB may contain up to
+	 *   four packets, in this case elts become the scarce resource
+ * - data inlining into WQEs, one packet may require multiple
+ * WQEBBs, the WQEs become the limiting factor.
+ */
+ MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+ loc.elts_free = txq->elts_s -
+ (uint16_t)(txq->elts_head - txq->elts_tail);
+ MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+ loc.wqe_free = txq->wqe_s -
+ (uint16_t)(txq->wqe_ci - txq->wqe_pi);
+ if (unlikely(!loc.elts_free || !loc.wqe_free))
+ goto burst_exit;
+ for (;;) {
+ /*
+ * Fetch the packet from array. Usually this is
+ * the first packet in series of multi/single
+ * segment packets.
+ */
+ loc.mbuf = *(pkts + loc.pkts_sent);
+ /* Dedicated branch for multi-segment packets. */
+ if (MLX5_TXOFF_CONFIG(MULTI) &&
+ unlikely(NB_SEGS(loc.mbuf) > 1)) {
+ /*
+ * Multi-segment packet encountered.
+ * Hardware is able to process it only
+ * with SEND/TSO opcodes, one packet
+ * per WQE, do it in dedicated routine.
+ */
+enter_send_multi:
+ MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
+ part = loc.pkts_sent - loc.pkts_copy;
+ if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+ /*
+ * There are some single-segment mbufs not
+ * stored in elts. The mbufs must be in the
+ * same order as WQEs, so we must copy the
+				 * mbufs to elts here, before the mbufs of the
+				 * coming multi-segment packet are appended.
+ */
+ mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
+ part, olx);
+ loc.pkts_copy = loc.pkts_sent;
+ }
+ MLX5_ASSERT(pkts_n > loc.pkts_sent);
+ ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
+ if (!MLX5_TXOFF_CONFIG(INLINE))
+ loc.pkts_copy = loc.pkts_sent;
+ /*
+ * These returned code checks are supposed
+ * to be optimized out due to routine inlining.
+ */
+ if (ret == MLX5_TXCMP_CODE_EXIT) {
+ /*
+ * The routine returns this code when
+				 * all packets are sent or there are not
+				 * enough resources to complete the request.
+ */
+ break;
+ }
+ if (ret == MLX5_TXCMP_CODE_ERROR) {
+ /*
+ * The routine returns this code when
+ * some error in the incoming packets
+ * format occurred.
+ */
+ txq->stats.oerrors++;
+ break;
+ }
+ if (ret == MLX5_TXCMP_CODE_SINGLE) {
+ /*
+ * The single-segment packet was encountered
+ * in the array, try to send it with the
+ * best optimized way, possible engaging eMPW.
+ */
+ goto enter_send_single;
+ }
+ if (MLX5_TXOFF_CONFIG(TSO) &&
+ ret == MLX5_TXCMP_CODE_TSO) {
+ /*
+ * The single-segment TSO packet was
+ * encountered in the array.
+ */
+ goto enter_send_tso;
+ }
+ /* We must not get here. Something is going wrong. */
+ MLX5_ASSERT(false);
+ txq->stats.oerrors++;
+ break;
+ }
+ /* Dedicated branch for single-segment TSO packets. */
+ if (MLX5_TXOFF_CONFIG(TSO) &&
+ unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+ /*
+ * TSO might require special way for inlining
+ * (dedicated parameters) and is sent with
+ * MLX5_OPCODE_TSO opcode only, provide this
+ * in dedicated branch.
+ */
+enter_send_tso:
+ MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
+ MLX5_ASSERT(pkts_n > loc.pkts_sent);
+ ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
+ /*
+ * These returned code checks are supposed
+ * to be optimized out due to routine inlining.
+ */
+ if (ret == MLX5_TXCMP_CODE_EXIT)
+ break;
+ if (ret == MLX5_TXCMP_CODE_ERROR) {
+ txq->stats.oerrors++;
+ break;
+ }
+ if (ret == MLX5_TXCMP_CODE_SINGLE)
+ goto enter_send_single;
+ if (MLX5_TXOFF_CONFIG(MULTI) &&
+ ret == MLX5_TXCMP_CODE_MULTI) {
+ /*
+ * The multi-segment packet was
+ * encountered in the array.
+ */
+ goto enter_send_multi;
+ }
+ /* We must not get here. Something is going wrong. */
+ MLX5_ASSERT(false);
+ txq->stats.oerrors++;
+ break;
+ }
+ /*
+ * The dedicated branch for the single-segment packets
+ * without TSO. Often these ones can be sent using
+ * MLX5_OPCODE_EMPW with multiple packets in one WQE.
+ * The routine builds the WQEs till it encounters
+ * the TSO or multi-segment packet (in case if these
+ * offloads are requested at SQ configuration time).
+ */
+enter_send_single:
+ MLX5_ASSERT(pkts_n > loc.pkts_sent);
+ ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
+ /*
+ * These returned code checks are supposed
+ * to be optimized out due to routine inlining.
+ */
+ if (ret == MLX5_TXCMP_CODE_EXIT)
+ break;
+ if (ret == MLX5_TXCMP_CODE_ERROR) {
+ txq->stats.oerrors++;
+ break;
+ }
+ if (MLX5_TXOFF_CONFIG(MULTI) &&
+ ret == MLX5_TXCMP_CODE_MULTI) {
+ /*
+ * The multi-segment packet was
+ * encountered in the array.
+ */
+ goto enter_send_multi;
+ }
+ if (MLX5_TXOFF_CONFIG(TSO) &&
+ ret == MLX5_TXCMP_CODE_TSO) {
+ /*
+ * The single-segment TSO packet was
+ * encountered in the array.
+ */
+ goto enter_send_tso;
+ }
+ /* We must not get here. Something is going wrong. */
+ MLX5_ASSERT(false);
+ txq->stats.oerrors++;
+ break;
+ }
+ /*
+ * Main Tx loop is completed, do the rest:
+ * - set completion request if thresholds are reached
+ * - doorbell the hardware
+ * - copy the rest of mbufs to elts (if any)
+ */
+ MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
+ loc.pkts_sent >= loc.pkts_copy);
+ /* Take a shortcut if nothing is sent. */
+ if (unlikely(loc.pkts_sent == loc.pkts_loop))
+ goto burst_exit;
+ /* Request CQE generation if limits are reached. */
+ mlx5_tx_request_completion(txq, &loc, olx);
+ /*
+ * Ring QP doorbell immediately after WQE building completion
+ * to improve latencies. The pure software related data treatment
+ * can be completed after doorbell. Tx CQEs for this SQ are
+ * processed in this thread only by the polling.
+ *
+ * The rdma core library can map doorbell register in two ways,
+ * depending on the environment variable "MLX5_SHUT_UP_BF":
+ *
+	 * - as regular cached memory, the variable is either missing or
+	 *   set to zero. This type of mapping may cause significant
+	 *   doorbell register write latency and requires an explicit
+	 *   memory write barrier to mitigate this issue and prevent
+	 *   write combining.
+	 *
+	 * - as non-cached memory, the variable is present and set to
+	 *   a non-zero value. This type of mapping may cause a performance
+	 *   impact under heavy load conditions but the explicit write
+	 *   memory barrier is not required and it may improve core
+	 *   performance.
+	 *
+	 * - the legacy behaviour (prior to the 19.08 release) was to use
+	 *   some heuristics to decide whether a write memory barrier
+	 *   should be performed. This behavior is supported by specifying
+	 *   tx_db_nc=2; the write barrier is skipped if the application
+	 *   provides the full recommended burst of packets, assuming the
+	 *   next packets are coming and the write barrier will be issued
+	 *   on the next burst (after descriptor writing, at least).
+ */
+ mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc &&
+ (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
+ /* Not all of the mbufs may be stored into elts yet. */
+ part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
+ if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
+ /*
+		 * There are some single-segment mbufs not stored in elts.
+		 * This can only happen if the last packet was single-segment.
+		 * The copying is gathered into one place because it is
+		 * a good opportunity to optimize it with SIMD.
+		 * Unfortunately, if inlining is enabled, gaps in the
+		 * pointer array may appear due to early freeing of the
+		 * inlined mbufs.
+ */
+ mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
+ loc.pkts_copy = loc.pkts_sent;
+ }
+ MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+ MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+ if (pkts_n > loc.pkts_sent) {
+ /*
+		 * If the burst size is large there might be not enough CQEs
+		 * fetched from the completion queue and not enough resources
+		 * freed to send all the packets.
+ */
+ goto send_loop;
+ }
+burst_exit:
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += loc.pkts_sent;
+#endif
+ return loc.pkts_sent;
+}
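+
+/*
+ * Conceptual sketch only - the real expansion is produced by the
+ * MLX5_TXOFF_DECL/MLX5_TXOFF_INFO macros defined earlier in this file
+ * and the name below is hypothetical. Each generated routine is a thin
+ * wrapper passing a compile-time constant offload mask, which lets the
+ * compiler prune the unused branches of mlx5_tx_burst_tmpl():
+ *
+ *	static uint16_t
+ *	mlx5_tx_burst_md_empw_sketch(void *dpdk_txq, struct rte_mbuf **pkts,
+ *				     uint16_t pkts_n)
+ *	{
+ *		return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)dpdk_txq,
+ *					  pkts, pkts_n,
+ *					  MLX5_TXOFF_CONFIG_METADATA |
+ *					  MLX5_TXOFF_CONFIG_EMPW);
+ *	}
+ */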
+
+/* Generate routines with Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full_empw,
+ MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_empw,
+ MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(md_empw,
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mt_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtsc_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtv_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sc_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sci_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(scv_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(sciv_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(i_empw,
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(v_empw,
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(iv_empw,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+/* Generate routines without Enhanced Multi-Packet Write support. */
+MLX5_TXOFF_DECL(full,
+ MLX5_TXOFF_CONFIG_FULL)
+
+MLX5_TXOFF_DECL(none,
+ MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_DECL(md,
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mt,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtsc,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mti,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtv,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(mtiv,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sc,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sci,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(scv,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(sciv,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(i,
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(v,
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_DECL(iv,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+/*
+ * Generate routines with Legacy Multi-Packet Write support.
+ * This mode is supported by ConnectX-4 Lx only and imposes
+ * offload limitations; the following are not supported:
+ * - ACL/Flows (metadata become meaningless)
+ * - WQE Inline headers
+ * - SRIOV (E-Switch offloads)
+ * - VLAN insertion
+ * - tunnel encapsulation/decapsulation
+ * - TSO
+ */
+MLX5_TXOFF_DECL(none_mpw,
+ MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(mci_mpw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(mc_mpw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(i_mpw,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+
+/*
+ * Array of declared and compiled Tx burst functions and the corresponding
+ * supported offloads sets. The array is used to select the Tx burst
+ * function for the offloads set specified at Tx queue configuration time.
+ */
+const struct {
+ eth_tx_burst_t func;
+ unsigned int olx;
+} txoff_func[] = {
+MLX5_TXOFF_INFO(full_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_empw,
+ MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(md_empw,
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mt_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtsc_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtv_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_empw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sc_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sci_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(scv_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(sciv_empw,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(i_empw,
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(v_empw,
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(iv_empw,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(full,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(none,
+ MLX5_TXOFF_CONFIG_NONE)
+
+MLX5_TXOFF_INFO(md,
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mt,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtsc,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mti,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtv,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(mtiv,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sc,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sci,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(scv,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(sciv,
+ MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(i,
+ MLX5_TXOFF_CONFIG_INLINE |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(v,
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(iv,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)
+
+MLX5_TXOFF_INFO(none_mpw,
+ MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_INFO(mci_mpw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_INFO(mc_mpw,
+ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+ MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_INFO(i_mpw,
+ MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW)
+};
+
+/**
+ * Configure the Tx function to use. The routine checks the configured
+ * Tx offloads for the device and selects the appropriate Tx burst
+ * routine. Multiple Tx burst routines are compiled from the same
+ * template, each optimized for a dedicated set of Tx offloads.
+ *
+ * @param dev
+ *   Pointer to the Ethernet device structure.
+ *
+ * @return
+ * Pointer to selected Tx burst function.
+ */
+eth_tx_burst_t
+mlx5_select_tx_function(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
+ unsigned int diff = 0, olx = 0, i, m;
+
+ static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <=
+ MLX5_DSEG_MAX, "invalid WQE max size");
+ static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE,
+ "invalid WQE Control Segment size");
+ static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE,
+ "invalid WQE Ethernet Segment size");
+ static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE,
+ "invalid WQE Data Segment size");
+ static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
+ "invalid WQE size");
+ MLX5_ASSERT(priv);
+ if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
+ /* We should support Multi-Segment Packets. */
+ olx |= MLX5_TXOFF_CONFIG_MULTI;
+ }
+ if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+ DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO |
+ DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO)) {
+		/* We should support TCP Segmentation Offload (TSO). */
+ olx |= MLX5_TXOFF_CONFIG_TSO;
+ }
+ if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO |
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+ /* We should support Software Parser for Tunnels. */
+ olx |= MLX5_TXOFF_CONFIG_SWP;
+ }
+ if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM |
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) {
+ /* We should support IP/TCP/UDP Checksums. */
+ olx |= MLX5_TXOFF_CONFIG_CSUM;
+ }
+ if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) {
+ /* We should support VLAN insertion. */
+ olx |= MLX5_TXOFF_CONFIG_VLAN;
+ }
+ if (priv->txqs_n && (*priv->txqs)[0]) {
+ struct mlx5_txq_data *txd = (*priv->txqs)[0];
+
+ if (txd->inlen_send) {
+ /*
+			 * Check the data inline requirements. Data inline
+			 * is enabled on a per-device basis, so checking
+			 * the first Tx queue only is sufficient.
+			 *
+			 * If the device does not support VLAN insertion
+			 * in the WQE and some queues are requested to
+			 * perform VLAN insertion offload, then inline
+			 * must be enabled.
+ */
+ olx |= MLX5_TXOFF_CONFIG_INLINE;
+ }
+ }
+ if (config->mps == MLX5_MPW_ENHANCED &&
+ config->txq_inline_min <= 0) {
+ /*
+ * The NIC supports Enhanced Multi-Packet Write
+ * and does not require minimal inline data.
+ */
+ olx |= MLX5_TXOFF_CONFIG_EMPW;
+ }
+ if (rte_flow_dynf_metadata_avail()) {
+ /* We should support Flow metadata. */
+ olx |= MLX5_TXOFF_CONFIG_METADATA;
+ }
+ if (config->mps == MLX5_MPW) {
+ /*
+ * The NIC supports Legacy Multi-Packet Write.
+ * The MLX5_TXOFF_CONFIG_MPW controls the
+ * descriptor building method in combination
+ * with MLX5_TXOFF_CONFIG_EMPW.
+ */
+ if (!(olx & (MLX5_TXOFF_CONFIG_TSO |
+ MLX5_TXOFF_CONFIG_SWP |
+ MLX5_TXOFF_CONFIG_VLAN |
+ MLX5_TXOFF_CONFIG_METADATA)))
+ olx |= MLX5_TXOFF_CONFIG_EMPW |
+ MLX5_TXOFF_CONFIG_MPW;
+ }
+ /*
+ * Scan the routines table to find the minimal
+ * satisfying routine with requested offloads.
+ */
+ m = RTE_DIM(txoff_func);
+ for (i = 0; i < RTE_DIM(txoff_func); i++) {
+ unsigned int tmp;
+
+ tmp = txoff_func[i].olx;
+ if (tmp == olx) {
+			/* Meets the requested offloads exactly. */
+ m = i;
+ break;
+ }
+ if ((tmp & olx) != olx) {
+ /* Does not meet requested offloads at all. */
+ continue;
+ }
+ if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
+ /* Do not enable eMPW if not configured. */
+ continue;
+ if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
+ /* Do not enable inlining if not configured. */
+ continue;
+ /*
+		 * Some routine meets the requirements.
+		 * Check whether it has the minimal amount
+		 * of not-requested offloads.
+ */
+ tmp = __builtin_popcountl(tmp & ~olx);
+ if (m >= RTE_DIM(txoff_func) || tmp < diff) {
+ /* First or better match, save and continue. */
+ m = i;
+ diff = tmp;
+ continue;
+ }
+ if (tmp == diff) {
+ tmp = txoff_func[i].olx ^ txoff_func[m].olx;
+ if (__builtin_ffsl(txoff_func[i].olx & ~tmp) <
+ __builtin_ffsl(txoff_func[m].olx & ~tmp)) {
+				/* Prefer the lighter not-requested offload. */
+ m = i;
+ }
+ }
+ }
+ if (m >= RTE_DIM(txoff_func)) {
+ DRV_LOG(DEBUG, "port %u has no selected Tx function"
+ " for requested offloads %04X",
+ dev->data->port_id, olx);
+ return NULL;
+ }
+ DRV_LOG(DEBUG, "port %u has selected Tx function"
+ " supporting offloads %04X/%04X",
+ dev->data->port_id, olx, txoff_func[m].olx);
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI)
+ DRV_LOG(DEBUG, "\tMULTI (multi segment)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO)
+ DRV_LOG(DEBUG, "\tTSO (TCP send offload)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP)
+ DRV_LOG(DEBUG, "\tSWP (software parser)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM)
+ DRV_LOG(DEBUG, "\tCSUM (checksum offload)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE)
+ DRV_LOG(DEBUG, "\tINLIN (inline data)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN)
+ DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
+ DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
+ if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
+ DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
+ else
+ DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
+ }
+ return txoff_func[m].func;
+}
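+
+/*
+ * Worked example (illustrative only, not part of the original driver code):
+ * assume an application enables only the checksum and VLAN insertion Tx
+ * offloads, the flow metadata dynamic field is registered, the NIC supports
+ * Enhanced MPW with no minimal inline requirement and no data inlining is
+ * configured. Then:
+ *
+ *   olx = MLX5_TXOFF_CONFIG_CSUM | MLX5_TXOFF_CONFIG_VLAN |
+ *         MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW;
+ *
+ * No txoff_func[] entry matches exactly. Entries containing
+ * MLX5_TXOFF_CONFIG_INLINE (e.g. full_empw, sciv_empw) are rejected because
+ * inlining was not requested, and entries missing any requested flag fail
+ * the superset check. Among the remaining candidates scv_empw carries a
+ * single extra offload (SWP), so it is selected as the minimal satisfying
+ * routine.
+ */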
+
+/**
+ * DPDK callback to get the TX queue information
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier.
+ *
+ * @param qinfo
+ * Pointer to the TX queue information structure.
+ *
+ * @return
+ * None.
+ */
+
+void
+mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+ struct rte_eth_txq_info *qinfo)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+
+ if (!txq)
+ return;
+ qinfo->nb_desc = txq->elts_s;
+ qinfo->conf.tx_thresh.pthresh = 0;
+ qinfo->conf.tx_thresh.hthresh = 0;
+ qinfo->conf.tx_thresh.wthresh = 0;
+ qinfo->conf.tx_rs_thresh = 0;
+ qinfo->conf.tx_free_thresh = 0;
+ qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
+ qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+}
+
+/**
+ * DPDK callback to get the TX packet burst mode information
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier.
+ *
+ * @param mode
+ *   Pointer to the burst mode information.
+ *
+ * @return
+ *   0 on success, -EINVAL on failure.
+ */
+
+int
+mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
+ uint16_t tx_queue_id __rte_unused,
+ struct rte_eth_burst_mode *mode)
+{
+ eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
+ unsigned int i, olx;
+
+ for (i = 0; i < RTE_DIM(txoff_func); i++) {
+ if (pkt_burst == txoff_func[i].func) {
+ olx = txoff_func[i].olx;
+ snprintf(mode->info, sizeof(mode->info),
+ "%s%s%s%s%s%s%s%s",
+ (olx & MLX5_TXOFF_CONFIG_EMPW) ?
+ ((olx & MLX5_TXOFF_CONFIG_MPW) ?
+ "Legacy MPW" : "Enhanced MPW") : "No MPW",
+ (olx & MLX5_TXOFF_CONFIG_MULTI) ?
+ " + MULTI" : "",
+ (olx & MLX5_TXOFF_CONFIG_TSO) ?
+ " + TSO" : "",
+ (olx & MLX5_TXOFF_CONFIG_SWP) ?
+ " + SWP" : "",
+ (olx & MLX5_TXOFF_CONFIG_CSUM) ?
+ " + CSUM" : "",
+ (olx & MLX5_TXOFF_CONFIG_INLINE) ?
+ " + INLINE" : "",
+ (olx & MLX5_TXOFF_CONFIG_VLAN) ?
+ " + VLAN" : "",
+ (olx & MLX5_TXOFF_CONFIG_METADATA) ?
+ " + METADATA" : "");
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
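+
+/*
+ * Illustrative usage sketch (not part of the original driver; the helper
+ * name below is hypothetical): an application normally reaches this
+ * callback through the generic ethdev API, as in the disabled snippet.
+ */
+#if 0
+#include <stdio.h>
+#include <rte_ethdev.h>
+
+static void
+example_print_tx_burst_mode(uint16_t port_id, uint16_t queue_id)
+{
+	struct rte_eth_burst_mode mode;
+
+	/* Dispatches to mlx5_tx_burst_mode_get() for mlx5 ports. */
+	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
+		printf("port %u txq %u burst mode: %s\n",
+		       port_id, queue_id, mode.info);
+}
+#endif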
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h
new file mode 100644
index 000000000..48f2b7941
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h
@@ -0,0 +1,683 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_H_
+#define RTE_PMD_MLX5_RXTX_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_common.h>
+#include <rte_hexdump.h>
+#include <rte_atomic.h>
+#include <rte_spinlock.h>
+#include <rte_io.h>
+#include <rte_bus_pci.h>
+#include <rte_malloc.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_prm.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mr.h>
+
+#include "mlx5_defs.h"
+#include "mlx5_utils.h"
+#include "mlx5.h"
+#include "mlx5_autoconf.h"
+
+/* Support tunnel matching. */
+#define MLX5_FLOW_TUNNEL 10
+
+/* Mbuf dynamic flag offset for inline. */
+extern uint64_t rte_net_mlx5_dynf_inline_mask;
+
+struct mlx5_rxq_stats {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint64_t ipackets; /**< Total of successfully received packets. */
+ uint64_t ibytes; /**< Total of successfully received bytes. */
+#endif
+ uint64_t idropped; /**< Total of packets dropped when RX ring full. */
+ uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */
+};
+
+struct mlx5_txq_stats {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint64_t opackets; /**< Total of successfully sent packets. */
+ uint64_t obytes; /**< Total of successfully sent bytes. */
+#endif
+ uint64_t oerrors; /**< Total number of failed transmitted packets. */
+};
+
+struct mlx5_priv;
+
+/* Compressed CQE context. */
+struct rxq_zip {
+ uint16_t ai; /* Array index. */
+ uint16_t ca; /* Current array index. */
+ uint16_t na; /* Next array index. */
+ uint16_t cq_ci; /* The next CQE. */
+ uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
+/* Multi-Packet RQ buffer header. */
+struct mlx5_mprq_buf {
+ struct rte_mempool *mp;
+ rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
+ uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */
+ struct rte_mbuf_ext_shared_info shinfos[];
+ /*
+ * Shared information per stride.
+ * More memory will be allocated for the first stride head-room and for
+ * the strides data.
+ */
+} __rte_cache_aligned;
+
+/* Get pointer to the first stride. */
+#define mlx5_mprq_buf_addr(ptr, strd_n) (RTE_PTR_ADD((ptr), \
+ sizeof(struct mlx5_mprq_buf) + \
+ (strd_n) * \
+ sizeof(struct rte_mbuf_ext_shared_info) + \
+ RTE_PKTMBUF_HEADROOM))
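+
+/*
+ * Illustrative layout sketch (not part of the original code): a Multi-Packet
+ * RQ buffer holding strd_n strides is laid out roughly as
+ *
+ *   [struct mlx5_mprq_buf][shinfos[0..strd_n-1]][headroom][stride 0]...[stride strd_n-1]
+ *
+ * so mlx5_mprq_buf_addr(buf, strd_n) skips the buffer header, the per-stride
+ * shared-info array and the first-packet headroom and returns the address of
+ * stride 0.
+ */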
+
+#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6
+#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9
+
+enum mlx5_rxq_err_state {
+ MLX5_RXQ_ERR_STATE_NO_ERROR = 0,
+ MLX5_RXQ_ERR_STATE_NEED_RESET,
+ MLX5_RXQ_ERR_STATE_NEED_READY,
+};
+
+/* RX queue descriptor. */
+struct mlx5_rxq_data {
+ unsigned int csum:1; /* Enable checksum offloading. */
+ unsigned int hw_timestamp:1; /* Enable HW timestamp. */
+ unsigned int vlan_strip:1; /* Enable VLAN stripping. */
+ unsigned int crc_present:1; /* CRC must be subtracted. */
+ unsigned int sges_n:3; /* Log 2 of SGEs (max buffers per packet). */
+ unsigned int cqe_n:4; /* Log 2 of CQ elements. */
+ unsigned int elts_n:4; /* Log 2 of Mbufs. */
+ unsigned int rss_hash:1; /* RSS hash result is enabled. */
+ unsigned int mark:1; /* Marked flow available on the queue. */
+	unsigned int strd_num_n:5; /* Log 2 of the number of strides. */
+	unsigned int strd_sz_n:4; /* Log 2 of stride size. */
+	unsigned int strd_shift_en:1; /* Enable 2-byte shift on a stride. */
+ unsigned int err_state:2; /* enum mlx5_rxq_err_state. */
+ unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */
+ unsigned int lro:1; /* Enable LRO. */
+ unsigned int dynf_meta:1; /* Dynamic metadata is configured. */
+ volatile uint32_t *rq_db;
+ volatile uint32_t *cq_db;
+ uint16_t port_id;
+ uint32_t rq_ci;
+ uint16_t consumed_strd; /* Number of consumed strides in WQE. */
+ uint32_t rq_pi;
+ uint32_t cq_ci;
+ uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
+ union {
+ struct rxq_zip zip; /* Compressed context. */
+ uint16_t decompressed;
+ /* Number of ready mbufs decompressed from the CQ. */
+ };
+ struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
+ uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
+ volatile void *wqes;
+ volatile struct mlx5_cqe(*cqes)[];
+ RTE_STD_C11
+ union {
+ struct rte_mbuf *(*elts)[];
+ struct mlx5_mprq_buf *(*mprq_bufs)[];
+ };
+ struct rte_mempool *mp;
+ struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
+ struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
+ uint16_t idx; /* Queue index. */
+ struct mlx5_rxq_stats stats;
+ rte_xmm_t mbuf_initializer; /* Default rearm/flags for vectorized Rx. */
+ struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
+ void *cq_uar; /* CQ user access region. */
+ uint32_t cqn; /* CQ number. */
+ uint8_t cq_arm_sn; /* CQ arm seq number. */
+#ifndef RTE_ARCH_64
+ rte_spinlock_t *uar_lock_cq;
+ /* CQ (UAR) access lock required for 32bit implementations */
+#endif
+ uint32_t tunnel; /* Tunnel information. */
+ uint64_t flow_meta_mask;
+ int32_t flow_meta_offset;
+} __rte_cache_aligned;
+
+enum mlx5_rxq_obj_type {
+ MLX5_RXQ_OBJ_TYPE_IBV, /* mlx5_rxq_obj with ibv_wq. */
+ MLX5_RXQ_OBJ_TYPE_DEVX_RQ, /* mlx5_rxq_obj with mlx5_devx_rq. */
+ MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN,
+ /* mlx5_rxq_obj with mlx5_devx_rq and hairpin support. */
+};
+
+enum mlx5_rxq_type {
+ MLX5_RXQ_TYPE_STANDARD, /* Standard Rx queue. */
+ MLX5_RXQ_TYPE_HAIRPIN, /* Hairpin Rx queue. */
+ MLX5_RXQ_TYPE_UNDEFINED,
+};
+
+/* Verbs/DevX Rx queue elements. */
+struct mlx5_rxq_obj {
+ LIST_ENTRY(mlx5_rxq_obj) next; /* Pointer to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ enum mlx5_rxq_obj_type type;
+ RTE_STD_C11
+ union {
+ struct ibv_wq *wq; /* Work Queue. */
+ struct mlx5_devx_obj *rq; /* DevX object for Rx Queue. */
+ };
+ struct ibv_comp_channel *channel;
+};
+
+/* RX queue control descriptor. */
+struct mlx5_rxq_ctrl {
+ struct mlx5_rxq_data rxq; /* Data path structure. */
+ LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ struct mlx5_rxq_obj *obj; /* Verbs/DevX elements. */
+ struct mlx5_priv *priv; /* Back pointer to private data. */
+ enum mlx5_rxq_type type; /* Rxq type. */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ unsigned int irq:1; /* Whether IRQ is enabled. */
+ unsigned int dbr_umem_id_valid:1; /* dbr_umem_id holds a valid value. */
+ uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */
+ uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */
+ uint32_t wqn; /* WQ number. */
+ uint16_t dump_file_n; /* Number of dump files. */
+ uint32_t dbr_umem_id; /* Storing door-bell information, */
+ uint64_t dbr_offset; /* needed when freeing door-bell. */
+ struct mlx5dv_devx_umem *wq_umem; /* WQ buffer registration info. */
+ struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
+};
+
+enum mlx5_ind_tbl_type {
+ MLX5_IND_TBL_TYPE_IBV,
+ MLX5_IND_TBL_TYPE_DEVX,
+};
+
+/* Indirection table. */
+struct mlx5_ind_table_obj {
+ LIST_ENTRY(mlx5_ind_table_obj) next; /* Pointer to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ enum mlx5_ind_tbl_type type;
+ RTE_STD_C11
+ union {
+ struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
+ struct mlx5_devx_obj *rqt; /* DevX RQT object. */
+ };
+ uint32_t queues_n; /**< Number of queues in the list. */
+ uint16_t queues[]; /**< Queue list. */
+};
+
+/* Hash Rx queue. */
+struct mlx5_hrxq {
+ ILIST_ENTRY(uint32_t)next; /* Index to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ struct mlx5_ind_table_obj *ind_table; /* Indirection table. */
+ RTE_STD_C11
+ union {
+ struct ibv_qp *qp; /* Verbs queue pair. */
+ struct mlx5_devx_obj *tir; /* DevX TIR object. */
+ };
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ void *action; /* DV QP action pointer. */
+#endif
+ uint64_t hash_fields; /* Verbs Hash fields. */
+ uint32_t rss_key_len; /* Hash key length in bytes. */
+ uint8_t rss_key[]; /* Hash key. */
+};
+
+/* TX queue send local data. */
+__extension__
+struct mlx5_txq_local {
+ struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
+ struct rte_mbuf *mbuf; /* first mbuf to process. */
+ uint16_t pkts_copy; /* packets copied to elts. */
+ uint16_t pkts_sent; /* packets sent. */
+ uint16_t pkts_loop; /* packets sent on loop entry. */
+ uint16_t elts_free; /* available elts remain. */
+ uint16_t wqe_free; /* available wqe remain. */
+ uint16_t mbuf_off; /* data offset in current mbuf. */
+	uint16_t mbuf_nseg; /* number of remaining mbuf segments. */
+};
+
+/* TX queue descriptor. */
+__extension__
+struct mlx5_txq_data {
+ uint16_t elts_head; /* Current counter in (*elts)[]. */
+ uint16_t elts_tail; /* Counter of first element awaiting completion. */
+ uint16_t elts_comp; /* elts index since last completion request. */
+ uint16_t elts_s; /* Number of mbuf elements. */
+ uint16_t elts_m; /* Mask for mbuf elements indices. */
+ /* Fields related to elts mbuf storage. */
+ uint16_t wqe_ci; /* Consumer index for work queue. */
+ uint16_t wqe_pi; /* Producer index for work queue. */
+ uint16_t wqe_s; /* Number of WQ elements. */
+	uint16_t wqe_m; /* Mask for WQ element indices. */
+ uint16_t wqe_comp; /* WQE index since last completion request. */
+ uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
+ /* WQ related fields. */
+ uint16_t cq_ci; /* Consumer index for completion queue. */
+	uint16_t cq_pi; /* Producer index for completion queue. */
+ uint16_t cqe_s; /* Number of CQ elements. */
+ uint16_t cqe_m; /* Mask for CQ indices. */
+ /* CQ related fields. */
+ uint16_t elts_n:4; /* elts[] length (in log2). */
+ uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
+ uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
+ uint16_t tso_en:1; /* When set hardware TSO is enabled. */
+ uint16_t tunnel_en:1;
+ /* When set TX offload for tunneled packets are supported. */
+ uint16_t swp_en:1; /* Whether SW parser is enabled. */
+ uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
+ uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
+ uint16_t db_heu:1; /* Doorbell heuristic write barrier. */
+ uint16_t inlen_send; /* Ordinary send data inline size. */
+ uint16_t inlen_empw; /* eMPW max packet size to inline. */
+ uint16_t inlen_mode; /* Minimal data length to inline. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
+ uint64_t offloads; /* Offloads for Tx Queue. */
+ struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
+ struct mlx5_wqe *wqes; /* Work queue. */
+ struct mlx5_wqe *wqes_end; /* Work queue array limit. */
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ uint32_t *fcqs; /* Free completion queue (debug extended). */
+#else
+ uint16_t *fcqs; /* Free completion queue. */
+#endif
+ volatile struct mlx5_cqe *cqes; /* Completion queue. */
+ volatile uint32_t *qp_db; /* Work queue doorbell. */
+ volatile uint32_t *cq_db; /* Completion queue doorbell. */
+ uint16_t port_id; /* Port ID of device. */
+ uint16_t idx; /* Queue index. */
+ struct mlx5_txq_stats stats; /* TX queue counters. */
+#ifndef RTE_ARCH_64
+ rte_spinlock_t *uar_lock;
+ /* UAR access lock required for 32bit implementations */
+#endif
+ struct rte_mbuf *elts[0];
+ /* Storage for queued packets, must be the last field. */
+} __rte_cache_aligned;
+
+enum mlx5_txq_obj_type {
+ MLX5_TXQ_OBJ_TYPE_IBV, /* mlx5_txq_obj with ibv_wq. */
+ MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN,
+ /* mlx5_txq_obj with mlx5_devx_tq and hairpin support. */
+};
+
+enum mlx5_txq_type {
+ MLX5_TXQ_TYPE_STANDARD, /* Standard Tx queue. */
+	MLX5_TXQ_TYPE_HAIRPIN, /* Hairpin Tx queue. */
+};
+
+/* Verbs/DevX Tx queue elements. */
+struct mlx5_txq_obj {
+ LIST_ENTRY(mlx5_txq_obj) next; /* Pointer to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ struct mlx5_txq_ctrl *txq_ctrl; /* Pointer to the control queue. */
+ enum mlx5_txq_obj_type type; /* The txq object type. */
+ RTE_STD_C11
+ union {
+ struct {
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
+ };
+ struct {
+ struct mlx5_devx_obj *sq;
+			/* DevX object for the Send Queue (SQ). */
+ struct mlx5_devx_obj *tis; /* The TIS object. */
+ };
+ };
+};
+
+/* TX queue control descriptor. */
+struct mlx5_txq_ctrl {
+ LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
+ rte_atomic32_t refcnt; /* Reference counter. */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ enum mlx5_txq_type type; /* The txq ctrl type. */
+ unsigned int max_inline_data; /* Max inline data. */
+ unsigned int max_tso_header; /* Max TSO header size. */
+ struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */
+ struct mlx5_priv *priv; /* Back pointer to private data. */
+ off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
+ void *bf_reg; /* BlueFlame register from Verbs. */
+ uint16_t dump_file_n; /* Number of dump files. */
+ struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
+ struct mlx5_txq_data txq; /* Data path structure. */
+ /* Must be the last field in the structure, contains elts[]. */
+};
+
+#define MLX5_TX_BFREG(txq) \
+ (MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx])
+
+/* mlx5_rxq.c */
+
+extern uint8_t rss_hash_default_key[];
+
+int mlx5_check_mprq_support(struct rte_eth_dev *dev);
+int mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq);
+int mlx5_mprq_enabled(struct rte_eth_dev *dev);
+int mlx5_mprq_free_mp(struct rte_eth_dev *dev);
+int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev);
+int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp);
+int mlx5_rx_hairpin_queue_setup
+ (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf);
+void mlx5_rx_queue_release(void *dpdk_rxq);
+int mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev);
+void mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev);
+int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
+int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
+struct mlx5_rxq_obj *mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
+ enum mlx5_rxq_obj_type type);
+int mlx5_rxq_obj_verify(struct rte_eth_dev *dev);
+struct mlx5_rxq_ctrl *mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp);
+struct mlx5_rxq_ctrl *mlx5_rxq_hairpin_new
+ (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf);
+struct mlx5_rxq_ctrl *mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_rxq_verify(struct rte_eth_dev *dev);
+int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
+int mlx5_ind_table_obj_verify(struct rte_eth_dev *dev);
+uint32_t mlx5_hrxq_new(struct rte_eth_dev *dev,
+ const uint8_t *rss_key, uint32_t rss_key_len,
+ uint64_t hash_fields,
+ const uint16_t *queues, uint32_t queues_n,
+ int tunnel __rte_unused);
+uint32_t mlx5_hrxq_get(struct rte_eth_dev *dev,
+ const uint8_t *rss_key, uint32_t rss_key_len,
+ uint64_t hash_fields,
+ const uint16_t *queues, uint32_t queues_n);
+int mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hxrq_idx);
+int mlx5_hrxq_verify(struct rte_eth_dev *dev);
+enum mlx5_rxq_type mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx);
+struct mlx5_hrxq *mlx5_hrxq_drop_new(struct rte_eth_dev *dev);
+void mlx5_hrxq_drop_release(struct rte_eth_dev *dev);
+uint64_t mlx5_get_rx_port_offloads(void);
+uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev);
+
+/* mlx5_txq.c */
+
+int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_txconf *conf);
+int mlx5_tx_hairpin_queue_setup
+ (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf);
+void mlx5_tx_queue_release(void *dpdk_txq);
+int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
+struct mlx5_txq_obj *mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
+ enum mlx5_txq_obj_type type);
+struct mlx5_txq_obj *mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_obj_release(struct mlx5_txq_obj *txq_ibv);
+int mlx5_txq_obj_verify(struct rte_eth_dev *dev);
+struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_txconf *conf);
+struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
+ (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf);
+struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx);
+int mlx5_txq_verify(struct rte_eth_dev *dev);
+void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
+void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
+uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
+
+/* mlx5_rxtx.c */
+
+extern uint32_t mlx5_ptype_table[];
+extern uint8_t mlx5_cksum_table[];
+extern uint8_t mlx5_swp_types_table[];
+
+void mlx5_set_ptype_table(void);
+void mlx5_set_cksum_table(void);
+void mlx5_set_swp_types_table(void);
+uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
+void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
+__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec);
+void mlx5_mprq_buf_free_cb(void *addr, void *opaque);
+void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf);
+uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset);
+int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset);
+uint32_t mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id);
+void mlx5_dump_debug_information(const char *path, const char *title,
+ const void *buf, unsigned int len);
+int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
+ const struct mlx5_mp_arg_queue_state_modify *sm);
+void mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
+ struct rte_eth_rxq_info *qinfo);
+void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
+ struct rte_eth_txq_info *qinfo);
+int mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+ struct rte_eth_burst_mode *mode);
+int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+ struct rte_eth_burst_mode *mode);
+
+/* Vectorized version of mlx5_rxtx.c */
+int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
+int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
+uint16_t mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+
+/* mlx5_mr.c */
+
+void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl);
+uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
+uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb);
+uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
+ struct rte_mempool *mp);
+int mlx5_dma_map(struct rte_pci_device *pdev, void *addr, uint64_t iova,
+ size_t len);
+int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova,
+ size_t len);
+
+/**
+ * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
+ * 64bit architectures.
+ *
+ * @param val
+ * value to write in CPU endian format.
+ * @param addr
+ * Address to write to.
+ * @param lock
+ * Address of the lock to use for that UAR access.
+ */
+static __rte_always_inline void
+__mlx5_uar_write64_relaxed(uint64_t val, void *addr,
+ rte_spinlock_t *lock __rte_unused)
+{
+#ifdef RTE_ARCH_64
+ *(uint64_t *)addr = val;
+#else /* !RTE_ARCH_64 */
+ rte_spinlock_lock(lock);
+ *(uint32_t *)addr = val;
+ rte_io_wmb();
+ *((uint32_t *)addr + 1) = val >> 32;
+ rte_spinlock_unlock(lock);
+#endif
+}
+
+/**
+ * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
+ * 64bit architectures while guaranteeing that the store is ordered after
+ * the previously executed code.
+ *
+ * @param val
+ * value to write in CPU endian format.
+ * @param addr
+ * Address to write to.
+ * @param lock
+ * Address of the lock to use for that UAR access.
+ */
+static __rte_always_inline void
+__mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock)
+{
+ rte_io_wmb();
+ __mlx5_uar_write64_relaxed(val, addr, lock);
+}
+
+/* Assist macros, used instead of directly calling the functions they wrap. */
+#ifdef RTE_ARCH_64
+#define mlx5_uar_write64_relaxed(val, dst, lock) \
+ __mlx5_uar_write64_relaxed(val, dst, NULL)
+#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL)
+#else
+#define mlx5_uar_write64_relaxed(val, dst, lock) \
+ __mlx5_uar_write64_relaxed(val, dst, lock)
+#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock)
+#endif
+
+/**
+ * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the
+ * cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ * Pointer to mbuf.
+ *
+ * @return
+ * Memory pool where data is located for given mbuf.
+ */
+static inline struct rte_mempool *
+mlx5_mb2mp(struct rte_mbuf *buf)
+{
+ if (unlikely(RTE_MBUF_CLONED(buf)))
+ return rte_mbuf_from_indirect(buf)->pool;
+ return buf->pool;
+}
+
+/**
+ * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx
+ * as mempool is pre-configured and static.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Address to search.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr)
+{
+ struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ uint32_t lkey;
+
+ /* Linear search on MR cache array. */
+ lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX5_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half (Binary Search) on miss. */
+ return mlx5_rx_addr2mr_bh(rxq, addr);
+}
+
+#define mlx5_rx_mb2mr(rxq, mb) mlx5_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
+/**
+ * Query LKey from a packet buffer for Tx. If not found, add the mempool.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param mb
+ *   Pointer to mbuf whose buffer address is searched.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
+{
+ struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
+ uint32_t lkey;
+
+ /* Check generation bit to see if there's any change on existing MRs. */
+ if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
+ mlx5_mr_flush_local_cache(mr_ctrl);
+ /* Linear search on MR cache array. */
+ lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX5_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half on miss. */
+ return mlx5_tx_mb2mr_bh(txq, mb);
+}
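+
+/*
+ * Illustrative usage sketch (an assumption, not part of the original file):
+ * on the Tx data path the resolved LKey is written into the WQE data
+ * segment together with the buffer address, roughly as
+ *
+ *   dseg->lkey = mlx5_tx_mb2mr(txq, mbuf);
+ *   dseg->pbuf = rte_cpu_to_be_64(rte_pktmbuf_mtod(mbuf, uintptr_t));
+ *
+ * where dseg is a hypothetical pointer to the WQE data segment and the
+ * field names are assumptions used only for illustration.
+ */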
+
+/**
+ * Ring TX queue doorbell and flush the update if requested.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the last WQE posted in the NIC.
+ * @param cond
+ * Request for write memory barrier after BlueFlame update.
+ */
+static __rte_always_inline void
+mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
+ int cond)
+{
+ uint64_t *dst = MLX5_TX_BFREG(txq);
+ volatile uint64_t *src = ((volatile uint64_t *)wqe);
+
+ rte_cio_wmb();
+ *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci);
+ /* Ensure ordering between DB record and BF copy. */
+ rte_wmb();
+ mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock);
+ if (cond)
+ rte_wmb();
+}
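+
+/*
+ * Illustrative usage sketch (an assumption, not part of the original file):
+ * a Tx burst routine builds a batch of WQEs, advances txq->wqe_ci and then
+ * rings the doorbell once for the whole batch, e.g.
+ *
+ *   mlx5_tx_dbrec(txq, last_wqe);
+ *
+ * where last_wqe is a hypothetical pointer to the last WQE posted in the
+ * batch. The conditional variant above lets the caller skip the trailing
+ * write barrier after the BlueFlame copy.
+ */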
+
+/**
+ * Ring TX queue doorbell and flush the update by write memory barrier.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the last WQE posted in the NIC.
+ */
+static __rte_always_inline void
+mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+{
+ mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c
new file mode 100644
index 000000000..1518bdd5b
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#if defined RTE_ARCH_X86_64
+#include "mlx5_rxtx_vec_sse.h"
+#elif defined RTE_ARCH_ARM64
+#include "mlx5_rxtx_vec_neon.h"
+#elif defined RTE_ARCH_PPC_64
+#include "mlx5_rxtx_vec_altivec.h"
+#else
+#error "This should not be compiled if SIMD instructions are not supported."
+#endif
+
+/**
+ * Skip error packets.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+static uint16_t
+rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n)
+{
+ uint16_t n = 0;
+ unsigned int i;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t err_bytes = 0;
+#endif
+
+ for (i = 0; i < pkts_n; ++i) {
+ struct rte_mbuf *pkt = pkts[i];
+
+ if (pkt->packet_type == RTE_PTYPE_ALL_MASK || rxq->err_state) {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ err_bytes += PKT_LEN(pkt);
+#endif
+ rte_pktmbuf_free_seg(pkt);
+ } else {
+ pkts[n++] = pkt;
+ }
+ }
+ rxq->stats.idropped += (pkts_n - n);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Correct counters of errored completions. */
+ rxq->stats.ipackets -= (pkts_n - n);
+ rxq->stats.ibytes -= err_bytes;
+#endif
+ mlx5_rx_err_handle(rxq, 1);
+ return n;
+}
+
+/**
+ * DPDK callback for vectorized RX.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ uint16_t nb_rx;
+ uint64_t err = 0;
+
+ nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
+ if (unlikely(err | rxq->err_state))
+ nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
+ return nb_rx;
+}
+
+/**
+ * Check whether a RX queue can support vectorized RX.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __rte_cold
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
+{
+ struct mlx5_rxq_ctrl *ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
+ return -ENOTSUP;
+ if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
+ return -ENOTSUP;
+ if (rxq->lro)
+ return -ENOTSUP;
+ return 1;
+}
+
+/**
+ * Check whether a device can support vectorized RX.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __rte_cold
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint16_t i;
+
+ if (!priv->config.rx_vec_en)
+ return -ENOTSUP;
+ if (mlx5_mprq_enabled(dev))
+ return -ENOTSUP;
+	/* All the configured queues should support vectorized RX. */
+ for (i = 0; i < priv->rxqs_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+
+ if (!rxq)
+ continue;
+ if (mlx5_rxq_check_vec_support(rxq) < 0)
+ break;
+ }
+ if (i != priv->rxqs_n)
+ return -ENOTSUP;
+ return 1;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h
new file mode 100644
index 000000000..6ddcbfb0a
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_H_
+#define RTE_PMD_MLX5_RXTX_VEC_H_
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_autoconf.h"
+
+#include "mlx5_mr.h"
+
+/* HW checksum offload capabilities of vectorized Tx. */
+#define MLX5_VEC_TX_CKSUM_OFFLOAD_CAP \
+ (DEV_TX_OFFLOAD_IPV4_CKSUM | \
+ DEV_TX_OFFLOAD_UDP_CKSUM | \
+ DEV_TX_OFFLOAD_TCP_CKSUM | \
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)
+
+/*
+ * Compile time sanity check for vectorized functions.
+ */
+
+#define S_ASSERT_RTE_MBUF(s) \
+ static_assert(s, "A field of struct rte_mbuf is changed")
+#define S_ASSERT_MLX5_CQE(s) \
+ static_assert(s, "A field of struct mlx5_cqe is changed")
+
+/* rxq_cq_decompress_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) ==
+ offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) ==
+ offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, hash) ==
+ offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+/* rxq_cq_to_ptype_oflags_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, ol_flags) ==
+ offsetof(struct rte_mbuf, rearm_data) + 8);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, rearm_data) ==
+ RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+/* rxq_burst_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) ==
+ offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) ==
+ offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+#if (RTE_CACHE_LINE_SIZE == 128)
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 64);
+#else
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 0);
+#endif
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rx_hash_res) ==
+ offsetof(struct mlx5_cqe, pkt_info) + 12);
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rsvd1) + 11 ==
+ offsetof(struct mlx5_cqe, hdr_type_etc));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, vlan_info) ==
+ offsetof(struct mlx5_cqe, hdr_type_etc) + 2);
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, lro_num_seg) + 12 ==
+ offsetof(struct mlx5_cqe, byte_cnt));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) ==
+ RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==
+ offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
+
+/**
+ * Replenish buffers for RX in bulk.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param n
+ * Number of buffers to be replenished.
+ */
+static inline void
+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
+{
+ const uint16_t q_n = 1 << rxq->elts_n;
+ const uint16_t q_mask = q_n - 1;
+ uint16_t elts_idx = rxq->rq_ci & q_mask;
+ struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+ volatile struct mlx5_wqe_data_seg *wq =
+ &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
+ unsigned int i;
+
+ MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
+ MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
+ MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
+ MLX5_VPMD_DESCS_PER_LOOP);
+	/* Do not cross the queue end. */
+ n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
+ if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+ rxq->stats.rx_nombuf += n;
+ return;
+ }
+ for (i = 0; i < n; ++i) {
+ void *buf_addr;
+
+ /*
+		 * To support mbufs with externally attached data buffers,
+		 * read the buf_addr pointer directly instead of calling
+		 * rte_mbuf_buf_addr(). This touches the mbuf itself and
+		 * may impact performance.
+ */
+ buf_addr = elts[i]->buf_addr;
+ wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+ RTE_PKTMBUF_HEADROOM);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+ }
+ rxq->rq_ci += n;
+ /* Prevent overflowing into consumed mbufs. */
+ elts_idx = rxq->rq_ci & q_mask;
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+}
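+
+/*
+ * Worked example (illustrative only, assuming MLX5_VPMD_DESCS_PER_LOOP == 4):
+ * with q_n = 512, elts_idx = 448 and a request of n = 192 buffers, the clamp
+ * above yields n = RTE_MIN(192 - 4, 512 - 448) = 64, so the replenishment
+ * never wraps past the queue end; the next call starts again from index 0.
+ * The MLX5_VPMD_DESCS_PER_LOOP entries following the replenished region are
+ * then pointed at &rxq->fake_mbuf so a vectorized burst reading ahead never
+ * dereferences an already consumed mbuf.
+ */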
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
new file mode 100644
index 000000000..26715ef45
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -0,0 +1,1114 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_
+#define RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <rte_altivec.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param pkts_n
+ * Number of packets to be stored.
+ */
+static inline void
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+{
+ const uint16_t q_mask = (1 << rxq->elts_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ vector unsigned char mbp;
+
+ mbp = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos]);
+ *(vector unsigned char *)&pkts[pos] = mbp;
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+}
+
+/**
+ * Decompress a compressed completion and fill in mbufs in RX SW ring with data
+ * extracted from the title completion descriptor.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param cq
+ * Pointer to completion array having a compressed completion at first.
+ * @param elts
+ *   Pointer to the SW ring to be filled. The first mbuf has to be pre-built
+ *   from the title completion descriptor; its fields are copied to the rest
+ *   of the mbufs.
+ *
+ * @return
+ * Number of mini-CQEs successfully decompressed.
+ */
+static inline uint16_t
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+ struct rte_mbuf **elts)
+{
+ volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
+ struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ const vector unsigned char zero = (vector unsigned char){0};
+ /* Mask to shuffle from extracted mini CQE to mbuf. */
+ const vector unsigned char shuf_mask1 = (vector unsigned char){
+ -1, -1, -1, -1, /* skip packet_type */
+ 7, 6, -1, -1, /* bswap16, pkt_len */
+ 7, 6, /* bswap16, data_len */
+ -1, -1, /* skip vlan_tci */
+ 3, 2, 1, 0}; /* bswap32, rss */
+ const vector unsigned char shuf_mask2 = (vector unsigned char){
+ -1, -1, -1, -1, /* skip packet_type */
+ 15, 14, -1, -1, /* bswap16, pkt_len */
+ 15, 14, /* data_len, bswap16 */
+ -1, -1, /* skip vlan_tci */
+ 11, 10, 9, 8}; /* bswap32, rss */
+ /* Restore the compressed count. Must be 16 bits. */
+ const uint16_t mcqe_n = t_pkt->data_len +
+ (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ const vector unsigned char rearm =
+ (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&t_pkt->rearm_data);
+ const vector unsigned char rxdf =
+ (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&t_pkt->rx_descriptor_fields1);
+ const vector unsigned char crc_adj =
+ (vector unsigned char)(vector unsigned short){
+ 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0};
+ const vector unsigned short rxdf_sel_mask =
+ (vector unsigned short){
+ 0xffff, 0xffff, 0, 0, 0, 0xffff, 0, 0};
+ const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+ unsigned int pos;
+ unsigned int i;
+ unsigned int inv = 0;
+
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ const vector unsigned char ones = vec_splat_u8(-1);
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const vector unsigned char len_shuf_mask = (vector unsigned char){
+ 3, 2, 11, 10,
+ 7, 6, 15, 14,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1};
+#endif
+
+ /*
+ * A. load mCQEs into a 128bit register.
+ * B. store rearm data to mbuf.
+ * C. combine data from mCQEs with rx_descriptor_fields1.
+ * D. store rx_descriptor_fields1.
+ * E. store flow tag (rte_flow mark).
+ */
+ for (pos = 0; pos < mcqe_n; ) {
+ vector unsigned char mcqe1, mcqe2;
+ vector unsigned char rxdf1, rxdf2;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ const vector unsigned short mcqe_sel_mask =
+ (vector unsigned short){0, 0, 0xffff, 0xffff,
+ 0, 0, 0xfff, 0xffff};
+ const vector unsigned char lower_half = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 16,
+ 17, 20, 21, 24, 25, 28, 29};
+ const vector unsigned char upper_half = {
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ vector unsigned short left, right;
+ vector unsigned char byte_cnt, invalid_mask;
+ vector unsigned long lshift;
+ __attribute__((altivec(vector__)))
+ __attribute__((altivec(bool__)))
+ unsigned long long shmask;
+ const vector unsigned long shmax = {64, 64};
+#endif
+
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
+
+ /* A.1 load mCQEs into a 128bit register. */
+ mcqe1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&mcq[pos % 8]);
+ mcqe2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&mcq[pos % 8 + 2]);
+
+ /* B.1 store rearm data to mbuf. */
+ *(vector unsigned char *)
+ &elts[pos]->rearm_data = rearm;
+ *(vector unsigned char *)
+ &elts[pos + 1]->rearm_data = rearm;
+
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ rxdf1 = vec_perm(mcqe1, zero, shuf_mask1);
+ rxdf2 = vec_perm(mcqe1, zero, shuf_mask2);
+ rxdf1 = (vector unsigned char)
+ ((vector unsigned short)rxdf1 -
+ (vector unsigned short)crc_adj);
+ rxdf2 = (vector unsigned char)
+ ((vector unsigned short)rxdf2 -
+ (vector unsigned short)crc_adj);
+ rxdf1 = (vector unsigned char)
+ vec_sel((vector unsigned short)rxdf1,
+ (vector unsigned short)rxdf, rxdf_sel_mask);
+ rxdf2 = (vector unsigned char)
+ vec_sel((vector unsigned short)rxdf2,
+ (vector unsigned short)rxdf, rxdf_sel_mask);
+
+ /* D.1 store rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &elts[pos]->rx_descriptor_fields1 = rxdf1;
+ *(vector unsigned char *)
+ &elts[pos + 1]->rx_descriptor_fields1 = rxdf2;
+
+ /* B.1 store rearm data to mbuf. */
+ *(vector unsigned char *)
+ &elts[pos + 2]->rearm_data = rearm;
+ *(vector unsigned char *)
+ &elts[pos + 3]->rearm_data = rearm;
+
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ rxdf1 = vec_perm(mcqe2, zero, shuf_mask1);
+ rxdf2 = vec_perm(mcqe2, zero, shuf_mask2);
+ rxdf1 = (vector unsigned char)
+ ((vector unsigned short)rxdf1 -
+ (vector unsigned short)crc_adj);
+ rxdf2 = (vector unsigned char)
+ ((vector unsigned short)rxdf2 -
+ (vector unsigned short)crc_adj);
+ rxdf1 = (vector unsigned char)
+ vec_sel((vector unsigned short)rxdf1,
+ (vector unsigned short)rxdf, rxdf_sel_mask);
+ rxdf2 = (vector unsigned char)
+ vec_sel((vector unsigned short)rxdf2,
+ (vector unsigned short)rxdf, rxdf_sel_mask);
+
+ /* D.1 store rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &elts[pos + 2]->rx_descriptor_fields1 = rxdf1;
+ *(vector unsigned char *)
+ &elts[pos + 3]->rx_descriptor_fields1 = rxdf2;
+
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ invalid_mask = (vector unsigned char)(vector unsigned long){
+ (mcqe_n - pos) * sizeof(uint16_t) * 8, 0};
+
+ lshift =
+ vec_splat((vector unsigned long)invalid_mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ invalid_mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ invalid_mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)invalid_mask, shmask);
+
+ mcqe1 = (vector unsigned char)
+ vec_sro((vector unsigned short)mcqe1,
+ (vector unsigned char){32}),
+ byte_cnt = (vector unsigned char)
+ vec_sel((vector unsigned short)mcqe1,
+ (vector unsigned short)mcqe2, mcqe_sel_mask);
+ byte_cnt = vec_perm(byte_cnt, zero, len_shuf_mask);
+ byte_cnt = (vector unsigned char)
+ vec_andc((vector unsigned long)byte_cnt,
+ (vector unsigned long)invalid_mask);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ rcvd_byte += ((vector unsigned long)byte_cnt)[0];
+#endif
+
+ if (rxq->mark) {
+ /* E.1 store flow tag (rte_flow mark). */
+ elts[pos]->hash.fdir.hi = flow_tag;
+ elts[pos + 1]->hash.fdir.hi = flow_tag;
+ elts[pos + 2]->hash.fdir.hi = flow_tag;
+ elts[pos + 3]->hash.fdir.hi = flow_tag;
+ }
+ if (rxq->dynf_meta) {
+ int32_t offs = rxq->flow_meta_offset;
+ const uint32_t meta =
+ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);
+
+ /* Check if title packet has valid metadata. */
+ if (meta) {
+ MLX5_ASSERT(t_pkt->ol_flags &
+ rxq->flow_meta_mask);
+ *RTE_MBUF_DYNFIELD(elts[pos], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
+ uint32_t *) = meta;
+ }
+ }
+
+ pos += MLX5_VPMD_DESCS_PER_LOOP;
+ /* Move to next CQE and invalidate consumed CQEs. */
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ mcq = (void *)&(cq + pos)->pkt_info;
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
+ }
+
+ /* Invalidate the rest of CQEs. */
+ for (; inv < mcqe_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
+
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+
+ rxq->cq_ci += mcqe_n;
+ return mcqe_n;
+}
+
+/**
+ * Calculate packet type and offload flag for mbuf and store it.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param cqes[4]
+ * Array of four 16bytes completions extracted from the original completion
+ * descriptor.
+ * @param op_err
+ * Opcode vector having responder error status. Each field is 4B.
+ * @param pkts
+ * Pointer to array of packets to be filled.
+ */
+static inline void
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
+ vector unsigned char cqes[4], vector unsigned char op_err,
+ struct rte_mbuf **pkts)
+{
+ vector unsigned char pinfo0, pinfo1;
+ vector unsigned char pinfo, ptype;
+ vector unsigned char ol_flags = (vector unsigned char)
+ (vector unsigned int){
+ rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP,
+ rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP,
+ rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP,
+ rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP};
+ vector unsigned char cv_flags;
+ const vector unsigned char zero = (vector unsigned char){0};
+ const vector unsigned char ptype_mask =
+ (vector unsigned char)(vector unsigned int){
+ 0x0000fd06, 0x0000fd06, 0x0000fd06, 0x0000fd06};
+ const vector unsigned char ptype_ol_mask =
+ (vector unsigned char)(vector unsigned int){
+ 0x00000106, 0x00000106, 0x00000106, 0x00000106};
+ const vector unsigned char pinfo_mask =
+ (vector unsigned char)(vector unsigned int){
+ 0x00000003, 0x00000003, 0x00000003, 0x00000003};
+ const vector unsigned char cv_flag_sel = (vector unsigned char){
+ 0, (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+ (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), 0,
+ (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), 0,
+ (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ const vector unsigned char cv_mask =
+ (vector unsigned char)(vector unsigned int){
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED};
+ const vector unsigned char mbuf_init =
+ (vector unsigned char)vec_vsx_ld
+ (0, (vector unsigned char *)&rxq->mbuf_initializer);
+ const vector unsigned short rearm_sel_mask =
+ (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
+ vector unsigned char rearm0, rearm1, rearm2, rearm3;
+ uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
+
+ /* Extract pkt_info field. */
+ pinfo0 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[0],
+ (vector unsigned int)cqes[1]);
+ pinfo1 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[2],
+ (vector unsigned int)cqes[3]);
+ pinfo = (vector unsigned char)
+ vec_mergeh((vector unsigned long)pinfo0,
+ (vector unsigned long)pinfo1);
+
+ /* Extract hdr_type_etc field. */
+ pinfo0 = (vector unsigned char)
+ vec_mergel((vector unsigned int)cqes[0],
+ (vector unsigned int)cqes[1]);
+ pinfo1 = (vector unsigned char)
+ vec_mergel((vector unsigned int)cqes[2],
+ (vector unsigned int)cqes[3]);
+ ptype = (vector unsigned char)
+ vec_mergeh((vector unsigned long)pinfo0,
+ (vector unsigned long)pinfo1);
+
+ if (rxq->mark) {
+ const vector unsigned char pinfo_ft_mask =
+ (vector unsigned char)(vector unsigned int){
+ 0xffffff00, 0xffffff00, 0xffffff00, 0xffffff00};
+ const vector unsigned char fdir_flags =
+ (vector unsigned char)(vector unsigned int){
+ PKT_RX_FDIR, PKT_RX_FDIR,
+ PKT_RX_FDIR, PKT_RX_FDIR};
+ vector unsigned char fdir_id_flags =
+ (vector unsigned char)(vector unsigned int){
+ PKT_RX_FDIR_ID, PKT_RX_FDIR_ID,
+ PKT_RX_FDIR_ID, PKT_RX_FDIR_ID};
+ vector unsigned char flow_tag, invalid_mask;
+
+ flow_tag = (vector unsigned char)
+ vec_and((vector unsigned long)pinfo,
+ (vector unsigned long)pinfo_ft_mask);
+
+ /* Check if flow tag is non-zero then set PKT_RX_FDIR. */
+ invalid_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)flow_tag,
+ (vector unsigned int)zero);
+ ol_flags = (vector unsigned char)
+ vec_or((vector unsigned long)ol_flags,
+ (vector unsigned long)
+ vec_andc((vector unsigned long)fdir_flags,
+ (vector unsigned long)invalid_mask));
+
+ /* Mask out invalid entries. */
+ fdir_id_flags = (vector unsigned char)
+ vec_andc((vector unsigned long)fdir_id_flags,
+ (vector unsigned long)invalid_mask);
+
+		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
+ ol_flags = (vector unsigned char)
+ vec_or((vector unsigned long)ol_flags,
+ (vector unsigned long)
+ vec_andc((vector unsigned long)fdir_id_flags,
+ (vector unsigned long)
+ vec_cmpeq((vector unsigned int)flow_tag,
+ (vector unsigned int)pinfo_ft_mask)));
+ }
+ /*
+ * Merge the two fields to generate the following:
+ * bit[1] = l3_ok
+ * bit[2] = l4_ok
+ * bit[8] = cv
+ * bit[11:10] = l3_hdr_type
+ * bit[14:12] = l4_hdr_type
+ * bit[15] = ip_frag
+ * bit[16] = tunneled
+ * bit[17] = outer_l3_type
+ */
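+	/*
+	 * Illustrative scalar sketch (not the vector path below): per packet,
+	 * the merged bits above translate into offload flags roughly as
+	 *	if (f & (1 << 1))
+	 *		ol |= PKT_RX_IP_CKSUM_GOOD;
+	 *	if (f & (1 << 2))
+	 *		ol |= PKT_RX_L4_CKSUM_GOOD;
+	 *	if (f & (1 << 8))
+	 *		ol |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+	 * which the code below performs for four packets at once through the
+	 * cv_flag_sel byte table and shifts.
+	 */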
+ ptype = (vector unsigned char)
+ vec_and((vector unsigned long)ptype,
+ (vector unsigned long)ptype_mask);
+ pinfo = (vector unsigned char)
+ vec_and((vector unsigned long)pinfo,
+ (vector unsigned long)pinfo_mask);
+ pinfo = (vector unsigned char)
+ vec_sl((vector unsigned int)pinfo,
+ (vector unsigned int){16, 16, 16, 16});
+
+	/* Merge the fields into pinfo for the ol_flags calculation. */
+ pinfo = (vector unsigned char)
+ vec_or((vector unsigned long)ptype,
+ (vector unsigned long)pinfo);
+ ptype = (vector unsigned char)
+ vec_sr((vector unsigned int)pinfo,
+ (vector unsigned int){10, 10, 10, 10});
+ ptype = (vector unsigned char)
+ vec_packs((vector unsigned int)ptype,
+ (vector unsigned int)zero);
+
+ /* Errored packets will have RTE_PTYPE_ALL_MASK. */
+ op_err = (vector unsigned char)
+ vec_sr((vector unsigned short)op_err,
+ (vector unsigned short){8, 8, 8, 8, 8, 8, 8, 8});
+ ptype = (vector unsigned char)
+ vec_or((vector unsigned long)ptype,
+ (vector unsigned long)op_err);
+
+ pt_idx0 = (uint8_t)((vector unsigned char)ptype)[0];
+ pt_idx1 = (uint8_t)((vector unsigned char)ptype)[2];
+ pt_idx2 = (uint8_t)((vector unsigned char)ptype)[4];
+ pt_idx3 = (uint8_t)((vector unsigned char)ptype)[6];
+
+ pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
+ !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
+ pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
+ !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
+ pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
+ !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
+ pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
+ !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
+
+ /* Fill flags for checksum and VLAN. */
+ pinfo = (vector unsigned char)
+ vec_and((vector unsigned long)pinfo,
+ (vector unsigned long)ptype_ol_mask);
+ pinfo = vec_perm(cv_flag_sel, zero, pinfo);
+
+ /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
+ cv_flags = (vector unsigned char)
+ vec_sl((vector unsigned int)pinfo,
+ (vector unsigned int){9, 9, 9, 9});
+ cv_flags = (vector unsigned char)
+ vec_or((vector unsigned long)pinfo,
+ (vector unsigned long)cv_flags);
+
+ /* Move back flags to start from byte[0]. */
+ cv_flags = (vector unsigned char)
+ vec_sr((vector unsigned int)cv_flags,
+ (vector unsigned int){8, 8, 8, 8});
+
+ /* Mask out garbage bits. */
+ cv_flags = (vector unsigned char)
+ vec_and((vector unsigned long)cv_flags,
+ (vector unsigned long)cv_mask);
+
+ /* Merge to ol_flags. */
+ ol_flags = (vector unsigned char)
+ vec_or((vector unsigned long)ol_flags,
+ (vector unsigned long)cv_flags);
+
+ /* Merge mbuf_init and ol_flags. */
+ rearm0 = (vector unsigned char)
+ vec_sel((vector unsigned short)mbuf_init,
+ (vector unsigned short)
+ vec_slo((vector unsigned short)ol_flags,
+ (vector unsigned char){64}), rearm_sel_mask);
+ rearm1 = (vector unsigned char)
+ vec_sel((vector unsigned short)mbuf_init,
+ (vector unsigned short)
+ vec_slo((vector unsigned short)ol_flags,
+ (vector unsigned char){32}), rearm_sel_mask);
+ rearm2 = (vector unsigned char)
+ vec_sel((vector unsigned short)mbuf_init,
+ (vector unsigned short)ol_flags, rearm_sel_mask);
+ rearm3 = (vector unsigned char)
+ vec_sel((vector unsigned short)mbuf_init,
+ (vector unsigned short)
+ vec_sro((vector unsigned short)ol_flags,
+ (vector unsigned char){32}), rearm_sel_mask);
+
+ /* Write 8B rearm_data and 8B ol_flags. */
+ vec_vsx_st(rearm0, 0,
+ (vector unsigned char *)&pkts[0]->rearm_data);
+ vec_vsx_st(rearm1, 0,
+ (vector unsigned char *)&pkts[1]->rearm_data);
+ vec_vsx_st(rearm2, 0,
+ (vector unsigned char *)&pkts[2]->rearm_data);
+ vec_vsx_st(rearm3, 0,
+ (vector unsigned char *)&pkts[3]->rearm_data);
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes an mbuf, but
+ * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to the application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
+ *   one error packet to handle.
+ *
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
+ uint64_t *err)
+{
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint16_t repl_n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_idx = rxq->cq_ci & q_mask;
+ unsigned int elts_idx;
+ unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const vector unsigned char zero = (vector unsigned char){0};
+ const vector unsigned char ones = vec_splat_u8(-1);
+ const vector unsigned char owner_check =
+ (vector unsigned char)(vector unsigned long){
+ 0x0100000001000000LL, 0x0100000001000000LL};
+ const vector unsigned char opcode_check =
+ (vector unsigned char)(vector unsigned long){
+ 0xf0000000f0000000LL, 0xf0000000f0000000LL};
+ const vector unsigned char format_check =
+ (vector unsigned char)(vector unsigned long){
+ 0x0c0000000c000000LL, 0x0c0000000c000000LL};
+ const vector unsigned char resp_err_check =
+ (vector unsigned char)(vector unsigned long){
+ 0xe0000000e0000000LL, 0xe0000000e0000000LL};
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const vector unsigned char len_shuf_mask = (vector unsigned char){
+ 1, 0, 5, 4,
+ 9, 8, 13, 12,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1};
+#endif
+ /* Mask to shuffle from extracted CQE to mbuf. */
+ const vector unsigned char shuf_mask = (vector unsigned char){
+ 5, 4, /* bswap16, pkt_len */
+ -1, -1, /* zero out 2nd half of pkt_len */
+ 5, 4, /* bswap16, data_len */
+ 11, 10, /* bswap16, vlan+tci */
+ 15, 14, 13, 12, /* bswap32, rss */
+ 1, 2, 3, -1}; /* fdir.hi */
+ /* Mask to blend from the last Qword to the first DQword. */
+ const vector unsigned char blend_mask = (vector unsigned char){
+ -1, 0, 0, 0,
+ 0, 0, 0, 0,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1};
+ const vector unsigned char crc_adj =
+ (vector unsigned char)(vector unsigned short){
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0};
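+	/*
+	 * crc_adj sketch: when the HW keeps the FCS (rxq->crc_present), the
+	 * CQE byte count includes the 4-byte CRC, so per packet the scalar
+	 * equivalent of the vector subtraction below is
+	 *	pkt_len  -= rxq->crc_present * RTE_ETHER_CRC_LEN;
+	 *	data_len -= rxq->crc_present * RTE_ETHER_CRC_LEN;
+	 */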
+ const vector unsigned char flow_mark_adj =
+ (vector unsigned char)(vector unsigned int){
+ 0, 0, 0, rxq->mark * (-1)};
+ const vector unsigned short cqe_sel_mask1 =
+ (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
+ const vector unsigned short cqe_sel_mask2 =
+ (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
+
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch0(cq);
+ rte_prefetch0(cq + 1);
+ rte_prefetch0(cq + 2);
+ rte_prefetch0(cq + 3);
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+
+ repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
+ if (repl_n >= rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
+ /* See if there're unreturned mbufs from compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
+ rxq->rq_pi += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ pkts += rcvd_pkt;
+ }
+ elts_idx = rxq->rq_pi & q_mask;
+ elts = &(*rxq->elts)[elts_idx];
+ /* Not to overflow pkts array. */
+ pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ if (!pkts_n)
+ return rcvd_pkt;
+ /* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+
+ /*
+ * A. load first Qword (8bytes) in one loop.
+	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
+ * C. load remaining CQE data and extract necessary fields.
+	 * The final 16-byte cqes[] extracted from the original 64-byte CQE has
+	 * the following structure:
+ * struct {
+ * uint8_t pkt_info;
+ * uint8_t flow_tag[3];
+ * uint16_t byte_cnt;
+ * uint8_t rsvd4;
+ * uint8_t op_own;
+ * uint16_t hdr_type_etc;
+ * uint16_t vlan_info;
+	 *  uint32_t rx_hash_res;
+ * } c;
+ * D. fill in mbuf.
+ * E. get valid CQEs.
+ * F. find compressed CQE.
+ */
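+	/*
+	 * MLX5_VPMD_DESCS_PER_LOOP is 4, so each iteration of the loop below
+	 * handles four CQEs and four mbufs; the loop breaks early once a
+	 * compressed or invalid CQE is met within a group of four.
+	 */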
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP];
+ vector unsigned char cqe_tmp1, cqe_tmp2;
+ vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+ vector unsigned char op_own, op_own_tmp1, op_own_tmp2;
+ vector unsigned char opcode, owner_mask, invalid_mask;
+ vector unsigned char comp_mask;
+ vector unsigned char mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ const vector unsigned char lower_half = {
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 16, 17, 20, 21, 24, 25, 28, 29};
+ const vector unsigned char upper_half = {
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ const vector unsigned long shmax = {64, 64};
+ vector unsigned char byte_cnt;
+ vector unsigned short left, right;
+ vector unsigned long lshift;
+ vector __attribute__((altivec(bool__)))
+ unsigned long shmask;
+#endif
+ vector unsigned char mbp1, mbp2;
+ vector unsigned char p =
+ (vector unsigned char)(vector unsigned short){
+ 0, 1, 2, 3, 0, 0, 0, 0};
+ unsigned int p1, p2, p3;
+
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+ }
+
+ /* A.0 do not cross the end of CQ. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (pkts_n - pos) * sizeof(uint16_t) * 8, 0};
+
+ {
+ vector unsigned long lshift;
+ vector __attribute__((altivec(bool__)))
+ unsigned long shmask;
+ const vector unsigned long shmax = {64, 64};
+
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ }
+
+ p = (vector unsigned char)
+ vec_andc((vector unsigned long)p,
+ (vector unsigned long)mask);
+
+ /* A.1 load cqes. */
+ p3 = (unsigned int)((vector unsigned short)p)[3];
+ cqes[3] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p3].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ p2 = (unsigned int)((vector unsigned short)p)[2];
+ cqes[2] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p2].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ /* B.1 load mbuf pointers. */
+ mbp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos]);
+ mbp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos + 2]);
+
+ /* A.1 load a block having op_own. */
+ p1 = (unsigned int)((vector unsigned short)p)[1];
+ cqes[1] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p1].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ cqes[0] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ /* B.2 copy mbuf pointers. */
+ *(vector unsigned char *)&pkts[pos] = mbp1;
+ *(vector unsigned char *)&pkts[pos + 2] = mbp2;
+ rte_cio_rmb();
+
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = *(vector unsigned char *)
+ &cq[pos + p3].pkt_info;
+ cqe_tmp1 = *(vector unsigned char *)
+ &cq[pos + p2].pkt_info;
+ cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask);
+ cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask);
+ cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p3].csum);
+ cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p2].csum);
+ cqes[3] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[3],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+ cqes[2] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[2],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+ cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p3].rsvd3[9], 0LL};
+ cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p2].rsvd3[9], 0LL};
+ cqes[3] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[3],
+ (vector unsigned short)cqe_tmp2,
+ (vector unsigned short)cqe_sel_mask2);
+ cqes[2] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[2],
+ (vector unsigned short)cqe_tmp1,
+ (vector unsigned short)cqe_sel_mask2);
+
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask);
+ pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask);
+
+ /* C.3 adjust CRC length. */
+ pkt_mb3 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb3 -
+ (vector unsigned short)crc_adj);
+ pkt_mb2 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb2 -
+ (vector unsigned short)crc_adj);
+
+ /* C.4 adjust flow mark. */
+ pkt_mb3 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb3 +
+ (vector unsigned int)flow_mark_adj);
+ pkt_mb2 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb2 +
+ (vector unsigned int)flow_mark_adj);
+
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &pkts[pos + 3]->pkt_len = pkt_mb3;
+ *(vector unsigned char *)
+ &pkts[pos + 2]->pkt_len = pkt_mb2;
+
+ /* E.1 extract op_own field. */
+ op_own_tmp2 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[2],
+ (vector unsigned int)cqes[3]);
+
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = *(vector unsigned char *)
+ &cq[pos + p1].pkt_info;
+ cqe_tmp1 = *(vector unsigned char *)
+ &cq[pos].pkt_info;
+ cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask);
+		cqes[0] = vec_sel(cqes[0], cqe_tmp1, blend_mask);
+ cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p1].csum);
+ cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos].csum);
+ cqes[1] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[1],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+ cqes[0] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[0],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+ cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p1].rsvd3[9], 0LL};
+ cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos].rsvd3[9], 0LL};
+ cqes[1] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[1],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask2);
+ cqes[0] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[0],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask2);
+
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask);
+ pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask);
+
+ /* C.3 adjust CRC length. */
+ pkt_mb1 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb1 -
+ (vector unsigned short)crc_adj);
+ pkt_mb0 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb0 -
+ (vector unsigned short)crc_adj);
+
+ /* C.4 adjust flow mark. */
+ pkt_mb1 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb1 +
+ (vector unsigned int)flow_mark_adj);
+ pkt_mb0 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb0 +
+ (vector unsigned int)flow_mark_adj);
+
+ /* E.1 extract op_own byte. */
+ op_own_tmp1 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[0],
+ (vector unsigned int)cqes[1]);
+ op_own = (vector unsigned char)
+ vec_mergel((vector unsigned long)op_own_tmp1,
+ (vector unsigned long)op_own_tmp2);
+
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &pkts[pos + 1]->pkt_len = pkt_mb1;
+ *(vector unsigned char *)
+ &pkts[pos]->pkt_len = pkt_mb0;
+
+ /* E.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)owner_check);
+ if (ownership)
+ owner_mask = (vector unsigned char)
+ vec_xor((vector unsigned long)owner_mask,
+ (vector unsigned long)owner_check);
+ owner_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)owner_mask,
+ (vector unsigned int)owner_check);
+ owner_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)owner_mask,
+ (vector unsigned int)zero);
+
+ /* E.3 get mask for invalidated CQEs. */
+ opcode = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)opcode_check);
+ invalid_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)opcode_check,
+ (vector unsigned int)opcode);
+ invalid_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)invalid_mask,
+ (vector unsigned int)zero);
+
+ /* E.4 mask out beyond boundary. */
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* E.5 merge invalid_mask with invalid owner. */
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)owner_mask);
+
+ /* F.1 find compressed CQE format. */
+ comp_mask = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)format_check);
+ comp_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)comp_mask,
+ (vector unsigned int)format_check);
+ comp_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)comp_mask,
+ (vector unsigned int)zero);
+
+ /* F.2 mask out invalid entries. */
+ comp_mask = (vector unsigned char)
+ vec_andc((vector unsigned long)comp_mask,
+ (vector unsigned long)invalid_mask);
+ comp_idx = ((vector unsigned long)comp_mask)[0];
+
+ /* F.3 get the first compressed CQE. */
+ comp_idx = comp_idx ? __builtin_ctzll(comp_idx) /
+ (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP;
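+		/*
+		 * Each CQE occupies one 16-bit lane of comp_mask. For example,
+		 * if the CQE at index 2 of the group is the first compressed
+		 * one, the lower two lanes are zero, __builtin_ctzll() returns
+		 * 32 and comp_idx becomes 2. MLX5_VPMD_DESCS_PER_LOOP means no
+		 * compressed CQE in this group.
+		 */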
+
+ /* E.6 mask out entries after the compressed CQE. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (comp_idx * sizeof(uint16_t) * 8), 0};
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* E.7 count non-compressed valid CQEs. */
+ n = ((vector unsigned long)invalid_mask)[0];
+ n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ nocmp_n += n;
+
+ /* D.2 get the final invalid mask. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (n * sizeof(uint16_t) * 8), 0};
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* D.3 check error in opcode. */
+ opcode = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)resp_err_check,
+ (vector unsigned int)opcode);
+ opcode = (vector unsigned char)
+ vec_packs((vector unsigned int)opcode,
+ (vector unsigned int)zero);
+ opcode = (vector unsigned char)
+ vec_andc((vector unsigned long)opcode,
+ (vector unsigned long)invalid_mask);
+
+ /* D.4 mark if any error is set */
+ *err |= ((vector unsigned long)opcode)[0];
+
+ /* D.5 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+ if (rxq->hw_timestamp) {
+ pkts[pos]->timestamp =
+ rte_be_to_cpu_64(cq[pos].timestamp);
+ pkts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p1].timestamp);
+ pkts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p2].timestamp);
+ pkts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p3].timestamp);
+ }
+ if (rxq->dynf_meta) {
+ uint64_t flag = rxq->flow_meta_mask;
+ int32_t offs = rxq->flow_meta_offset;
+ uint32_t metadata;
+
+			/* This code is subject to further optimization. */
+ metadata = cq[pos].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ metadata;
+ pkts[pos]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 1].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 2].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 3].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = vec_perm(op_own, zero, len_shuf_mask);
+ byte_cnt = (vector unsigned char)
+ vec_andc((vector unsigned long)byte_cnt,
+ (vector unsigned long)invalid_mask);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ rcvd_byte += ((vector unsigned long)byte_cnt)[0];
+#endif
+
+ /*
+		 * Break the loop unless more valid CQEs are expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+ return rcvd_pkt;
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq->cq_ci += nocmp_n;
+ rxq->rq_pi += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed =
+ rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+ rxq->rq_pi += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+ rte_compiler_barrier();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ return rcvd_pkt;
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
new file mode 100644
index 000000000..ecafbf800
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -0,0 +1,780 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_
+#define RTE_PMD_MLX5_RXTX_VEC_NEON_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param pkts_n
+ * Number of packets to be stored.
+ */
+static inline void
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+{
+ const uint16_t q_mask = (1 << rxq->elts_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ uint64x2_t mbp;
+
+ mbp = vld1q_u64((void *)&elts[pos]);
+ vst1q_u64((void *)&pkts[pos], mbp);
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+}
+
+/**
+ * Decompress a compressed completion and fill in mbufs in RX SW ring with data
+ * extracted from the title completion descriptor.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param cq
+ * Pointer to completion array having a compressed completion at first.
+ * @param elts
+ * Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ * the title completion descriptor to be copied to the rest of mbufs.
+ *
+ * @return
+ * Number of mini-CQEs successfully decompressed.
+ */
+static inline uint16_t
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+ struct rte_mbuf **elts)
+{
+ volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
+ struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ unsigned int pos;
+ unsigned int i;
+ unsigned int inv = 0;
+ /* Mask to shuffle from extracted mini CQE to mbuf. */
+ const uint8x16_t mcqe_shuf_m1 = {
+ -1, -1, -1, -1, /* skip packet_type */
+ 7, 6, -1, -1, /* pkt_len, bswap16 */
+ 7, 6, /* data_len, bswap16 */
+ -1, -1, /* skip vlan_tci */
+ 3, 2, 1, 0 /* hash.rss, bswap32 */
+ };
+ const uint8x16_t mcqe_shuf_m2 = {
+ -1, -1, -1, -1, /* skip packet_type */
+ 15, 14, -1, -1, /* pkt_len, bswap16 */
+ 15, 14, /* data_len, bswap16 */
+ -1, -1, /* skip vlan_tci */
+ 11, 10, 9, 8 /* hash.rss, bswap32 */
+ };
+ /* Restore the compressed count. Must be 16 bits. */
+ const uint16_t mcqe_n = t_pkt->data_len +
+ (rxq->crc_present * RTE_ETHER_CRC_LEN);
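+	/*
+	 * For a compressed CQE the byte_cnt field of the title CQE carries
+	 * the number of mini-CQEs rather than a packet length; the regular
+	 * fill path already stored it in the title mbuf's data_len with the
+	 * CRC adjustment applied, so the adjustment is added back here to
+	 * recover the raw count.
+	 */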
+ const uint64x2_t rearm =
+ vld1q_u64((void *)&t_pkt->rearm_data);
+ const uint32x4_t rxdf_mask = {
+ 0xffffffff, /* packet_type */
+ 0, /* skip pkt_len */
+ 0xffff0000, /* vlan_tci, skip data_len */
+ 0, /* skip hash.rss */
+ };
+ const uint8x16_t rxdf =
+ vandq_u8(vld1q_u8((void *)&t_pkt->rx_descriptor_fields1),
+ vreinterpretq_u8_u32(rxdf_mask));
+ const uint16x8_t crc_adj = {
+ 0, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+ 0, 0
+ };
+ const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+#endif
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const uint8x8_t len_shuf_m = {
+ 7, 6, /* 1st mCQE */
+ 15, 14, /* 2nd mCQE */
+ 23, 22, /* 3rd mCQE */
+ 31, 30 /* 4th mCQE */
+ };
+
+ /*
+ * A. load mCQEs into a 128bit register.
+ * B. store rearm data to mbuf.
+ * C. combine data from mCQEs with rx_descriptor_fields1.
+ * D. store rx_descriptor_fields1.
+ * E. store flow tag (rte_flow mark).
+ */
+ for (pos = 0; pos < mcqe_n; ) {
+ uint8_t *p = (void *)&mcq[pos % 8];
+ uint8_t *e0 = (void *)&elts[pos]->rearm_data;
+ uint8_t *e1 = (void *)&elts[pos + 1]->rearm_data;
+ uint8_t *e2 = (void *)&elts[pos + 2]->rearm_data;
+ uint8_t *e3 = (void *)&elts[pos + 3]->rearm_data;
+ uint16x4_t byte_cnt;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint16x4_t invalid_mask =
+ vcreate_u16(mcqe_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL << ((mcqe_n - pos) *
+ sizeof(uint16_t) * 8) : 0);
+#endif
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
+ __asm__ volatile (
+ /* A.1 load mCQEs into a 128bit register. */
+ "ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
+ /* B.1 store rearm data to mbuf. */
+ "st1 {%[rearm].2d}, [%[e0]] \n\t"
+ "add %[e0], %[e0], #16 \n\t"
+ "st1 {%[rearm].2d}, [%[e1]] \n\t"
+ "add %[e1], %[e1], #16 \n\t"
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ "tbl v18.16b, {v16.16b}, %[mcqe_shuf_m1].16b \n\t"
+ "tbl v19.16b, {v16.16b}, %[mcqe_shuf_m2].16b \n\t"
+ "sub v18.8h, v18.8h, %[crc_adj].8h \n\t"
+ "sub v19.8h, v19.8h, %[crc_adj].8h \n\t"
+ "orr v18.16b, v18.16b, %[rxdf].16b \n\t"
+ "orr v19.16b, v19.16b, %[rxdf].16b \n\t"
+ /* D.1 store rx_descriptor_fields1. */
+ "st1 {v18.2d}, [%[e0]] \n\t"
+ "st1 {v19.2d}, [%[e1]] \n\t"
+ /* B.1 store rearm data to mbuf. */
+ "st1 {%[rearm].2d}, [%[e2]] \n\t"
+ "add %[e2], %[e2], #16 \n\t"
+ "st1 {%[rearm].2d}, [%[e3]] \n\t"
+ "add %[e3], %[e3], #16 \n\t"
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ "tbl v18.16b, {v17.16b}, %[mcqe_shuf_m1].16b \n\t"
+ "tbl v19.16b, {v17.16b}, %[mcqe_shuf_m2].16b \n\t"
+ "sub v18.8h, v18.8h, %[crc_adj].8h \n\t"
+ "sub v19.8h, v19.8h, %[crc_adj].8h \n\t"
+ "orr v18.16b, v18.16b, %[rxdf].16b \n\t"
+ "orr v19.16b, v19.16b, %[rxdf].16b \n\t"
+ /* D.1 store rx_descriptor_fields1. */
+ "st1 {v18.2d}, [%[e2]] \n\t"
+ "st1 {v19.2d}, [%[e3]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ "tbl %[byte_cnt].8b, {v16.16b - v17.16b}, %[len_shuf_m].8b \n\t"
+#endif
+ :[byte_cnt]"=&w"(byte_cnt)
+ :[mcq]"r"(p),
+ [rxdf]"w"(rxdf),
+ [rearm]"w"(rearm),
+ [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+ [mcqe_shuf_m1]"w"(mcqe_shuf_m1),
+ [mcqe_shuf_m2]"w"(mcqe_shuf_m2),
+ [crc_adj]"w"(crc_adj),
+ [len_shuf_m]"w"(len_shuf_m)
+ :"memory", "v16", "v17", "v18", "v19");
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+ rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+ if (rxq->mark) {
+ /* E.1 store flow tag (rte_flow mark). */
+ elts[pos]->hash.fdir.hi = flow_tag;
+ elts[pos + 1]->hash.fdir.hi = flow_tag;
+ elts[pos + 2]->hash.fdir.hi = flow_tag;
+ elts[pos + 3]->hash.fdir.hi = flow_tag;
+ }
+ if (rxq->dynf_meta) {
+ int32_t offs = rxq->flow_meta_offset;
+ const uint32_t meta =
+ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);
+
+ /* Check if title packet has valid metadata. */
+ if (meta) {
+ MLX5_ASSERT(t_pkt->ol_flags &
+ rxq->flow_meta_mask);
+ *RTE_MBUF_DYNFIELD(elts[pos], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
+ uint32_t *) = meta;
+ }
+ }
+ pos += MLX5_VPMD_DESCS_PER_LOOP;
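+		/*
+		 * Eight 8-byte mini-CQEs share one 64-byte CQE slot, hence the
+		 * pos % 8 indexing above; every eight entries the pointer moves
+		 * to the next slot and a batch of consumed completion ring
+		 * entries is invalidated.
+		 */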
+ /* Move to next CQE and invalidate consumed CQEs. */
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ mcq = (void *)&(cq + pos)->pkt_info;
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
+ }
+ /* Invalidate the rest of CQEs. */
+ for (; inv < mcqe_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ rxq->cq_ci += mcqe_n;
+ return mcqe_n;
+}
+
+/**
+ * Calculate packet type and offload flag for mbuf and store it.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param ptype_info
+ *   Array of four 4-byte packet type info fields extracted from the original
+ *   completion descriptor.
+ * @param flow_tag
+ *   Array of four 4-byte flow IDs extracted from the original completion
+ *   descriptor.
+ * @param op_err
+ * Opcode vector having responder error status. Each field is 4B.
+ * @param pkts
+ * Pointer to array of packets to be filled.
+ */
+static inline void
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
+ uint32x4_t ptype_info, uint32x4_t flow_tag,
+ uint16x4_t op_err, struct rte_mbuf **pkts)
+{
+ uint16x4_t ptype;
+ uint32x4_t pinfo, cv_flags;
+ uint32x4_t ol_flags =
+ vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP);
+ const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 };
+ const uint8x16_t cv_flag_sel = {
+ 0,
+ (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+ (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
+ 0,
+ (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
+ 0,
+ (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ const uint32x4_t cv_mask =
+ vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
+ const uint64x2_t mbuf_init = vld1q_u64
+ ((const uint64_t *)&rxq->mbuf_initializer);
+ uint64x2_t rearm0, rearm1, rearm2, rearm3;
+ uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
+
+ if (rxq->mark) {
+ const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
+ const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR);
+ uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID);
+ uint32x4_t invalid_mask;
+
+		/* Check if flow tag is non-zero, then set PKT_RX_FDIR. */
+ invalid_mask = vceqzq_u32(flow_tag);
+ ol_flags = vorrq_u32(ol_flags,
+ vbicq_u32(fdir_flags, invalid_mask));
+ /* Mask out invalid entries. */
+ fdir_id_flags = vbicq_u32(fdir_id_flags, invalid_mask);
+		/* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
+ ol_flags = vorrq_u32(ol_flags,
+ vbicq_u32(fdir_id_flags,
+ vceqq_u32(flow_tag, ft_def)));
+ }
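+	/*
+	 * Conceptual scalar form of the mark handling above, per packet:
+	 *	if (flow_tag) {
+	 *		ol_flags |= PKT_RX_FDIR;
+	 *		if (flow_tag != MLX5_FLOW_MARK_DEFAULT)
+	 *			ol_flags |= PKT_RX_FDIR_ID;
+	 *	}
+	 */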
+ /*
+ * ptype_info has the following:
+ * bit[1] = l3_ok
+ * bit[2] = l4_ok
+ * bit[8] = cv
+ * bit[11:10] = l3_hdr_type
+ * bit[14:12] = l4_hdr_type
+ * bit[15] = ip_frag
+ * bit[16] = tunneled
+ * bit[17] = outer_l3_type
+ */
+ ptype = vshrn_n_u32(ptype_info, 10);
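+	/*
+	 * Shifting right by 10 moves l3_hdr_type/l4_hdr_type/ip_frag/
+	 * tunneled/outer_l3_type down into the low byte of each lane, which
+	 * then serves as the index into mlx5_ptype_table[] (bit 6 of the
+	 * index being the tunnel indicator).
+	 */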
+ /* Errored packets will have RTE_PTYPE_ALL_MASK. */
+ ptype = vorr_u16(ptype, op_err);
+ pt_idx0 = vget_lane_u8(vreinterpret_u8_u16(ptype), 6);
+ pt_idx1 = vget_lane_u8(vreinterpret_u8_u16(ptype), 4);
+ pt_idx2 = vget_lane_u8(vreinterpret_u8_u16(ptype), 2);
+ pt_idx3 = vget_lane_u8(vreinterpret_u8_u16(ptype), 0);
+ pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
+ !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
+ pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
+ !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
+ pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
+ !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
+ pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
+ !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
+ /* Fill flags for checksum and VLAN. */
+ pinfo = vandq_u32(ptype_info, ptype_ol_mask);
+ pinfo = vreinterpretq_u32_u8(
+ vqtbl1q_u8(cv_flag_sel, vreinterpretq_u8_u32(pinfo)));
+ /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
+ cv_flags = vshlq_n_u32(pinfo, 9);
+ cv_flags = vorrq_u32(pinfo, cv_flags);
+ /* Move back flags to start from byte[0]. */
+ cv_flags = vshrq_n_u32(cv_flags, 8);
+ /* Mask out garbage bits. */
+ cv_flags = vandq_u32(cv_flags, cv_mask);
+ /* Merge to ol_flags. */
+ ol_flags = vorrq_u32(ol_flags, cv_flags);
+ /* Merge mbuf_init and ol_flags, and store. */
+ rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32
+ (vgetq_lane_u32(ol_flags, 3),
+ vreinterpretq_u32_u64(mbuf_init), 2));
+ rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32
+ (vgetq_lane_u32(ol_flags, 2),
+ vreinterpretq_u32_u64(mbuf_init), 2));
+ rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32
+ (vgetq_lane_u32(ol_flags, 1),
+ vreinterpretq_u32_u64(mbuf_init), 2));
+ rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32
+ (vgetq_lane_u32(ol_flags, 0),
+ vreinterpretq_u32_u64(mbuf_init), 2));
+
+ vst1q_u64((void *)&pkts[0]->rearm_data, rearm0);
+ vst1q_u64((void *)&pkts[1]->rearm_data, rearm1);
+ vst1q_u64((void *)&pkts[2]->rearm_data, rearm2);
+ vst1q_u64((void *)&pkts[3]->rearm_data, rearm3);
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes an mbuf, but
+ * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to the application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
+ *   one error packet to handle.
+ *
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
+ uint64_t *err)
+{
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint16_t repl_n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_idx = rxq->cq_ci & q_mask;
+ unsigned int elts_idx;
+ const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+ const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
+ const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
+ const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
+ const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);
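+	/*
+	 * op_own sub-fields assumed by the masks above: bit 0 is the
+	 * owner/phase bit, bits 3:2 the CQE format (0x0c matches a compressed
+	 * CQE) and bits 7:4 the opcode (0xe0 matches a responder error
+	 * completion).
+	 */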
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+#endif
+ /* Mask to generate 16B length vector. */
+ const uint8x8_t len_shuf_m = {
+ 52, 53, /* 4th CQE */
+ 36, 37, /* 3rd CQE */
+ 20, 21, /* 2nd CQE */
+ 4, 5 /* 1st CQE */
+ };
+ /* Mask to extract 16B data from a 64B CQE. */
+ const uint8x16_t cqe_shuf_m = {
+ 28, 29, /* hdr_type_etc */
+ 0, /* pkt_info */
+ -1, /* null */
+ 47, 46, /* byte_cnt, bswap16 */
+ 31, 30, /* vlan_info, bswap16 */
+ 15, 14, 13, 12, /* rx_hash_res, bswap32 */
+ 57, 58, 59, /* flow_tag */
+ 63 /* op_own */
+ };
+ /* Mask to generate 16B data for mbuf. */
+ const uint8x16_t mb_shuf_m = {
+ 4, 5, -1, -1, /* pkt_len */
+ 4, 5, /* data_len */
+ 6, 7, /* vlan_tci */
+ 8, 9, 10, 11, /* hash.rss */
+ 12, 13, 14, -1 /* hash.fdir.hi */
+ };
+ /* Mask to generate 16B owner vector. */
+ const uint8x8_t owner_shuf_m = {
+ 63, -1, /* 4th CQE */
+ 47, -1, /* 3rd CQE */
+ 31, -1, /* 2nd CQE */
+ 15, -1 /* 1st CQE */
+ };
+ /* Mask to generate a vector having packet_type/ol_flags. */
+ const uint8x16_t ptype_shuf_m = {
+ 48, 49, 50, -1, /* 4th CQE */
+ 32, 33, 34, -1, /* 3rd CQE */
+ 16, 17, 18, -1, /* 2nd CQE */
+ 0, 1, 2, -1 /* 1st CQE */
+ };
+ /* Mask to generate a vector having flow tags. */
+ const uint8x16_t ftag_shuf_m = {
+ 60, 61, 62, -1, /* 4th CQE */
+ 44, 45, 46, -1, /* 3rd CQE */
+ 28, 29, 30, -1, /* 2nd CQE */
+ 12, 13, 14, -1 /* 1st CQE */
+ };
+ const uint16x8_t crc_adj = {
+ 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0
+ };
+ const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
+
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch_non_temporal(cq);
+ rte_prefetch_non_temporal(cq + 1);
+ rte_prefetch_non_temporal(cq + 2);
+ rte_prefetch_non_temporal(cq + 3);
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+ repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
+ if (repl_n >= rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
+ /* See if there're unreturned mbufs from compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
+ rxq->rq_pi += rcvd_pkt;
+ pkts += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ }
+ elts_idx = rxq->rq_pi & q_mask;
+ elts = &(*rxq->elts)[elts_idx];
+ /* Not to overflow pkts array. */
+ pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ if (!pkts_n)
+ return rcvd_pkt;
+	/* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+ /*
+ * Note that vectors have reverse order - {v3, v2, v1, v0}, because
+ * there's no instruction to count trailing zeros. __builtin_clzl() is
+ * used instead.
+ *
+	 * A. copy 4 mbuf pointers from elts ring to returning pkts.
+ * B. load 64B CQE and extract necessary fields
+	 * The final 16-byte cqes[] extracted from the original 64-byte CQE has
+	 * the following structure:
+ * struct {
+ * uint16_t hdr_type_etc;
+ * uint8_t pkt_info;
+ * uint8_t rsvd;
+ * uint16_t byte_cnt;
+ * uint16_t vlan_info;
+	 *  uint32_t rx_hash_res;
+ * uint8_t flow_tag[3];
+ * uint8_t op_own;
+ * } c;
+ * C. fill in mbuf.
+ * D. get valid CQEs.
+ * E. find compressed CQE.
+ */
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ uint16x4_t op_own;
+ uint16x4_t opcode, owner_mask, invalid_mask;
+ uint16x4_t comp_mask;
+ uint16x4_t mask;
+ uint16x4_t byte_cnt;
+ uint32x4_t ptype_info, flow_tag;
+ register uint64x2_t c0, c1, c2, c3;
+ uint8_t *p0, *p1, *p2, *p3;
+ uint8_t *e0 = (void *)&elts[pos]->pkt_len;
+ uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
+ uint8_t *e2 = (void *)&elts[pos + 2]->pkt_len;
+ uint8_t *e3 = (void *)&elts[pos + 3]->pkt_len;
+ void *elts_p = (void *)&elts[pos];
+ void *pkts_p = (void *)&pkts[pos];
+
+ /* A.0 do not cross the end of CQ. */
+ mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> ((pkts_n - pos) *
+ sizeof(uint16_t) * 8) : 0);
+ p0 = (void *)&cq[pos].pkt_info;
+ p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
+ p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
+ p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+ /* B.0 (CQE 3) load a block having op_own. */
+ c3 = vld1q_u64((uint64_t *)(p3 + 48));
+ /* B.0 (CQE 2) load a block having op_own. */
+ c2 = vld1q_u64((uint64_t *)(p2 + 48));
+ /* B.0 (CQE 1) load a block having op_own. */
+ c1 = vld1q_u64((uint64_t *)(p1 + 48));
+ /* B.0 (CQE 0) load a block having op_own. */
+ c0 = vld1q_u64((uint64_t *)(p0 + 48));
+ /* Synchronize for loading the rest of blocks. */
+ rte_cio_rmb();
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
+ rte_prefetch_non_temporal(&cq[next]);
+ rte_prefetch_non_temporal(&cq[next + 1]);
+ rte_prefetch_non_temporal(&cq[next + 2]);
+ rte_prefetch_non_temporal(&cq[next + 3]);
+ }
+ __asm__ volatile (
+ /* B.1 (CQE 3) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+ /* B.2 (CQE 3) move the block having op_own. */
+ "mov v19.16b, %[c3].16b \n\t"
+ /* B.3 (CQE 3) extract 16B fields. */
+ "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 2) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+ /* B.4 (CQE 3) adjust CRC length. */
+ "sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 3) generate final structure for mbuf. */
+ "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 2) move the block having op_own. */
+ "mov v19.16b, %[c2].16b \n\t"
+ /* B.3 (CQE 2) extract 16B fields. */
+ "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 1) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+ /* B.4 (CQE 2) adjust CRC length. */
+ "sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 2) generate final structure for mbuf. */
+ "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 1) move the block having op_own. */
+ "mov v19.16b, %[c1].16b \n\t"
+ /* B.3 (CQE 1) extract 16B fields. */
+ "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 0) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+ /* B.4 (CQE 1) adjust CRC length. */
+ "sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 1) generate final structure for mbuf. */
+ "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 0) move the block having op_own. */
+ "mov v19.16b, %[c0].16b \n\t"
+ /* A.1 load mbuf pointers. */
+ "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
+ /* B.3 (CQE 0) extract 16B fields. */
+ "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.4 (CQE 0) adjust CRC length. */
+ "sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
+ /* D.1 extract op_own byte. */
+ "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
+ /* C.2 (CQE 3) adjust flow mark. */
+ "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v15.2d}, [%[e3]] \n\t"
+ /* C.2 (CQE 2) adjust flow mark. */
+ "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v14.2d}, [%[e2]] \n\t"
+ /* C.1 (CQE 0) generate final structure for mbuf. */
+ "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t"
+ /* C.2 (CQE 1) adjust flow mark. */
+ "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v13.2d}, [%[e1]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Extract byte_cnt. */
+ "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t"
+#endif
+ /* Extract ptype_info. */
+ "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t"
+ /* Extract flow_tag. */
+ "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t"
+ /* A.2 copy mbuf pointers. */
+ "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t"
+ /* C.2 (CQE 0) adjust flow mark. */
+ "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 0) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v12.2d}, [%[e0]] \n\t"
+ :[op_own]"=&w"(op_own),
+ [byte_cnt]"=&w"(byte_cnt),
+ [ptype_info]"=&w"(ptype_info),
+ [flow_tag]"=&w"(flow_tag)
+ :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
+ [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+ [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
+ [elts_p]"r"(elts_p),
+ [pkts_p]"r"(pkts_p),
+ [cqe_shuf_m]"w"(cqe_shuf_m),
+ [mb_shuf_m]"w"(mb_shuf_m),
+ [owner_shuf_m]"w"(owner_shuf_m),
+ [len_shuf_m]"w"(len_shuf_m),
+ [ptype_shuf_m]"w"(ptype_shuf_m),
+ [ftag_shuf_m]"w"(ftag_shuf_m),
+ [crc_adj]"w"(crc_adj),
+ [flow_mark_adj]"w"(flow_mark_adj)
+ :"memory",
+ "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23",
+ "v24", "v25");
+ /* D.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = vand_u16(op_own, owner_check);
+ owner_mask = vceq_u16(owner_mask, ownership);
+ /* D.3 get mask for invalidated CQEs. */
+ opcode = vand_u16(op_own, opcode_check);
+ invalid_mask = vceq_u16(opcode_check, opcode);
+ /* E.1 find compressed CQE format. */
+ comp_mask = vand_u16(op_own, format_check);
+ comp_mask = vceq_u16(comp_mask, format_check);
+ /* D.4 mask out beyond boundary. */
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.5 merge invalid_mask with invalid owner. */
+ invalid_mask = vorr_u16(invalid_mask, owner_mask);
+ /* E.2 mask out invalid entries. */
+ comp_mask = vbic_u16(comp_mask, invalid_mask);
+ /* E.3 get the first compressed CQE. */
+ comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+ comp_mask), 0)) /
+ (sizeof(uint16_t) * 8);
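+		/*
+		 * Lanes are reversed ({CQE3..CQE0}), so leading zeros of the
+		 * 64-bit view count from CQE0's side. For example, if the CQE
+		 * at index 2 is the first compressed one, the two upper lanes
+		 * are zero, __builtin_clzl() returns 32 and comp_idx is 2.
+		 */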
+ /* D.6 mask out entries after the compressed CQE. */
+ mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
+ 0);
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.7 count non-compressed valid CQEs. */
+ n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+ invalid_mask), 0)) / (sizeof(uint16_t) * 8);
+ nocmp_n += n;
+ /* D.2 get the final invalid mask. */
+ mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = vceq_u16(resp_err_check, opcode);
+ opcode = vbic_u16(opcode, invalid_mask);
+ /* D.4 mark if any error is set */
+ *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+ /* C.4 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+ opcode, &elts[pos]);
+ if (rxq->hw_timestamp) {
+ elts[pos]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ }
+ if (!!rxq->flow_meta_mask) {
+			/* This code is subject to further optimization. */
+ int32_t offs = rxq->flow_meta_offset;
+
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+ elts[pos]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+ elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+ elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+ elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+ rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+ /*
+		 * Break the loop unless more valid CQEs are expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+ return rcvd_pkt;
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq->cq_ci += nocmp_n;
+ rxq->rq_pi += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+ &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+ rxq->rq_pi += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ return rcvd_pkt;
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
new file mode 100644
index 000000000..6847ae782
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -0,0 +1,731 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <smmintrin.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param pkts_n
+ * Number of packets to be stored.
+ */
+static inline void
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+{
+ const uint16_t q_mask = (1 << rxq->elts_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ __m128i mbp;
+
+ mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
+ _mm_storeu_si128((__m128i *)&pkts[pos], mbp);
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+}
+
+/**
+ * Decompress a compressed completion and fill in mbufs in RX SW ring with data
+ * extracted from the title completion descriptor.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param cq
+ * Pointer to completion array having a compressed completion at first.
+ * @param elts
+ * Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ * the title completion descriptor to be copied to the rest of mbufs.
+ *
+ * @return
+ * Number of mini-CQEs successfully decompressed.
+ */
+static inline uint16_t
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+ struct rte_mbuf **elts)
+{
+ volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
+ struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+ unsigned int pos;
+ unsigned int i;
+ unsigned int inv = 0;
+ /* Mask to shuffle from extracted mini CQE to mbuf. */
+ const __m128i shuf_mask1 =
+ _mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */
+ -1, -1, /* skip vlan_tci */
+ 6, 7, /* data_len, bswap16 */
+ -1, -1, 6, 7, /* pkt_len, bswap16 */
+ -1, -1, -1, -1 /* skip packet_type */);
+ const __m128i shuf_mask2 =
+ _mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */
+ -1, -1, /* skip vlan_tci */
+ 14, 15, /* data_len, bswap16 */
+ -1, -1, 14, 15, /* pkt_len, bswap16 */
+ -1, -1, -1, -1 /* skip packet_type */);
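+	/*
+	 * _mm_set_epi8() lists bytes from lane 15 down to lane 0, and
+	 * _mm_shuffle_epi8() zeroes any destination byte whose mask byte has
+	 * its high bit set, so the -1 lanes above clear the skipped fields.
+	 */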
+ /* Restore the compressed count. Must be 16 bits. */
+ const uint16_t mcqe_n = t_pkt->data_len +
+ (rxq->crc_present * RTE_ETHER_CRC_LEN);
+ const __m128i rearm =
+ _mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
+ const __m128i rxdf =
+ _mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
+ const __m128i crc_adj =
+ _mm_set_epi16(0, 0, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN,
+ 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN,
+ 0, 0);
+ const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const __m128i len_shuf_mask =
+ _mm_set_epi8(-1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 14, 15, 6, 7,
+ 10, 11, 2, 3);
+#endif
+ /*
+ * A. load mCQEs into a 128bit register.
+ * B. store rearm data to mbuf.
+ * C. combine data from mCQEs with rx_descriptor_fields1.
+ * D. store rx_descriptor_fields1.
+ * E. store flow tag (rte_flow mark).
+ */
+ for (pos = 0; pos < mcqe_n; ) {
+ __m128i mcqe1, mcqe2;
+ __m128i rxdf1, rxdf2;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ __m128i byte_cnt, invalid_mask;
+#endif
+
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ if (likely(pos + i < mcqe_n))
+ rte_prefetch0((void *)(cq + pos + i));
+
+ /* A.1 load mCQEs into a 128bit register. */
+ mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
+ mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
+ /* B.1 store rearm data to mbuf. */
+ _mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
+ _mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
+ rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
+ rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
+ rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
+ rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
+ rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
+ /* D.1 store rx_descriptor_fields1. */
+ _mm_storeu_si128((__m128i *)
+ &elts[pos]->rx_descriptor_fields1,
+ rxdf1);
+ _mm_storeu_si128((__m128i *)
+ &elts[pos + 1]->rx_descriptor_fields1,
+ rxdf2);
+ /* B.1 store rearm data to mbuf. */
+ _mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
+ _mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
+ /* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+ rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
+ rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
+ rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
+ rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
+ rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
+ rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
+ /* D.1 store rx_descriptor_fields1. */
+ _mm_storeu_si128((__m128i *)
+ &elts[pos + 2]->rx_descriptor_fields1,
+ rxdf1);
+ _mm_storeu_si128((__m128i *)
+ &elts[pos + 3]->rx_descriptor_fields1,
+ rxdf2);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ invalid_mask = _mm_set_epi64x(0,
+ (mcqe_n - pos) *
+ sizeof(uint16_t) * 8);
+ invalid_mask = _mm_sll_epi64(ones, invalid_mask);
+ mcqe1 = _mm_srli_si128(mcqe1, 4);
+ byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+ byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
+ byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+ byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+ rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+ if (rxq->mark) {
+ /* E.1 store flow tag (rte_flow mark). */
+ elts[pos]->hash.fdir.hi = flow_tag;
+ elts[pos + 1]->hash.fdir.hi = flow_tag;
+ elts[pos + 2]->hash.fdir.hi = flow_tag;
+ elts[pos + 3]->hash.fdir.hi = flow_tag;
+ }
+ if (rxq->dynf_meta) {
+ int32_t offs = rxq->flow_meta_offset;
+ const uint32_t meta =
+ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);
+
+ /* Check if title packet has valid metadata. */
+ if (meta) {
+ MLX5_ASSERT(t_pkt->ol_flags &
+ rxq->flow_meta_mask);
+ *RTE_MBUF_DYNFIELD(elts[pos], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
+ uint32_t *) = meta;
+ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
+ uint32_t *) = meta;
+ }
+ }
+ pos += MLX5_VPMD_DESCS_PER_LOOP;
+ /* Move to next CQE and invalidate consumed CQEs. */
+ if (!(pos & 0x7) && pos < mcqe_n) {
+ mcq = (void *)(cq + pos);
+ for (i = 0; i < 8; ++i)
+ cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+ }
+ }
+ /* Invalidate the rest of CQEs. */
+ for (; inv < mcqe_n; ++inv)
+ cq[inv].op_own = MLX5_CQE_INVALIDATE;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += mcqe_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ rxq->cq_ci += mcqe_n;
+ return mcqe_n;
+}
+
+/**
+ * Calculate the packet type and offload flags for mbufs and store them.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param cqes[4]
+ * Array of four 16-byte completions extracted from the original completion
+ * descriptor.
+ * @param op_err
+ * Opcode vector having responder error status. Each field is 4B.
+ * @param pkts
+ * Pointer to array of packets to be filled.
+ */
+static inline void
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
+ __m128i op_err, struct rte_mbuf **pkts)
+{
+ __m128i pinfo0, pinfo1;
+ __m128i pinfo, ptype;
+ __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
+ rxq->hw_timestamp * PKT_RX_TIMESTAMP);
+ __m128i cv_flags;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ptype_mask =
+ _mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
+ const __m128i ptype_ol_mask =
+ _mm_set_epi32(0x106, 0x106, 0x106, 0x106);
+ const __m128i pinfo_mask =
+ _mm_set_epi32(0x3, 0x3, 0x3, 0x3);
+ const __m128i cv_flag_sel =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
+ (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
+ PKT_RX_L4_CKSUM_GOOD) >> 1),
+ 0,
+ (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
+ 0,
+ (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
+ (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+ 0);
+ const __m128i cv_mask =
+ _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+ PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
+ const __m128i mbuf_init =
+ _mm_load_si128((__m128i *)&rxq->mbuf_initializer);
+ __m128i rearm0, rearm1, rearm2, rearm3;
+ uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
+
+ /* Extract pkt_info field. */
+ pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
+ pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
+ pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
+ /* Extract hdr_type_etc field. */
+ pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
+ pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
+ ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
+ if (rxq->mark) {
+ const __m128i pinfo_ft_mask =
+ _mm_set_epi32(0xffffff00, 0xffffff00,
+ 0xffffff00, 0xffffff00);
+ const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
+ __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
+ __m128i flow_tag, invalid_mask;
+
+ flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
+ /* Check if flow tag is non-zero then set PKT_RX_FDIR. */
+ invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
+ ol_flags = _mm_or_si128(ol_flags,
+ _mm_andnot_si128(invalid_mask,
+ fdir_flags));
+ /* Mask out invalid entries. */
+ fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
+ /* Check if flow tag is MLX5_FLOW_MARK_DEFAULT. */
+ ol_flags = _mm_or_si128(ol_flags,
+ _mm_andnot_si128(
+ _mm_cmpeq_epi32(flow_tag,
+ pinfo_ft_mask),
+ fdir_id_flags));
+ }
+ /*
+ * Merge the two fields to generate the following:
+ * bit[1] = l3_ok
+ * bit[2] = l4_ok
+ * bit[8] = cv
+ * bit[11:10] = l3_hdr_type
+ * bit[14:12] = l4_hdr_type
+ * bit[15] = ip_frag
+ * bit[16] = tunneled
+ * bit[17] = outer_l3_type
+ */
+ ptype = _mm_and_si128(ptype, ptype_mask);
+ pinfo = _mm_and_si128(pinfo, pinfo_mask);
+ pinfo = _mm_slli_epi32(pinfo, 16);
+ /* Make pinfo hold the merged fields for the ol_flags calculation. */
+ pinfo = _mm_or_si128(ptype, pinfo);
+ ptype = _mm_srli_epi32(pinfo, 10);
+ ptype = _mm_packs_epi32(ptype, zero);
+ /* Errored packets will have RTE_PTYPE_ALL_MASK. */
+ op_err = _mm_srli_epi16(op_err, 8);
+ ptype = _mm_or_si128(ptype, op_err);
+ pt_idx0 = _mm_extract_epi8(ptype, 0);
+ pt_idx1 = _mm_extract_epi8(ptype, 2);
+ pt_idx2 = _mm_extract_epi8(ptype, 4);
+ pt_idx3 = _mm_extract_epi8(ptype, 6);
+ pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
+ !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
+ pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
+ !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
+ pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
+ !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
+ pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
+ !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
+ /* Fill flags for checksum and VLAN. */
+ pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
+ pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
+ /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
+ cv_flags = _mm_slli_epi32(pinfo, 9);
+ cv_flags = _mm_or_si128(pinfo, cv_flags);
+ /* Move back flags to start from byte[0]. */
+ cv_flags = _mm_srli_epi32(cv_flags, 8);
+ /* Mask out garbage bits. */
+ cv_flags = _mm_and_si128(cv_flags, cv_mask);
+ /* Merge to ol_flags. */
+ ol_flags = _mm_or_si128(ol_flags, cv_flags);
+ /* Merge mbuf_init and ol_flags. */
+ rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
+ rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
+ rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
+ rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
+ /* Write 8B rearm_data and 8B ol_flags. */
+ _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
+ _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
+ _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
+ _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
+}
+
+/**
+ * Receive a burst of packets. An errored completion also consumes an mbuf, but
+ * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to the application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ * Pointer to a flag. Set non-zero value if pkts array has at least one error
+ * packet to handle.
+ *
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
+ uint64_t *err)
+{
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint16_t repl_n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_idx = rxq->cq_ci & q_mask;
+ unsigned int elts_idx;
+ unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const __m128i owner_check =
+ _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
+ const __m128i opcode_check =
+ _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
+ const __m128i format_check =
+ _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
+ const __m128i resp_err_check =
+ _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const __m128i len_shuf_mask =
+ _mm_set_epi8(-1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 12, 13, 8, 9,
+ 4, 5, 0, 1);
+#endif
+ /* Mask to shuffle from extracted CQE to mbuf. */
+ const __m128i shuf_mask =
+ _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */
+ 12, 13, 14, 15, /* rss, bswap32 */
+ 10, 11, /* vlan_tci, bswap16 */
+ 4, 5, /* data_len, bswap16 */
+ -1, -1, /* zero out 2nd half of pkt_len */
+ 4, 5 /* pkt_len, bswap16 */);
+ /* Mask to blend from the last Qword to the first DQword. */
+ const __m128i blend_mask =
+ _mm_set_epi8(-1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, -1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+ const __m128i crc_adj =
+ _mm_set_epi16(0, 0, 0, 0, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN,
+ 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN);
+ const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch0(cq);
+ rte_prefetch0(cq + 1);
+ rte_prefetch0(cq + 2);
+ rte_prefetch0(cq + 3);
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+ repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
+ if (repl_n >= rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
+ /* See if there are unreturned mbufs from a compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
+ rxq->rq_pi += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ pkts += rcvd_pkt;
+ }
+ elts_idx = rxq->rq_pi & q_mask;
+ elts = &(*rxq->elts)[elts_idx];
+ /* Not to overflow pkts array. */
+ pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ if (!pkts_n)
+ return rcvd_pkt;
+ /* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+ /*
+ * A. load first Qword (8bytes) in one loop.
+ * B. copy 4 mbuf pointers from elts ring to returning pkts.
+ * C. load remaining CQE data and extract necessary fields.
+ *    The final 16-byte cqes[] extracted from the original 64-byte CQE has the
+ * following structure:
+ * struct {
+ * uint8_t pkt_info;
+ * uint8_t flow_tag[3];
+ * uint16_t byte_cnt;
+ * uint8_t rsvd4;
+ * uint8_t op_own;
+ * uint16_t hdr_type_etc;
+ * uint16_t vlan_info;
+ * uint32_t rx_has_res;
+ * } c;
+ * D. fill in mbuf.
+ * E. get valid CQEs.
+ * F. find compressed CQE.
+ */
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
+ __m128i cqe_tmp1, cqe_tmp2;
+ __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+ __m128i op_own, op_own_tmp1, op_own_tmp2;
+ __m128i opcode, owner_mask, invalid_mask;
+ __m128i comp_mask;
+ __m128i mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ __m128i byte_cnt;
+#endif
+ __m128i mbp1, mbp2;
+ __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+ unsigned int p1, p2, p3;
+
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+ }
+ /* A.0 do not cross the end of CQ. */
+ mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ p = _mm_andnot_si128(mask, p);
+ /* A.1 load cqes. */
+ p3 = _mm_extract_epi16(p, 3);
+ cqes[3] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p3].sop_drop_qpn);
+ rte_compiler_barrier();
+ p2 = _mm_extract_epi16(p, 2);
+ cqes[2] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p2].sop_drop_qpn);
+ rte_compiler_barrier();
+ /* B.1 load mbuf pointers. */
+ mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
+ mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
+ /* A.1 load a block having op_own. */
+ p1 = _mm_extract_epi16(p, 1);
+ cqes[1] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p1].sop_drop_qpn);
+ rte_compiler_barrier();
+ cqes[0] = _mm_loadl_epi64((__m128i *)
+ &cq[pos].sop_drop_qpn);
+ /* B.2 copy mbuf pointers. */
+ _mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
+ _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
+ rte_cio_rmb();
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
+ cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
+ cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
+ cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
+ cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
+ cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
+ cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
+ cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
+ cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
+ cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
+ cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
+ cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
+ pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
+ /* C.3 adjust CRC length. */
+ pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
+ pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
+ /* C.4 adjust flow mark. */
+ pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
+ pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
+ _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
+ /* E.1 extract op_own field. */
+ op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
+ cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
+ cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
+ cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
+ cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
+ cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
+ cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
+ cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
+ cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
+ cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
+ cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
+ cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
+ pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
+ /* C.3 adjust CRC length. */
+ pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
+ pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
+ /* C.4 adjust flow mark. */
+ pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
+ pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
+ /* E.1 extract op_own byte. */
+ op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
+ op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
+ _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
+ /* E.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = _mm_and_si128(op_own, owner_check);
+ if (ownership)
+ owner_mask = _mm_xor_si128(owner_mask, owner_check);
+ owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+ owner_mask = _mm_packs_epi32(owner_mask, zero);
+ /* E.3 get mask for invalidated CQEs. */
+ opcode = _mm_and_si128(op_own, opcode_check);
+ invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
+ invalid_mask = _mm_packs_epi32(invalid_mask, zero);
+ /* E.4 mask out beyond boundary. */
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* E.5 merge invalid_mask with invalid owner. */
+ invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
+ /* F.1 find compressed CQE format. */
+ comp_mask = _mm_and_si128(op_own, format_check);
+ comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
+ comp_mask = _mm_packs_epi32(comp_mask, zero);
+ /* F.2 mask out invalid entries. */
+ comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
+ comp_idx = _mm_cvtsi128_si64(comp_mask);
+ /* F.3 get the first compressed CQE. */
+ comp_idx = comp_idx ?
+ __builtin_ctzll(comp_idx) /
+ (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ /* E.6 mask out entries after the compressed CQE. */
+ mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* E.7 count non-compressed valid CQEs. */
+ n = _mm_cvtsi128_si64(invalid_mask);
+ n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ nocmp_n += n;
+ /* D.2 get the final invalid mask. */
+ mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
+ opcode = _mm_packs_epi32(opcode, zero);
+ opcode = _mm_andnot_si128(invalid_mask, opcode);
+ /* D.4 mark if any error is set */
+ *err |= _mm_cvtsi128_si64(opcode);
+ /* D.5 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+ if (rxq->hw_timestamp) {
+ pkts[pos]->timestamp =
+ rte_be_to_cpu_64(cq[pos].timestamp);
+ pkts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p1].timestamp);
+ pkts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p2].timestamp);
+ pkts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p3].timestamp);
+ }
+ if (rxq->dynf_meta) {
+ /* This code is subject to further optimization. */
+ int32_t offs = rxq->flow_meta_offset;
+
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ cq[pos].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ cq[pos + p1].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ cq[pos + p2].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ cq[pos + p3].flow_table_metadata;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+ pkts[pos]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+ pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+ pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+ pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
+ byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+ byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+ rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+ /*
+ * Break the loop unless a full set of valid non-compressed CQEs was
+ * found, i.e. stop on the first invalid or compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQEs were seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+ return rcvd_pkt;
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq->cq_ci += nocmp_n;
+ rxq->rq_pi += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+ &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+ rxq->rq_pi += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+ rte_compiler_barrier();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ return rcvd_pkt;
+}
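+
+/*
+ * Minimal usage sketch: one plausible way a burst wrapper could consume
+ * rxq_burst_v() and honor the contract documented above -- an errored
+ * completion still consumes an mbuf and is marked with RTE_PTYPE_ALL_MASK,
+ * so such mbufs must be dropped before packets reach the application.
+ * The wrapper name and the drop policy are hypothetical and do not
+ * represent the driver's actual error handling path.
+ */
+#ifdef MLX5_RXTX_VEC_SSE_USAGE_SKETCH
+static uint16_t
+sketch_rx_burst_vec(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		    uint16_t pkts_n)
+{
+	uint64_t err = 0;
+	uint16_t nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
+	uint16_t i;
+	uint16_t out = 0;
+
+	if (likely(!err))
+		return nb_rx;
+	/* Drop mbufs flagged as errored by the vector path. */
+	for (i = 0; i < nb_rx; ++i) {
+		if (pkts[i]->packet_type == RTE_PTYPE_ALL_MASK)
+			rte_pktmbuf_free(pkts[i]);
+		else
+			pkts[out++] = pkts[i];
+	}
+	return out;
+}
+#endif /* MLX5_RXTX_VEC_SSE_USAGE_SKETCH */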
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c
new file mode 100644
index 000000000..a79896cb3
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "rte_eal.h"
+#include "mlx5_utils.h"
+#include "mlx5.h"
+
+/* PMD socket service for tools. */
+
+int server_socket; /* Unix socket for primary process. */
+struct rte_intr_handle server_intr_handle; /* Interrupt handler. */
+
+static void
+mlx5_pmd_make_path(struct sockaddr_un *addr, int pid)
+{
+ snprintf(addr->sun_path, sizeof(addr->sun_path), "/var/tmp/dpdk_%s_%d",
+ MLX5_DRIVER_NAME, pid);
+}
+
+/**
+ * Handle server PMD socket interrupts (incoming connections from tools).
+ */
+static void
+mlx5_pmd_socket_handle(void *cb __rte_unused)
+{
+ int conn_sock;
+ int ret = -1;
+ struct cmsghdr *cmsg = NULL;
+ int data;
+ char buf[CMSG_SPACE(sizeof(int))] = { 0 };
+ struct iovec io = {
+ .iov_base = &data,
+ .iov_len = sizeof(data),
+ };
+ struct msghdr msg = {
+ .msg_iov = &io,
+ .msg_iovlen = 1,
+ .msg_control = buf,
+ .msg_controllen = sizeof(buf),
+ };
+ uint16_t port_id;
+ int fd;
+ FILE *file = NULL;
+ struct rte_eth_dev *dev;
+
+ /* Accept the connection from the client. */
+ conn_sock = accept(server_socket, NULL, NULL);
+ if (conn_sock < 0) {
+ DRV_LOG(WARNING, "connection failed: %s", strerror(errno));
+ return;
+ }
+ ret = recvmsg(conn_sock, &msg, MSG_WAITALL);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "wrong message received: %s",
+ strerror(errno));
+ goto error;
+ }
+ /* Receive file descriptor. */
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS ||
+ cmsg->cmsg_len < sizeof(int)) {
+ DRV_LOG(WARNING, "invalid file descriptor message");
+ goto error;
+ }
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+ file = fdopen(fd, "w");
+ if (!file) {
+ DRV_LOG(WARNING, "Failed to open file");
+ goto error;
+ }
+ /* Receive port number. */
+ if (msg.msg_iovlen != 1 || msg.msg_iov->iov_len < sizeof(uint16_t)) {
+ DRV_LOG(WARNING, "wrong port number message");
+ goto error;
+ }
+ memcpy(&port_id, msg.msg_iov->iov_base, sizeof(port_id));
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ DRV_LOG(WARNING, "Invalid port %u", port_id);
+ goto error;
+ }
+ /* Dump flow. */
+ dev = &rte_eth_devices[port_id];
+ ret = mlx5_flow_dev_dump(dev, file, NULL);
+ /* Set up the reply message (no ancillary data). */
+ msg.msg_controllen = 0;
+ msg.msg_control = NULL;
+ msg.msg_iovlen = 1;
+ msg.msg_iov = &io;
+ data = -ret;
+ io.iov_len = sizeof(data);
+ io.iov_base = &data;
+ do {
+ ret = sendmsg(conn_sock, &msg, 0);
+ } while (ret < 0 && errno == EINTR);
+ if (ret < 0)
+ DRV_LOG(WARNING, "failed to send response %s",
+ strerror(errno));
+error:
+ if (conn_sock > 0)
+ close(conn_sock);
+ if (file)
+ fclose(file);
+}
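+
+/*
+ * Minimal client-side sketch of the protocol handled above: an external
+ * tool connects to the Unix socket, sends the port number in the payload
+ * and an output file descriptor via SCM_RIGHTS, then reads back the
+ * status. The helper name and error handling are illustrative only; the
+ * socket path format mirrors mlx5_pmd_make_path().
+ */
+#ifdef MLX5_PMD_SOCKET_CLIENT_SKETCH
+static int
+sketch_request_flow_dump(const char *sock_path, uint16_t port_id, int out_fd)
+{
+	struct sockaddr_un addr = { .sun_family = AF_UNIX };
+	uint16_t port = port_id;
+	struct iovec io = { .iov_base = &port, .iov_len = sizeof(port) };
+	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
+	struct msghdr msg = {
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+		.msg_control = cbuf,
+		.msg_controllen = sizeof(cbuf),
+	};
+	struct cmsghdr *cmsg;
+	int sock;
+	int status = -1;
+
+	sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0)
+		return -errno;
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", sock_path);
+	if (connect(sock, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		close(sock);
+		return -errno;
+	}
+	/* Attach the output file descriptor as ancillary data. */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	memcpy(CMSG_DATA(cmsg), &out_fd, sizeof(out_fd));
+	/* The server replies with the negated mlx5_flow_dev_dump() result. */
+	if (sendmsg(sock, &msg, 0) < 0 ||
+	    read(sock, &status, sizeof(status)) < 0)
+		status = -errno;
+	close(sock);
+	return status;
+}
+#endif /* MLX5_PMD_SOCKET_CLIENT_SKETCH */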
+
+/**
+ * Install interrupt handler.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_pmd_interrupt_handler_install(void)
+{
+ MLX5_ASSERT(server_socket);
+ server_intr_handle.fd = server_socket;
+ server_intr_handle.type = RTE_INTR_HANDLE_EXT;
+ return rte_intr_callback_register(&server_intr_handle,
+ mlx5_pmd_socket_handle, NULL);
+}
+
+/**
+ * Uninstall interrupt handler.
+ */
+static void
+mlx5_pmd_interrupt_handler_uninstall(void)
+{
+ if (server_socket) {
+ mlx5_intr_callback_unregister(&server_intr_handle,
+ mlx5_pmd_socket_handle,
+ NULL);
+ }
+ server_intr_handle.fd = 0;
+ server_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+}
+
+/**
+ * Initialise the server socket used by external tools (e.g. flow dump).
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int
+mlx5_pmd_socket_init(void)
+{
+ struct sockaddr_un sun = {
+ .sun_family = AF_UNIX,
+ };
+ int ret = -1;
+ int flags;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (server_socket)
+ return 0;
+ /* Initialize the server socket used by external tools. */
+ ret = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "Failed to open mlx5 socket: %s",
+ strerror(errno));
+ goto error;
+ }
+ server_socket = ret;
+ flags = fcntl(server_socket, F_GETFL, 0);
+ if (flags == -1)
+ goto error;
+ ret = fcntl(server_socket, F_SETFL, flags | O_NONBLOCK);
+ if (ret < 0)
+ goto error;
+ mlx5_pmd_make_path(&sun, getpid());
+ remove(sun.sun_path);
+ ret = bind(server_socket, (const struct sockaddr *)&sun, sizeof(sun));
+ if (ret < 0) {
+ DRV_LOG(WARNING,
+ "cannot bind mlx5 socket: %s", strerror(errno));
+ goto close;
+ }
+ ret = listen(server_socket, 0);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "cannot listen on mlx5 socket: %s",
+ strerror(errno));
+ goto close;
+ }
+ if (mlx5_pmd_interrupt_handler_install()) {
+ DRV_LOG(WARNING, "cannot register interrupt handler for mlx5 socket: %s",
+ strerror(errno));
+ goto close;
+ }
+ return 0;
+close:
+ remove(sun.sun_path);
+error:
+	if (server_socket)
+		claim_zero(close(server_socket));
+ server_socket = 0;
+ DRV_LOG(ERR, "Cannot initialize socket: %s", strerror(errno));
+ return -errno;
+}
+
+/**
+ * Un-initialize the PMD socket.
+ */
+RTE_FINI(mlx5_pmd_socket_uninit)
+{
+ if (!server_socket)
+ return;
+ mlx5_pmd_interrupt_handler_uninstall();
+ claim_zero(close(server_socket));
+ server_socket = 0;
+ MKSTR(path, "/var/tmp/dpdk_%s_%d", MLX5_DRIVER_NAME, getpid());
+ claim_zero(remove(path));
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c
new file mode 100644
index 000000000..b4ca6922a
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c
@@ -0,0 +1,589 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <linux/sockios.h>
+#include <linux/ethtool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <rte_ethdev_driver.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+
+static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
+ {
+ .dpdk_name = "rx_port_unicast_bytes",
+ .ctr_name = "rx_vport_unicast_bytes",
+ },
+ {
+ .dpdk_name = "rx_port_multicast_bytes",
+ .ctr_name = "rx_vport_multicast_bytes",
+ },
+ {
+ .dpdk_name = "rx_port_broadcast_bytes",
+ .ctr_name = "rx_vport_broadcast_bytes",
+ },
+ {
+ .dpdk_name = "rx_port_unicast_packets",
+ .ctr_name = "rx_vport_unicast_packets",
+ },
+ {
+ .dpdk_name = "rx_port_multicast_packets",
+ .ctr_name = "rx_vport_multicast_packets",
+ },
+ {
+ .dpdk_name = "rx_port_broadcast_packets",
+ .ctr_name = "rx_vport_broadcast_packets",
+ },
+ {
+ .dpdk_name = "tx_port_unicast_bytes",
+ .ctr_name = "tx_vport_unicast_bytes",
+ },
+ {
+ .dpdk_name = "tx_port_multicast_bytes",
+ .ctr_name = "tx_vport_multicast_bytes",
+ },
+ {
+ .dpdk_name = "tx_port_broadcast_bytes",
+ .ctr_name = "tx_vport_broadcast_bytes",
+ },
+ {
+ .dpdk_name = "tx_port_unicast_packets",
+ .ctr_name = "tx_vport_unicast_packets",
+ },
+ {
+ .dpdk_name = "tx_port_multicast_packets",
+ .ctr_name = "tx_vport_multicast_packets",
+ },
+ {
+ .dpdk_name = "tx_port_broadcast_packets",
+ .ctr_name = "tx_vport_broadcast_packets",
+ },
+ {
+ .dpdk_name = "rx_wqe_err",
+ .ctr_name = "rx_wqe_err",
+ },
+ {
+ .dpdk_name = "rx_crc_errors_phy",
+ .ctr_name = "rx_crc_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_in_range_len_errors_phy",
+ .ctr_name = "rx_in_range_len_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_symbol_err_phy",
+ .ctr_name = "rx_symbol_err_phy",
+ },
+ {
+ .dpdk_name = "tx_errors_phy",
+ .ctr_name = "tx_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_out_of_buffer",
+ .ctr_name = "out_of_buffer",
+ .ib = 1,
+ },
+ {
+ .dpdk_name = "tx_packets_phy",
+ .ctr_name = "tx_packets_phy",
+ },
+ {
+ .dpdk_name = "rx_packets_phy",
+ .ctr_name = "rx_packets_phy",
+ },
+ {
+ .dpdk_name = "tx_discards_phy",
+ .ctr_name = "tx_discards_phy",
+ },
+ {
+ .dpdk_name = "rx_discards_phy",
+ .ctr_name = "rx_discards_phy",
+ },
+ {
+ .dpdk_name = "tx_bytes_phy",
+ .ctr_name = "tx_bytes_phy",
+ },
+ {
+ .dpdk_name = "rx_bytes_phy",
+ .ctr_name = "rx_bytes_phy",
+ },
+ /* Representor only */
+ {
+ .dpdk_name = "rx_packets",
+ .ctr_name = "vport_rx_packets",
+ },
+ {
+ .dpdk_name = "rx_bytes",
+ .ctr_name = "vport_rx_bytes",
+ },
+ {
+ .dpdk_name = "tx_packets",
+ .ctr_name = "vport_tx_packets",
+ },
+ {
+ .dpdk_name = "tx_bytes",
+ .ctr_name = "vport_tx_bytes",
+ },
+};
+
+static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
+
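+/**
+ * Read a single IB counter from sysfs (hw_counters).
+ *
+ * @param priv
+ * Pointer to the device private data.
+ * @param ctr_name
+ * Counter file name under <ibdev_path>/ports/<port>/hw_counters/.
+ * @param[out] stat
+ * Read counter value, set to 0 when the counter cannot be read.
+ *
+ * @return
+ * 0 on success, 1 if the counter could not be read.
+ */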
+static inline int
+mlx5_read_ib_stat(struct mlx5_priv *priv, const char *ctr_name, uint64_t *stat)
+{
+ int fd;
+
+ if (priv->sh) {
+ MKSTR(path, "%s/ports/%d/hw_counters/%s",
+ priv->sh->ibdev_path,
+ priv->ibv_port,
+ ctr_name);
+ fd = open(path, O_RDONLY);
+ if (fd != -1) {
+ char buf[21] = {'\0'};
+ ssize_t n = read(fd, buf, sizeof(buf));
+
+ close(fd);
+ if (n != -1) {
+ *stat = strtoull(buf, NULL, 10);
+ return 0;
+ }
+ }
+ }
+ *stat = 0;
+ return 1;
+}
+
+/**
+ * Read device counters table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] stats
+ * Counters table output buffer.
+ *
+ * @return
+ * 0 on success and stats is filled, negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ unsigned int i;
+ struct ifreq ifr;
+ unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
+ unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
+ struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
+ int ret;
+
+ et_stats->cmd = ETHTOOL_GSTATS;
+ et_stats->n_stats = xstats_ctrl->stats_n;
+ ifr.ifr_data = (caddr_t)et_stats;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u unable to read statistic values from device",
+ dev->data->port_id);
+ return ret;
+ }
+ for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+ if (xstats_ctrl->info[i].ib) {
+ ret = mlx5_read_ib_stat(priv,
+ xstats_ctrl->info[i].ctr_name,
+ &stats[i]);
+ /* Return the last xstats counter value if the read fails. */
+ if (ret == 0)
+ xstats_ctrl->xstats[i] = stats[i];
+ else
+ stats[i] = xstats_ctrl->xstats[i];
+ } else {
+ stats[i] = (uint64_t)
+ et_stats->data[xstats_ctrl->dev_table_idx[i]];
+ }
+ }
+ return 0;
+}
+
+/**
+ * Query the number of statistics provided by ETHTOOL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Number of statistics on success, negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev)
+{
+ struct ethtool_drvinfo drvinfo;
+ struct ifreq ifr;
+ int ret;
+
+ drvinfo.cmd = ETHTOOL_GDRVINFO;
+ ifr.ifr_data = (caddr_t)&drvinfo;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u unable to query number of statistics",
+ dev->data->port_id);
+ return ret;
+ }
+ return drvinfo.n_stats;
+}
+
+/**
+ * Init the structures to read device counters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_stats_init(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+ unsigned int i;
+ unsigned int j;
+ struct ifreq ifr;
+ struct ethtool_gstrings *strings = NULL;
+ unsigned int dev_stats_n;
+ unsigned int str_sz;
+ int ret;
+
+ /* Reset the count so values do not aggregate across repeated init calls. */
+ xstats_ctrl->mlx5_stats_n = 0;
+ ret = mlx5_ethtool_get_stats_n(dev);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "port %u no extended statistics available",
+ dev->data->port_id);
+ return;
+ }
+ dev_stats_n = ret;
+ /* Allocate memory to grab stat names and values. */
+ str_sz = dev_stats_n * ETH_GSTRING_LEN;
+ strings = (struct ethtool_gstrings *)
+ rte_malloc("xstats_strings",
+ str_sz + sizeof(struct ethtool_gstrings), 0);
+ if (!strings) {
+ DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
+ dev->data->port_id);
+ return;
+ }
+ strings->cmd = ETHTOOL_GSTRINGS;
+ strings->string_set = ETH_SS_STATS;
+ strings->len = dev_stats_n;
+ ifr.ifr_data = (caddr_t)strings;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u unable to get statistic names",
+ dev->data->port_id);
+ goto free;
+ }
+ for (i = 0; i != dev_stats_n; ++i) {
+ const char *curr_string = (const char *)
+ &strings->data[i * ETH_GSTRING_LEN];
+
+ for (j = 0; j != xstats_n; ++j) {
+ if (!strcmp(mlx5_counters_init[j].ctr_name,
+ curr_string)) {
+ unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+ xstats_ctrl->dev_table_idx[idx] = i;
+ xstats_ctrl->info[idx] = mlx5_counters_init[j];
+ break;
+ }
+ }
+ }
+ /* Add IB counters. */
+ for (i = 0; i != xstats_n; ++i) {
+ if (mlx5_counters_init[i].ib) {
+ unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+ xstats_ctrl->info[idx] = mlx5_counters_init[i];
+ xstats_ctrl->hw_stats[idx] = 0;
+ }
+ }
+ MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
+ xstats_ctrl->stats_n = dev_stats_n;
+ /* Take the initial snapshot as the base values. */
+ ret = mlx5_read_dev_counters(dev, xstats_ctrl->base);
+ if (ret)
+ DRV_LOG(ERR, "port %u cannot read device counters: %s",
+ dev->data->port_id, strerror(rte_errno));
+ mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
+ stats_ctrl->imissed = 0;
+free:
+ rte_free(strings);
+}
+
+/**
+ * DPDK callback to get extended device statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] stats
+ * Pointer to rte extended stats table.
+ * @param n
+ * The size of the stats table.
+ *
+ * @return
+ * Number of extended stats on success and stats is filled,
+ * negative on error and rte_errno is set.
+ */
+int
+mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats,
+ unsigned int n)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ uint64_t counters[n];
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ uint16_t mlx5_stats_n = xstats_ctrl->mlx5_stats_n;
+
+ if (n >= mlx5_stats_n && stats) {
+ int stats_n;
+ int ret;
+
+ stats_n = mlx5_ethtool_get_stats_n(dev);
+ if (stats_n < 0)
+ return stats_n;
+ if (xstats_ctrl->stats_n != stats_n)
+ mlx5_stats_init(dev);
+ ret = mlx5_read_dev_counters(dev, counters);
+ if (ret)
+ return ret;
+ for (i = 0; i != mlx5_stats_n; ++i) {
+ stats[i].id = i;
+ if (xstats_ctrl->info[i].ib) {
+ uint64_t wrap_n;
+ uint64_t hw_stat = xstats_ctrl->hw_stats[i];
+
+ stats[i].value = (counters[i] -
+ xstats_ctrl->base[i]) &
+ (uint64_t)UINT32_MAX;
+ wrap_n = hw_stat >> 32;
+ if (stats[i].value <
+ (hw_stat & (uint64_t)UINT32_MAX))
+ wrap_n++;
+ stats[i].value |= (wrap_n) << 32;
+ xstats_ctrl->hw_stats[i] = stats[i].value;
+ } else {
+ stats[i].value =
+ (counters[i] - xstats_ctrl->base[i]);
+ }
+ }
+ }
+ return mlx5_stats_n;
+}
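+
+/*
+ * Minimal sketch of the 32-bit wrap extension used above for IB counters,
+ * written as a standalone helper. "prev" is the previously reported 64-bit
+ * value and "cur32" the new 32-bit delta (counter - base). The helper name
+ * is illustrative only.
+ */
+#ifdef MLX5_STATS_WRAP_SKETCH
+static uint64_t
+sketch_extend_counter32(uint64_t prev, uint64_t cur32)
+{
+	uint64_t wrap_n = prev >> 32;
+
+	/* A lower 32-bit part smaller than before means the HW wrapped. */
+	if ((cur32 & (uint64_t)UINT32_MAX) < (prev & (uint64_t)UINT32_MAX))
+		wrap_n++;
+	return (cur32 & (uint64_t)UINT32_MAX) | (wrap_n << 32);
+}
+#endif /* MLX5_STATS_WRAP_SKETCH */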
+
+/**
+ * DPDK callback to get device statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] stats
+ * Stats structure output buffer.
+ *
+ * @return
+ * 0 on success and stats is filled, negative errno value otherwise and
+ * rte_errno is set.
+ */
+int
+mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+ struct rte_eth_stats tmp;
+ unsigned int i;
+ unsigned int idx;
+ uint64_t wrap_n;
+ int ret;
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* Add software counters. */
+ for (i = 0; (i != priv->rxqs_n); ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+
+ if (rxq == NULL)
+ continue;
+ idx = rxq->idx;
+ if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ tmp.q_ipackets[idx] += rxq->stats.ipackets;
+ tmp.q_ibytes[idx] += rxq->stats.ibytes;
+#endif
+ tmp.q_errors[idx] += (rxq->stats.idropped +
+ rxq->stats.rx_nombuf);
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ tmp.ipackets += rxq->stats.ipackets;
+ tmp.ibytes += rxq->stats.ibytes;
+#endif
+ tmp.ierrors += rxq->stats.idropped;
+ tmp.rx_nombuf += rxq->stats.rx_nombuf;
+ }
+ for (i = 0; (i != priv->txqs_n); ++i) {
+ struct mlx5_txq_data *txq = (*priv->txqs)[i];
+
+ if (txq == NULL)
+ continue;
+ idx = txq->idx;
+ if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ tmp.q_opackets[idx] += txq->stats.opackets;
+ tmp.q_obytes[idx] += txq->stats.obytes;
+#endif
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ tmp.opackets += txq->stats.opackets;
+ tmp.obytes += txq->stats.obytes;
+#endif
+ tmp.oerrors += txq->stats.oerrors;
+ }
+ ret = mlx5_read_ib_stat(priv, "out_of_buffer", &tmp.imissed);
+ if (ret == 0) {
+ tmp.imissed = (tmp.imissed - stats_ctrl->imissed_base) &
+ (uint64_t)UINT32_MAX;
+ wrap_n = stats_ctrl->imissed >> 32;
+ if (tmp.imissed < (stats_ctrl->imissed & (uint64_t)UINT32_MAX))
+ wrap_n++;
+ tmp.imissed |= (wrap_n) << 32;
+ stats_ctrl->imissed = tmp.imissed;
+ } else {
+ tmp.imissed = stats_ctrl->imissed;
+ }
+#ifndef MLX5_PMD_SOFT_COUNTERS
+ /* FIXME: retrieve and add hardware counters. */
+#endif
+ *stats = tmp;
+ return 0;
+}
+
+/**
+ * DPDK callback to clear device statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 is always returned and the statistics are reset.
+ */
+int
+mlx5_stats_reset(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+ unsigned int i;
+
+ for (i = 0; (i != priv->rxqs_n); ++i) {
+ if ((*priv->rxqs)[i] == NULL)
+ continue;
+ memset(&(*priv->rxqs)[i]->stats, 0,
+ sizeof(struct mlx5_rxq_stats));
+ }
+ for (i = 0; (i != priv->txqs_n); ++i) {
+ if ((*priv->txqs)[i] == NULL)
+ continue;
+ memset(&(*priv->txqs)[i]->stats, 0,
+ sizeof(struct mlx5_txq_stats));
+ }
+ mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
+ stats_ctrl->imissed = 0;
+#ifndef MLX5_PMD_SOFT_COUNTERS
+ /* FIXME: reset hardware counters. */
+#endif
+
+ return 0;
+}
+
+/**
+ * DPDK callback to clear device extended statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success and stats is reset, negative errno value otherwise and
+ * rte_errno is set.
+ */
+int
+mlx5_xstats_reset(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ int stats_n;
+ unsigned int i;
+ unsigned int n = xstats_ctrl->mlx5_stats_n;
+ uint64_t counters[n];
+ int ret;
+
+ stats_n = mlx5_ethtool_get_stats_n(dev);
+ if (stats_n < 0) {
+ DRV_LOG(ERR, "port %u cannot get stats: %s", dev->data->port_id,
+ strerror(-stats_n));
+ return stats_n;
+ }
+ if (xstats_ctrl->stats_n != stats_n)
+ mlx5_stats_init(dev);
+ ret = mlx5_read_dev_counters(dev, counters);
+ if (ret) {
+ DRV_LOG(ERR, "port %u cannot read device counters: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ for (i = 0; i != n; ++i) {
+ xstats_ctrl->base[i] = counters[i];
+ xstats_ctrl->hw_stats[i] = 0;
+ }
+
+ return 0;
+}
+
+/**
+ * DPDK callback to retrieve names of extended device statistics
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] xstats_names
+ * Buffer to insert names into.
+ * @param n
+ * Number of names.
+ *
+ * @return
+ * Number of xstats names.
+ */
+int
+mlx5_xstats_get_names(struct rte_eth_dev *dev,
+ struct rte_eth_xstat_name *xstats_names, unsigned int n)
+{
+ unsigned int i;
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ unsigned int mlx5_xstats_n = xstats_ctrl->mlx5_stats_n;
+
+ if (n >= mlx5_xstats_n && xstats_names) {
+ for (i = 0; i != mlx5_xstats_n; ++i) {
+ strncpy(xstats_names[i].name,
+ xstats_ctrl->info[i].dpdk_name,
+ RTE_ETH_XSTATS_NAME_SIZE);
+ xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0;
+ }
+ }
+ return mlx5_xstats_n;
+}
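+
+/*
+ * Minimal application-side sketch showing how the xstats callbacks above
+ * are reached through the generic ethdev API. The allocation size comes
+ * from the NULL/0 query; error handling is reduced and the function name
+ * is illustrative only.
+ */
+#ifdef MLX5_STATS_USAGE_SKETCH
+static void
+sketch_dump_xstats(uint16_t port_id)
+{
+	struct rte_eth_xstat_name *names = NULL;
+	struct rte_eth_xstat *values = NULL;
+	int n = rte_eth_xstats_get_names(port_id, NULL, 0);
+	int i;
+
+	if (n <= 0)
+		return;
+	names = rte_malloc(NULL, sizeof(*names) * n, 0);
+	values = rte_malloc(NULL, sizeof(*values) * n, 0);
+	if (names && values &&
+	    rte_eth_xstats_get_names(port_id, names, n) == n &&
+	    rte_eth_xstats_get(port_id, values, n) == n) {
+		for (i = 0; i < n; i++)
+			printf("%s: %" PRIu64 "\n",
+			       names[values[i].id].name, values[i].value);
+	}
+	rte_free(names);
+	rte_free(values);
+}
+#endif /* MLX5_STATS_USAGE_SKETCH */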
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c
new file mode 100644
index 000000000..8106598ff
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c
@@ -0,0 +1,579 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <unistd.h>
+
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+
+#include "mlx5.h"
+#include "mlx5_mr.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+#include "rte_pmd_mlx5.h"
+
+/**
+ * Stop traffic on Tx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+static void
+mlx5_txq_stop(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+
+ for (i = 0; i != priv->txqs_n; ++i)
+ mlx5_txq_release(dev, i);
+}
+
+/**
+ * Start traffic on Tx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_txq_start(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ int ret;
+
+ for (i = 0; i != priv->txqs_n; ++i) {
+ struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
+
+ if (!txq_ctrl)
+ continue;
+ if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
+ txq_ctrl->obj = mlx5_txq_obj_new
+ (dev, i, MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN);
+ } else {
+ txq_alloc_elts(txq_ctrl);
+ txq_ctrl->obj = mlx5_txq_obj_new
+ (dev, i, MLX5_TXQ_OBJ_TYPE_IBV);
+ }
+ if (!txq_ctrl->obj) {
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ }
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ do {
+ mlx5_txq_release(dev, i);
+ } while (i-- != 0);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Stop traffic on Rx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+static void
+mlx5_rxq_stop(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+
+ for (i = 0; i != priv->rxqs_n; ++i)
+ mlx5_rxq_release(dev, i);
+}
+
+/**
+ * Start traffic on Rx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rxq_start(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ int ret = 0;
+ enum mlx5_rxq_obj_type obj_type = MLX5_RXQ_OBJ_TYPE_IBV;
+ struct mlx5_rxq_data *rxq = NULL;
+
+ for (i = 0; i < priv->rxqs_n; ++i) {
+ rxq = (*priv->rxqs)[i];
+ if (rxq && rxq->lro) {
+ obj_type = MLX5_RXQ_OBJ_TYPE_DEVX_RQ;
+ break;
+ }
+ }
+ /* Allocate/reuse/resize mempool for Multi-Packet RQ. */
+ if (mlx5_mprq_alloc_mp(dev)) {
+ /* Do not release Rx queues; return immediately. */
+ return -rte_errno;
+ }
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
+ struct rte_mempool *mp;
+
+ if (!rxq_ctrl)
+ continue;
+ if (rxq_ctrl->type == MLX5_RXQ_TYPE_HAIRPIN) {
+ rxq_ctrl->obj = mlx5_rxq_obj_new
+ (dev, i, MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN);
+ if (!rxq_ctrl->obj)
+ goto error;
+ continue;
+ }
+ /* Pre-register Rx mempool. */
+ mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+ rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp;
+ DRV_LOG(DEBUG,
+ "port %u Rx queue %u registering"
+ " mp %s having %u chunks",
+ dev->data->port_id, rxq_ctrl->rxq.idx,
+ mp->name, mp->nb_mem_chunks);
+ mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl, mp);
+ ret = rxq_alloc_elts(rxq_ctrl);
+ if (ret)
+ goto error;
+ rxq_ctrl->obj = mlx5_rxq_obj_new(dev, i, obj_type);
+ if (!rxq_ctrl->obj)
+ goto error;
+ if (obj_type == MLX5_RXQ_OBJ_TYPE_IBV)
+ rxq_ctrl->wqn = rxq_ctrl->obj->wq->wq_num;
+ else if (obj_type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ)
+ rxq_ctrl->wqn = rxq_ctrl->obj->rq->id;
+ }
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ do {
+ mlx5_rxq_release(dev, i);
+ } while (i-- != 0);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Binds Tx queues to Rx queues for hairpin.
+ *
+ * Binds Tx queues to the target Rx queues.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_hairpin_bind(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_devx_modify_sq_attr sq_attr = { 0 };
+ struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
+ struct mlx5_txq_ctrl *txq_ctrl;
+ struct mlx5_rxq_ctrl *rxq_ctrl;
+ struct mlx5_devx_obj *sq;
+ struct mlx5_devx_obj *rq;
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i != priv->txqs_n; ++i) {
+ txq_ctrl = mlx5_txq_get(dev, i);
+ if (!txq_ctrl)
+ continue;
+ if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) {
+ mlx5_txq_release(dev, i);
+ continue;
+ }
+ if (!txq_ctrl->obj) {
+ rte_errno = ENOMEM;
+ DRV_LOG(ERR, "port %u no txq object found: %d",
+ dev->data->port_id, i);
+ mlx5_txq_release(dev, i);
+ return -rte_errno;
+ }
+ sq = txq_ctrl->obj->sq;
+ rxq_ctrl = mlx5_rxq_get(dev,
+ txq_ctrl->hairpin_conf.peers[0].queue);
+ if (!rxq_ctrl) {
+ mlx5_txq_release(dev, i);
+ rte_errno = EINVAL;
+ DRV_LOG(ERR, "port %u no rxq object found: %d",
+ dev->data->port_id,
+ txq_ctrl->hairpin_conf.peers[0].queue);
+ return -rte_errno;
+ }
+ if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN ||
+ rxq_ctrl->hairpin_conf.peers[0].queue != i) {
+ rte_errno = ENOMEM;
+ DRV_LOG(ERR, "port %u Tx queue %d can't be binded to "
+ "Rx queue %d", dev->data->port_id,
+ i, txq_ctrl->hairpin_conf.peers[0].queue);
+ goto error;
+ }
+ rq = rxq_ctrl->obj->rq;
+ if (!rq) {
+ rte_errno = ENOMEM;
+ DRV_LOG(ERR, "port %u hairpin no matching rxq: %d",
+ dev->data->port_id,
+ txq_ctrl->hairpin_conf.peers[0].queue);
+ goto error;
+ }
+ sq_attr.state = MLX5_SQC_STATE_RDY;
+ sq_attr.sq_state = MLX5_SQC_STATE_RST;
+ sq_attr.hairpin_peer_rq = rq->id;
+ sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
+ ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr);
+ if (ret)
+ goto error;
+ rq_attr.state = MLX5_SQC_STATE_RDY;
+ rq_attr.rq_state = MLX5_SQC_STATE_RST;
+ rq_attr.hairpin_peer_sq = sq->id;
+ rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id;
+ ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr);
+ if (ret)
+ goto error;
+ mlx5_txq_release(dev, i);
+ mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
+ }
+ return 0;
+error:
+ mlx5_txq_release(dev, i);
+ mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue);
+ return -rte_errno;
+}
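+
+/*
+ * Minimal application-side sketch of the hairpin queue setup that the
+ * binding above pairs together. It assumes the generic
+ * rte_eth_*_hairpin_queue_setup() API and a 1:1 pairing on the same port;
+ * the descriptor count and helper name are illustrative only.
+ */
+#ifdef MLX5_HAIRPIN_SETUP_SKETCH
+static int
+sketch_setup_hairpin_pair(uint16_t port_id, uint16_t rxq, uint16_t txq)
+{
+	struct rte_eth_hairpin_conf conf = { .peer_count = 1 };
+	int ret;
+
+	/* The Rx hairpin queue peers with the Tx one and vice versa. */
+	conf.peers[0].port = port_id;
+	conf.peers[0].queue = txq;
+	ret = rte_eth_rx_hairpin_queue_setup(port_id, rxq, 512, &conf);
+	if (ret)
+		return ret;
+	conf.peers[0].queue = rxq;
+	return rte_eth_tx_hairpin_queue_setup(port_id, txq, 512, &conf);
+}
+#endif /* MLX5_HAIRPIN_SETUP_SKETCH */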
+
+/**
+ * DPDK callback to start the device.
+ *
+ * Simulate device start by attaching all configured flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_start(struct rte_eth_dev *dev)
+{
+ int ret;
+ int fine_inline;
+
+ DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
+ fine_inline = rte_mbuf_dynflag_lookup
+ (RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL);
+ if (fine_inline > 0)
+ rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline;
+ else
+ rte_net_mlx5_dynf_inline_mask = 0;
+ if (dev->data->nb_rx_queues > 0) {
+ ret = mlx5_dev_configure_rss_reta(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u reta config failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return -rte_errno;
+ }
+ }
+ ret = mlx5_txq_start(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return -rte_errno;
+ }
+ ret = mlx5_rxq_start(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ mlx5_txq_stop(dev);
+ return -rte_errno;
+ }
+ ret = mlx5_hairpin_bind(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u hairpin binding failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ mlx5_txq_stop(dev);
+ return -rte_errno;
+ }
+ /* Set the started flag here so the following steps (e.g. control flows) can rely on it. */
+ dev->data->dev_started = 1;
+ ret = mlx5_rx_intr_vec_enable(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u Rx interrupt vector creation failed",
+ dev->data->port_id);
+ goto error;
+ }
+ mlx5_stats_init(dev);
+ ret = mlx5_traffic_enable(dev);
+ if (ret) {
+ DRV_LOG(ERR, "port %u failed to set defaults flows",
+ dev->data->port_id);
+ goto error;
+ }
+ /* Set the mask and offset of dynamic flow metadata in the Rx queues. */
+ mlx5_flow_rxq_dynf_metadata_set(dev);
+ /*
+ * In non-cached mode, only the default mreg copy action needs to be
+ * started because no application-created flow exists yet.
+ * It is still worth wrapping the interface for future use.
+ */
+ ret = mlx5_flow_start_default(dev);
+ if (ret) {
+ DRV_LOG(DEBUG, "port %u failed to start default actions: %s",
+ dev->data->port_id, strerror(rte_errno));
+ goto error;
+ }
+ rte_wmb();
+ dev->tx_pkt_burst = mlx5_select_tx_function(dev);
+ dev->rx_pkt_burst = mlx5_select_rx_function(dev);
+ /* Enable datapath on secondary process. */
+ mlx5_mp_req_start_rxtx(dev);
+ mlx5_dev_interrupt_handler_install(dev);
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ /* Rollback. */
+ dev->data->dev_started = 0;
+ mlx5_flow_stop_default(dev);
+ mlx5_traffic_disable(dev);
+ mlx5_txq_stop(dev);
+ mlx5_rxq_stop(dev);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
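+
+/*
+ * Minimal sketch of the generic ethdev sequence that eventually reaches
+ * mlx5_dev_start() above. Queue and descriptor counts are placeholders
+ * and the helper name is illustrative only.
+ */
+#ifdef MLX5_TRIGGER_USAGE_SKETCH
+static int
+sketch_port_start(uint16_t port_id, struct rte_mempool *mp)
+{
+	struct rte_eth_conf port_conf = { 0 };
+	const uint16_t nb_queues = 1;
+	const uint16_t nb_desc = 512;
+	uint16_t q;
+	int ret;
+
+	ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
+	if (ret)
+		return ret;
+	for (q = 0; q < nb_queues; q++) {
+		ret = rte_eth_rx_queue_setup(port_id, q, nb_desc,
+					     rte_eth_dev_socket_id(port_id),
+					     NULL, mp);
+		if (ret)
+			return ret;
+		ret = rte_eth_tx_queue_setup(port_id, q, nb_desc,
+					     rte_eth_dev_socket_id(port_id),
+					     NULL);
+		if (ret)
+			return ret;
+	}
+	/* rte_eth_dev_start() ends up in mlx5_dev_start() for mlx5 ports. */
+	return rte_eth_dev_start(port_id);
+}
+#endif /* MLX5_TRIGGER_USAGE_SKETCH */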
+
+/**
+ * DPDK callback to stop the device.
+ *
+ * Simulate device stop by detaching all configured flows.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx5_dev_stop(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ dev->data->dev_started = 0;
+ /* Prevent crashes when queues are still in use. */
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ rte_wmb();
+ /* Disable datapath on secondary process. */
+ mlx5_mp_req_stop_rxtx(dev);
+ usleep(1000 * priv->rxqs_n);
+ DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
+ mlx5_flow_stop_default(dev);
+ /* Control flows for default traffic are removed first. */
+ mlx5_traffic_disable(dev);
+ /* All RX queue flags will be cleared in the flush interface. */
+ mlx5_flow_list_flush(dev, &priv->flows, true);
+ mlx5_rx_intr_vec_disable(dev);
+ mlx5_dev_interrupt_handler_uninstall(dev);
+ mlx5_txq_stop(dev);
+ mlx5_rxq_stop(dev);
+}
+
+/**
+ * Enable traffic flows configured by the control plane.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_traffic_enable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_item_eth bcast = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ };
+ struct rte_flow_item_eth ipv6_multi_spec = {
+ .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
+ };
+ struct rte_flow_item_eth ipv6_multi_mask = {
+ .dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
+ };
+ struct rte_flow_item_eth unicast = {
+ .src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+ };
+ struct rte_flow_item_eth unicast_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ };
+ const unsigned int vlan_filter_n = priv->vlan_filter_n;
+ const struct rte_ether_addr cmp = {
+ .addr_bytes = "\x00\x00\x00\x00\x00\x00",
+ };
+ unsigned int i;
+ unsigned int j;
+ int ret;
+
+ /*
+	 * The hairpin Tx queue default flow must be created regardless of
+	 * isolation mode; otherwise all packets to be sent are sent out
+	 * directly without the Tx flow actions, e.g. encapsulation.
+ */
+ for (i = 0; i != priv->txqs_n; ++i) {
+ struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
+ if (!txq_ctrl)
+ continue;
+ if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
+ ret = mlx5_ctrl_flow_source_queue(dev, i);
+ if (ret) {
+ mlx5_txq_release(dev, i);
+ goto error;
+ }
+ }
+ mlx5_txq_release(dev, i);
+ }
+ if (priv->config.dv_esw_en && !priv->config.vf) {
+ if (mlx5_flow_create_esw_table_zero_flow(dev))
+ priv->fdb_def_rule = 1;
+ else
+ DRV_LOG(INFO, "port %u FDB default rule cannot be"
+ " configured - only Eswitch group 0 flows are"
+ " supported.", dev->data->port_id);
+ }
+ if (priv->isolated)
+ return 0;
+ if (dev->data->promiscuous) {
+ struct rte_flow_item_eth promisc = {
+ .dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+ .src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+ .type = 0,
+ };
+
+ ret = mlx5_ctrl_flow(dev, &promisc, &promisc);
+ if (ret)
+ goto error;
+ }
+ if (dev->data->all_multicast) {
+ struct rte_flow_item_eth multicast = {
+ .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+ .src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+ .type = 0,
+ };
+
+ ret = mlx5_ctrl_flow(dev, &multicast, &multicast);
+ if (ret)
+ goto error;
+ } else {
+ /* Add broadcast/multicast flows. */
+ for (i = 0; i != vlan_filter_n; ++i) {
+ uint16_t vlan = priv->vlan_filter[i];
+
+ struct rte_flow_item_vlan vlan_spec = {
+ .tci = rte_cpu_to_be_16(vlan),
+ };
+ struct rte_flow_item_vlan vlan_mask =
+ rte_flow_item_vlan_mask;
+
+ ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
+ &vlan_spec, &vlan_mask);
+ if (ret)
+ goto error;
+ ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
+ &ipv6_multi_mask,
+ &vlan_spec, &vlan_mask);
+ if (ret)
+ goto error;
+ }
+ if (!vlan_filter_n) {
+ ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
+ if (ret)
+ goto error;
+ ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
+ &ipv6_multi_mask);
+ if (ret)
+ goto error;
+ }
+ }
+ /* Add MAC address flows. */
+ for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
+ struct rte_ether_addr *mac = &dev->data->mac_addrs[i];
+
+ if (!memcmp(mac, &cmp, sizeof(*mac)))
+ continue;
+ memcpy(&unicast.dst.addr_bytes,
+ mac->addr_bytes,
+ RTE_ETHER_ADDR_LEN);
+ for (j = 0; j != vlan_filter_n; ++j) {
+ uint16_t vlan = priv->vlan_filter[j];
+
+ struct rte_flow_item_vlan vlan_spec = {
+ .tci = rte_cpu_to_be_16(vlan),
+ };
+ struct rte_flow_item_vlan vlan_mask =
+ rte_flow_item_vlan_mask;
+
+ ret = mlx5_ctrl_flow_vlan(dev, &unicast,
+ &unicast_mask,
+ &vlan_spec,
+ &vlan_mask);
+ if (ret)
+ goto error;
+ }
+ if (!vlan_filter_n) {
+ ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask);
+ if (ret)
+ goto error;
+ }
+ }
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ mlx5_flow_list_flush(dev, &priv->ctrl_flows, false);
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+
+/**
+ * Disable traffic flows configured by the control plane.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx5_traffic_disable(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ mlx5_flow_list_flush(dev, &priv->ctrl_flows, false);
+}
+
+/**
+ * Restart traffic flows configured by the control plane.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_traffic_restart(struct rte_eth_dev *dev)
+{
+ if (dev->data->dev_started) {
+ mlx5_traffic_disable(dev);
+ return mlx5_traffic_enable(dev);
+ }
+ return 0;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c
new file mode 100644
index 000000000..a211fa91b
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c
@@ -0,0 +1,1470 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <inttypes.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <rte_ethdev_driver.h>
+#include <rte_common.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mr.h>
+
+#include "mlx5_defs.h"
+#include "mlx5_utils.h"
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_autoconf.h"
+
+/**
+ * Allocate TX queue elements.
+ *
+ * @param txq_ctrl
+ * Pointer to TX queue structure.
+ */
+void
+txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
+ unsigned int i;
+
+ for (i = 0; (i != elts_n); ++i)
+ txq_ctrl->txq.elts[i] = NULL;
+ DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs",
+ PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n);
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
+}
+
+/**
+ * Free TX queue elements.
+ *
+ * @param txq_ctrl
+ * Pointer to TX queue structure.
+ */
+void
+txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n;
+ const uint16_t elts_m = elts_n - 1;
+ uint16_t elts_head = txq_ctrl->txq.elts_head;
+ uint16_t elts_tail = txq_ctrl->txq.elts_tail;
+ struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts;
+
+ DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs",
+ PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx);
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
+
+ while (elts_tail != elts_head) {
+ struct rte_mbuf *elt = (*elts)[elts_tail & elts_m];
+
+ MLX5_ASSERT(elt != NULL);
+ rte_pktmbuf_free_seg(elt);
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ /* Poisoning. */
+ memset(&(*elts)[elts_tail & elts_m],
+ 0x77,
+ sizeof((*elts)[elts_tail & elts_m]));
+#endif
+ ++elts_tail;
+ }
+}
+
+/**
+ * Returns the per-port supported offloads.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Supported Tx offloads.
+ */
+uint64_t
+mlx5_get_tx_port_offloads(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ uint64_t offloads = (DEV_TX_OFFLOAD_MULTI_SEGS |
+ DEV_TX_OFFLOAD_VLAN_INSERT);
+ struct mlx5_dev_config *config = &priv->config;
+
+ if (config->hw_csum)
+ offloads |= (DEV_TX_OFFLOAD_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM);
+ if (config->tso)
+ offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+ if (config->swp) {
+ if (config->hw_csum)
+ offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+ if (config->tso)
+ offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO);
+ }
+ if (config->tunnel_en) {
+ if (config->hw_csum)
+ offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+ if (config->tso)
+ offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO |
+ DEV_TX_OFFLOAD_GENEVE_TNL_TSO);
+ }
+ return offloads;
+}
+
+/**
+ * Tx queue presetup checks.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * Tx queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_tx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (desc <= MLX5_TX_COMP_THRESH) {
+ DRV_LOG(WARNING,
+ "port %u number of descriptors requested for Tx queue"
+ " %u must be higher than MLX5_TX_COMP_THRESH, using %u"
+ " instead of %u",
+ dev->data->port_id, idx, MLX5_TX_COMP_THRESH + 1, desc);
+ desc = MLX5_TX_COMP_THRESH + 1;
+ }
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ DRV_LOG(WARNING,
+ "port %u increased number of descriptors in Tx queue"
+ " %u to the next power of two (%d)",
+ dev->data->port_id, idx, desc);
+ }
+ DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors",
+ dev->data->port_id, idx, desc);
+ if (idx >= priv->txqs_n) {
+ DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)",
+ dev->data->port_id, idx, priv->txqs_n);
+ rte_errno = EOVERFLOW;
+ return -rte_errno;
+ }
+ if (!mlx5_txq_releasable(dev, idx)) {
+ rte_errno = EBUSY;
+ DRV_LOG(ERR, "port %u unable to release queue index %u",
+ dev->data->port_id, idx);
+ return -rte_errno;
+ }
+ mlx5_txq_release(dev, idx);
+ return 0;
+}
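+
+/*
+ * Illustrative arithmetic for the checks above: a request of 300
+ * descriptors (above the completion threshold) is rounded up to 512,
+ * the next power of two, while a request at or below MLX5_TX_COMP_THRESH
+ * is first bumped to MLX5_TX_COMP_THRESH + 1.
+ */
+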
+/**
+ * DPDK callback to configure a TX queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * TX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_txconf *conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ int res;
+
+ res = mlx5_tx_queue_pre_setup(dev, idx, desc);
+ if (res)
+ return res;
+ txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf);
+ if (!txq_ctrl) {
+ DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+ dev->data->port_id, idx);
+ return -rte_errno;
+ }
+ DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
+ dev->data->port_id, idx);
+ (*priv->txqs)[idx] = &txq_ctrl->txq;
+ return 0;
+}
+
+/**
+ * DPDK callback to configure a TX hairpin queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * TX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param[in] hairpin_conf
+ * The hairpin binding configuration.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq, struct mlx5_txq_ctrl, txq);
+ int res;
+
+ res = mlx5_tx_queue_pre_setup(dev, idx, desc);
+ if (res)
+ return res;
+ if (hairpin_conf->peer_count != 1 ||
+ hairpin_conf->peers[0].port != dev->data->port_id ||
+ hairpin_conf->peers[0].queue >= priv->rxqs_n) {
+ DRV_LOG(ERR, "port %u unable to setup hairpin queue index %u "
+ " invalid hairpind configuration", dev->data->port_id,
+ idx);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ txq_ctrl = mlx5_txq_hairpin_new(dev, idx, desc, hairpin_conf);
+ if (!txq_ctrl) {
+ DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+ dev->data->port_id, idx);
+ return -rte_errno;
+ }
+ DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
+ dev->data->port_id, idx);
+ (*priv->txqs)[idx] = &txq_ctrl->txq;
+ return 0;
+}
+
+/**
+ * DPDK callback to release a TX queue.
+ *
+ * @param dpdk_txq
+ * Generic TX queue pointer.
+ */
+void
+mlx5_tx_queue_release(void *dpdk_txq)
+{
+ struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
+ struct mlx5_txq_ctrl *txq_ctrl;
+ struct mlx5_priv *priv;
+ unsigned int i;
+
+ if (txq == NULL)
+ return;
+ txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+ priv = txq_ctrl->priv;
+ for (i = 0; (i != priv->txqs_n); ++i)
+ if ((*priv->txqs)[i] == txq) {
+ DRV_LOG(DEBUG, "port %u removing Tx queue %u from list",
+ PORT_ID(priv), txq->idx);
+ mlx5_txq_release(ETH_DEV(priv), i);
+ break;
+ }
+}
+
+/**
+ * Configure the doorbell register non-cached attribute.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ * @param page_size
+ * System page size.
+ */
+static void
+txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size)
+{
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ off_t cmd;
+
+ txq_ctrl->txq.db_heu = priv->config.dbnc == MLX5_TXDB_HEURISTIC;
+ txq_ctrl->txq.db_nc = 0;
+ /* Check the doorbell register mapping type. */
+ cmd = txq_ctrl->uar_mmap_offset / page_size;
+ cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
+ cmd &= MLX5_UAR_MMAP_CMD_MASK;
+ if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
+ txq_ctrl->txq.db_nc = 1;
+}
+
+/**
+ * Initialize Tx UAR registers for primary process.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ */
+static void
+txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
+ const size_t page_size = sysconf(_SC_PAGESIZE);
+#ifndef RTE_ARCH_64
+ unsigned int lock_idx;
+#endif
+
+ if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD)
+ return;
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ MLX5_ASSERT(ppriv);
+ ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
+ txq_uar_ncattr_init(txq_ctrl, page_size);
+#ifndef RTE_ARCH_64
+ /* Assign a UAR lock according to the UAR page number. */
+ lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
+ MLX5_UAR_PAGE_NUM_MASK;
+ txq_ctrl->txq.uar_lock = &priv->uar_lock[lock_idx];
+#endif
+}
+
+/**
+ * Remap UAR register of a Tx queue for secondary process.
+ *
+ * The remapped address is stored in the table of the process private
+ * structure of the device, indexed by queue index.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ * @param fd
+ * Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd)
+{
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
+ struct mlx5_txq_data *txq = &txq_ctrl->txq;
+ void *addr;
+ uintptr_t uar_va;
+ uintptr_t offset;
+ const size_t page_size = sysconf(_SC_PAGESIZE);
+
+ if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD)
+ return 0;
+ MLX5_ASSERT(ppriv);
+ /*
+ * As in rdma-core, UARs are mapped in units of the OS page
+ * size. See the libmlx5 function mlx5_init_context().
+ */
+ uar_va = (uintptr_t)txq_ctrl->bf_reg;
+ offset = uar_va & (page_size - 1); /* Offset in page. */
+ addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
+ txq_ctrl->uar_mmap_offset);
+ if (addr == MAP_FAILED) {
+ DRV_LOG(ERR,
+ "port %u mmap failed for BF reg of txq %u",
+ txq->port_id, txq->idx);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ addr = RTE_PTR_ADD(addr, offset);
+ ppriv->uar_table[txq->idx] = addr;
+ txq_uar_ncattr_init(txq_ctrl, page_size);
+ return 0;
+}
+
+/**
+ * Unmap UAR register of a Tx queue for secondary process.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ */
+static void
+txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv));
+ const size_t page_size = sysconf(_SC_PAGESIZE);
+ void *addr;
+
+ if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD)
+ return;
+ addr = ppriv->uar_table[txq_ctrl->txq.idx];
+ munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
+}
+
+/**
+ * Initialize Tx UAR registers for secondary process.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param fd
+ * Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq;
+ struct mlx5_txq_ctrl *txq_ctrl;
+ unsigned int i;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ for (i = 0; i != priv->txqs_n; ++i) {
+ if (!(*priv->txqs)[i])
+ continue;
+ txq = (*priv->txqs)[i];
+ txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+ if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD)
+ continue;
+ MLX5_ASSERT(txq->idx == (uint16_t)i);
+ ret = txq_uar_init_secondary(txq_ctrl, fd);
+ if (ret)
+ goto error;
+ }
+ return 0;
+error:
+ /* Rollback. */
+ do {
+ if (!(*priv->txqs)[i])
+ continue;
+ txq = (*priv->txqs)[i];
+ txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+ txq_uar_uninit_secondary(txq_ctrl);
+ } while (i--);
+ return -rte_errno;
+}
+
+/**
+ * Create the Tx hairpin queue object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array
+ *
+ * @return
+ * The hairpin DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_txq_obj *
+mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq_data, struct mlx5_txq_ctrl, txq);
+ struct mlx5_devx_create_sq_attr attr = { 0 };
+ struct mlx5_txq_obj *tmpl = NULL;
+ int ret = 0;
+ uint32_t max_wq_data;
+
+ MLX5_ASSERT(txq_data);
+ MLX5_ASSERT(!txq_ctrl->obj);
+ tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+ txq_ctrl->socket);
+ if (!tmpl) {
+ DRV_LOG(ERR,
+ "port %u Tx queue %u cannot allocate memory resources",
+ dev->data->port_id, txq_data->idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ tmpl->type = MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN;
+ tmpl->txq_ctrl = txq_ctrl;
+ attr.hairpin = 1;
+ attr.tis_lst_sz = 1;
+ max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz;
+ /* Jumbo frames > 9 KB should be supported, as well as more packets. */
+ if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) {
+ if (priv->config.log_hp_size > max_wq_data) {
+ DRV_LOG(ERR, "total data size %u power of 2 is "
+ "too large for hairpin",
+ priv->config.log_hp_size);
+ rte_errno = ERANGE;
+ return NULL;
+ }
+ attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size;
+ } else {
+ attr.wq_attr.log_hairpin_data_sz =
+ (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ?
+ max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE;
+ }
+ /* Set the packets number to the maximum value for performance. */
+ attr.wq_attr.log_hairpin_num_packets =
+ attr.wq_attr.log_hairpin_data_sz -
+ MLX5_HAIRPIN_QUEUE_STRIDE;
+ attr.tis_num = priv->sh->tis->id;
+ tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr);
+ if (!tmpl->sq) {
+ DRV_LOG(ERR,
+ "port %u tx hairpin queue %u can't create sq object",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "port %u sxq %u updated with %p", dev->data->port_id,
+ idx, (void *)&tmpl);
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->txqsobj, tmpl, next);
+ return tmpl;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (tmpl && tmpl->tis)
+ mlx5_devx_cmd_destroy(tmpl->tis);
+ if (tmpl && tmpl->sq)
+ mlx5_devx_cmd_destroy(tmpl->sq);
+ rte_errno = ret; /* Restore rte_errno. */
+ return NULL;
+}
+
+/**
+ * Create the Tx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array.
+ * @param type
+ * Type of the Tx queue object to create.
+ *
+ * @return
+ * The Verbs object initialised, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_txq_obj *
+mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
+ enum mlx5_txq_obj_type type)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq_data, struct mlx5_txq_ctrl, txq);
+ struct mlx5_txq_obj tmpl;
+ struct mlx5_txq_obj *txq_obj = NULL;
+ union {
+ struct ibv_qp_init_attr_ex init;
+ struct ibv_cq_init_attr_ex cq;
+ struct ibv_qp_attr mod;
+ } attr;
+ unsigned int cqe_n;
+ struct mlx5dv_qp qp = { .comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET };
+ struct mlx5dv_cq cq_info;
+ struct mlx5dv_obj obj;
+ const int desc = 1 << txq_data->elts_n;
+ int ret = 0;
+
+ if (type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN)
+ return mlx5_txq_obj_hairpin_new(dev, idx);
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /* If using DevX, need additional mask to read tisn value. */
+ if (priv->config.devx && !priv->sh->tdn)
+ qp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES;
+#endif
+ MLX5_ASSERT(txq_data);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE;
+ priv->verbs_alloc_ctx.obj = txq_ctrl;
+ if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+ DRV_LOG(ERR,
+ "port %u MLX5_ENABLE_CQE_COMPRESSION must never be set",
+ dev->data->port_id);
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ memset(&tmpl, 0, sizeof(struct mlx5_txq_obj));
+ attr.cq = (struct ibv_cq_init_attr_ex){
+ .comp_mask = 0,
+ };
+ cqe_n = desc / MLX5_TX_COMP_THRESH +
+ 1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+ tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
+ if (tmpl.cq == NULL) {
+ DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ attr.init = (struct ibv_qp_init_attr_ex){
+ /* CQ to be associated with the send queue. */
+ .send_cq = tmpl.cq,
+ /* CQ to be associated with the receive queue. */
+ .recv_cq = tmpl.cq,
+ .cap = {
+ /* Max number of outstanding WRs. */
+ .max_send_wr =
+ ((priv->sh->device_attr.orig_attr.max_qp_wr <
+ desc) ?
+ priv->sh->device_attr.orig_attr.max_qp_wr :
+ desc),
+ /*
+ * Max number of scatter/gather elements in a WR,
+ * must be 1 to prevent libmlx5 from trying to affect
+ * too much memory. TX gather is not impacted by the
+ * device_attr.max_sge limit and will still work
+ * properly.
+ */
+ .max_send_sge = 1,
+ },
+ .qp_type = IBV_QPT_RAW_PACKET,
+ /*
+ * Do *NOT* enable this, completion events are managed per
+ * Tx burst.
+ */
+ .sq_sig_all = 0,
+ .pd = priv->sh->pd,
+ .comp_mask = IBV_QP_INIT_ATTR_PD,
+ };
+ if (txq_data->inlen_send)
+ attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
+ if (txq_data->tso_en) {
+ attr.init.max_tso_header = txq_ctrl->max_tso_header;
+ attr.init.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER;
+ }
+ tmpl.qp = mlx5_glue->create_qp_ex(priv->sh->ctx, &attr.init);
+ if (tmpl.qp == NULL) {
+ DRV_LOG(ERR, "port %u Tx queue %u QP creation failure",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ attr.mod = (struct ibv_qp_attr){
+ /* Move the QP to this state. */
+ .qp_state = IBV_QPS_INIT,
+ /* IB device port number. */
+ .port_num = (uint8_t)priv->ibv_port,
+ };
+ ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod,
+ (IBV_QP_STATE | IBV_QP_PORT));
+ if (ret) {
+ DRV_LOG(ERR,
+ "port %u Tx queue %u QP state to IBV_QPS_INIT failed",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ attr.mod = (struct ibv_qp_attr){
+ .qp_state = IBV_QPS_RTR
+ };
+ ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR,
+ "port %u Tx queue %u QP state to IBV_QPS_RTR failed",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ attr.mod.qp_state = IBV_QPS_RTS;
+ ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR,
+ "port %u Tx queue %u QP state to IBV_QPS_RTS failed",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ txq_obj = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_obj), 0,
+ txq_ctrl->socket);
+ if (!txq_obj) {
+ DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ obj.cq.in = tmpl.cq;
+ obj.cq.out = &cq_info;
+ obj.qp.in = tmpl.qp;
+ obj.qp.out = &qp;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP);
+ if (ret != 0) {
+ rte_errno = errno;
+ goto error;
+ }
+ if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+ DRV_LOG(ERR,
+ "port %u wrong MLX5_CQE_SIZE environment variable"
+ " value: it should be set to %u",
+ dev->data->port_id, RTE_CACHE_LINE_SIZE);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+ txq_data->cqe_s = 1 << txq_data->cqe_n;
+ txq_data->cqe_m = txq_data->cqe_s - 1;
+ txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
+ txq_data->wqes = qp.sq.buf;
+ txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+ txq_data->wqe_s = 1 << txq_data->wqe_n;
+ txq_data->wqe_m = txq_data->wqe_s - 1;
+ txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
+ txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
+ txq_data->cq_db = cq_info.dbrec;
+ txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
+ txq_data->cq_ci = 0;
+ txq_data->cq_pi = 0;
+ txq_data->wqe_ci = 0;
+ txq_data->wqe_pi = 0;
+ txq_data->wqe_comp = 0;
+ txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
+ txq_data->fcqs = rte_calloc_socket(__func__,
+ txq_data->cqe_s,
+ sizeof(*txq_data->fcqs),
+ RTE_CACHE_LINE_SIZE,
+ txq_ctrl->socket);
+ if (!txq_data->fcqs) {
+ DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory (FCQ)",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /*
+ * If using DevX, we need to query and store the TIS transport domain
+ * value. This is done once per port.
+ * The value will be used on Rx, when creating the matching TIR.
+ */
+ if (priv->config.devx && !priv->sh->tdn) {
+ ret = mlx5_devx_cmd_qp_query_tis_td(tmpl.qp, qp.tisn,
+ &priv->sh->tdn);
+ if (ret) {
+ DRV_LOG(ERR, "Fail to query port %u Tx queue %u QP TIS "
+ "transport domain", dev->data->port_id, idx);
+ rte_errno = EINVAL;
+ goto error;
+ } else {
+ DRV_LOG(DEBUG, "port %u Tx queue %u TIS number %d "
+ "transport domain %d", dev->data->port_id,
+ idx, qp.tisn, priv->sh->tdn);
+ }
+ }
+#endif
+ txq_obj->qp = tmpl.qp;
+ txq_obj->cq = tmpl.cq;
+ rte_atomic32_inc(&txq_obj->refcnt);
+ txq_ctrl->bf_reg = qp.bf.reg;
+ if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
+ txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
+ DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%"PRIx64,
+ dev->data->port_id, txq_ctrl->uar_mmap_offset);
+ } else {
+ DRV_LOG(ERR,
+ "port %u failed to retrieve UAR info, invalid"
+ " libmlx5.so",
+ dev->data->port_id);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ txq_uar_init(txq_ctrl);
+ LIST_INSERT_HEAD(&priv->txqsobj, txq_obj, next);
+ txq_obj->txq_ctrl = txq_ctrl;
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+ return txq_obj;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (tmpl.cq)
+ claim_zero(mlx5_glue->destroy_cq(tmpl.cq));
+ if (tmpl.qp)
+ claim_zero(mlx5_glue->destroy_qp(tmpl.qp));
+ if (txq_data && txq_data->fcqs)
+ rte_free(txq_data->fcqs);
+ if (txq_obj)
+ rte_free(txq_obj);
+ priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+ rte_errno = ret; /* Restore rte_errno. */
+ return NULL;
+}
+
+/**
+ * Get a Tx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array.
+ *
+ * @return
+ * The Verbs object if it exists.
+ */
+struct mlx5_txq_obj *
+mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *txq_ctrl;
+
+ if (idx >= priv->txqs_n)
+ return NULL;
+ if (!(*priv->txqs)[idx])
+ return NULL;
+ txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+ if (txq_ctrl->obj)
+ rte_atomic32_inc(&txq_ctrl->obj->refcnt);
+ return txq_ctrl->obj;
+}
+
+/**
+ * Release a Tx Verbs queue object.
+ *
+ * @param txq_obj
+ * Verbs Tx queue object.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+int
+mlx5_txq_obj_release(struct mlx5_txq_obj *txq_obj)
+{
+ MLX5_ASSERT(txq_obj);
+ if (rte_atomic32_dec_and_test(&txq_obj->refcnt)) {
+ if (txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) {
+ if (txq_obj->tis)
+ claim_zero(mlx5_devx_cmd_destroy(txq_obj->tis));
+ } else {
+ claim_zero(mlx5_glue->destroy_qp(txq_obj->qp));
+ claim_zero(mlx5_glue->destroy_cq(txq_obj->cq));
+ if (txq_obj->txq_ctrl->txq.fcqs)
+ rte_free(txq_obj->txq_ctrl->txq.fcqs);
+ }
+ LIST_REMOVE(txq_obj, next);
+ rte_free(txq_obj);
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Verify that the Verbs Tx queue list is empty.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The number of objects not released.
+ */
+int
+mlx5_txq_obj_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret = 0;
+ struct mlx5_txq_obj *txq_obj;
+
+ LIST_FOREACH(txq_obj, &priv->txqsobj, next) {
+ DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced",
+ dev->data->port_id, txq_obj->txq_ctrl->txq.idx);
+ ++ret;
+ }
+ return ret;
+}
+
+/**
+ * Calculate the total number of WQEBB for Tx queue.
+ *
+ * Simplified version of calc_sq_size() in rdma-core.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ *
+ * @return
+ * The number of WQEBB.
+ */
+static int
+txq_calc_wqebb_cnt(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ unsigned int wqe_size;
+ const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
+
+ wqe_size = MLX5_WQE_CSEG_SIZE +
+ MLX5_WQE_ESEG_SIZE +
+ MLX5_WSEG_SIZE -
+ MLX5_ESEG_MIN_INLINE_SIZE +
+ txq_ctrl->max_inline_data;
+ return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE;
+}
+
+/**
+ * Calculate the maximal inline data size for Tx queue.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ *
+ * @return
+ * The maximal inline data size.
+ */
+static unsigned int
+txq_calc_inline_max(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ const unsigned int desc = 1 << txq_ctrl->txq.elts_n;
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ unsigned int wqe_size;
+
+ wqe_size = priv->sh->device_attr.orig_attr.max_qp_wr / desc;
+ if (!wqe_size)
+ return 0;
+ /*
+ * This calculation is derived from the source of
+ * mlx5_calc_send_wqe() in the rdma-core library.
+ */
+ wqe_size = wqe_size * MLX5_WQE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WSEG_SIZE -
+ MLX5_WSEG_SIZE +
+ MLX5_DSEG_MIN_INLINE_SIZE;
+ return wqe_size;
+}
+
+/**
+ * Set Tx queue parameters from device configuration.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ */
+static void
+txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ struct mlx5_dev_config *config = &priv->config;
+ unsigned int inlen_send; /* Inline data for ordinary SEND.*/
+ unsigned int inlen_empw; /* Inline data for enhanced MPW. */
+ unsigned int inlen_mode; /* Minimal required Inline data. */
+ unsigned int txqs_inline; /* Min Tx queues to enable inline. */
+ uint64_t dev_txoff = priv->dev_data->dev_conf.txmode.offloads;
+ bool tso = txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO |
+ DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO |
+ DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO);
+ bool vlan_inline;
+ unsigned int temp;
+
+ if (config->txqs_inline == MLX5_ARG_UNSET)
+ txqs_inline =
+#if defined(RTE_ARCH_ARM64)
+ (priv->pci_dev->id.device_id ==
+ PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) ?
+ MLX5_INLINE_MAX_TXQS_BLUEFIELD :
+#endif
+ MLX5_INLINE_MAX_TXQS;
+ else
+ txqs_inline = (unsigned int)config->txqs_inline;
+ inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ?
+ MLX5_SEND_DEF_INLINE_LEN :
+ (unsigned int)config->txq_inline_max;
+ inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ?
+ MLX5_EMPW_DEF_INLINE_LEN :
+ (unsigned int)config->txq_inline_mpw;
+ inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ?
+ 0 : (unsigned int)config->txq_inline_min;
+ if (config->mps != MLX5_MPW_ENHANCED && config->mps != MLX5_MPW)
+ inlen_empw = 0;
+ /*
+ * If a minimal amount of data to inline is requested,
+ * we MUST enable inlining. This is the case for ConnectX-4,
+ * which usually requires the L2 header to be inlined for
+ * correct operation, and for ConnectX-4 Lx, which requires
+ * L2-L4 headers to be inlined to support E-Switch flows.
+ */
+ if (inlen_mode) {
+ if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) {
+ /*
+ * Optimize minimal inlining for single
+ * segment packets to fill one WQEBB
+ * without gaps.
+ */
+ temp = MLX5_ESEG_MIN_INLINE_SIZE;
+ } else {
+ temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE;
+ temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) +
+ MLX5_ESEG_MIN_INLINE_SIZE;
+ temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+ }
+ if (temp != inlen_mode) {
+ DRV_LOG(INFO,
+ "port %u minimal required inline setting"
+ " aligned from %u to %u",
+ PORT_ID(priv), inlen_mode, temp);
+ inlen_mode = temp;
+ }
+ }
+ /*
+ * If the port is configured to support VLAN insertion and the
+ * device does not support this feature in hardware (for NICs
+ * before ConnectX-5, or when the wqe_vlan_insert flag is not
+ * set), we must enable data inlining on all queues because the
+ * feature is handled by a single tx_burst routine.
+ */
+ txq_ctrl->txq.vlan_en = config->hw_vlan_insert;
+ vlan_inline = (dev_txoff & DEV_TX_OFFLOAD_VLAN_INSERT) &&
+ !config->hw_vlan_insert;
+ /*
+ * If there are only a few Tx queues, saving CPU cycles is
+ * prioritized and data inlining is disabled entirely.
+ */
+ if (inlen_send && priv->txqs_n >= txqs_inline) {
+ /*
+ * The data sent with the ordinary MLX5_OPCODE_SEND
+ * may be inlined in Ethernet Segment, align the
+ * length accordingly to fit entire WQEBBs.
+ */
+ temp = RTE_MAX(inlen_send,
+ MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE);
+ temp -= MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
+ temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
+ temp += MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE;
+ temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+ MLX5_ESEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE * 2);
+ temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN);
+ temp = RTE_MAX(temp, inlen_mode);
+ if (temp != inlen_send) {
+ DRV_LOG(INFO,
+ "port %u ordinary send inline setting"
+ " aligned from %u to %u",
+ PORT_ID(priv), inlen_send, temp);
+ inlen_send = temp;
+ }
+ /*
+ * Not aligned to cache lines, but to WQEs.
+ * The first bytes of data (initial alignment)
+ * are going to be copied explicitly at the
+ * beginning of the inlining buffer in the
+ * Ethernet Segment.
+ */
+ MLX5_ASSERT(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+ MLX5_ASSERT(inlen_send <= MLX5_WQE_SIZE_MAX +
+ MLX5_ESEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE * 2);
+ } else if (inlen_mode) {
+ /*
+ * If minimal inlining is requested we must
+ * enable inlining in general, despite the
+ * number of configured queues. Ignore the
+ * txq_inline_max devarg, this is not
+ * full-featured inline.
+ */
+ inlen_send = inlen_mode;
+ inlen_empw = 0;
+ } else if (vlan_inline) {
+ /*
+ * Hardware does not report offload for
+ * VLAN insertion, we must enable data inline
+ * to implement feature by software.
+ */
+ inlen_send = MLX5_ESEG_MIN_INLINE_SIZE;
+ inlen_empw = 0;
+ } else {
+ inlen_send = 0;
+ inlen_empw = 0;
+ }
+ txq_ctrl->txq.inlen_send = inlen_send;
+ txq_ctrl->txq.inlen_mode = inlen_mode;
+ txq_ctrl->txq.inlen_empw = 0;
+ if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) {
+ /*
+ * The data sent with MLX5_OPCODE_ENHANCED_MPSW
+ * may be inlined in Data Segment, align the
+ * length accordingly to fit entire WQEBBs.
+ */
+ temp = RTE_MAX(inlen_empw,
+ MLX5_WQE_SIZE + MLX5_DSEG_MIN_INLINE_SIZE);
+ temp -= MLX5_DSEG_MIN_INLINE_SIZE;
+ temp = RTE_ALIGN(temp, MLX5_WQE_SIZE);
+ temp += MLX5_DSEG_MIN_INLINE_SIZE;
+ temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX +
+ MLX5_DSEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE);
+ temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN);
+ if (temp != inlen_empw) {
+ DRV_LOG(INFO,
+ "port %u enhanced empw inline setting"
+ " aligned from %u to %u",
+ PORT_ID(priv), inlen_empw, temp);
+ inlen_empw = temp;
+ }
+ MLX5_ASSERT(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE);
+ MLX5_ASSERT(inlen_empw <= MLX5_WQE_SIZE_MAX +
+ MLX5_DSEG_MIN_INLINE_SIZE -
+ MLX5_WQE_CSEG_SIZE -
+ MLX5_WQE_ESEG_SIZE -
+ MLX5_WQE_DSEG_SIZE);
+ txq_ctrl->txq.inlen_empw = inlen_empw;
+ }
+ txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw);
+ if (tso) {
+ txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER;
+ txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data,
+ MLX5_MAX_TSO_HEADER);
+ txq_ctrl->txq.tso_en = 1;
+ }
+ txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp;
+ txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO |
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) &
+ txq_ctrl->txq.offloads) && config->swp;
+}
+
+/**
+ * Adjust Tx queue data inline parameters for large queue sizes.
+ * The data inline feature requires multiple WQEs to fit the packets,
+ * and if a large number of Tx descriptors is requested by the application,
+ * the total WQE amount may exceed the hardware capabilities. If the
+ * default inline settings are used, we can try to adjust them to meet
+ * the hardware requirements without exceeding the queue size.
+ *
+ * @param txq_ctrl
+ * Pointer to Tx queue control structure.
+ *
+ * @return
+ * Zero on success, otherwise the parameters cannot be adjusted.
+ */
+static int
+txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl)
+{
+ struct mlx5_priv *priv = txq_ctrl->priv;
+ struct mlx5_dev_config *config = &priv->config;
+ unsigned int max_inline;
+
+ max_inline = txq_calc_inline_max(txq_ctrl);
+ if (!txq_ctrl->txq.inlen_send) {
+ /*
+ * Inline data feature is not engaged at all.
+ * There is nothing to adjust.
+ */
+ return 0;
+ }
+ if (txq_ctrl->max_inline_data <= max_inline) {
+ /*
+ * The requested inline data length does not
+ * exceed queue capabilities.
+ */
+ return 0;
+ }
+ if (txq_ctrl->txq.inlen_mode > max_inline) {
+ DRV_LOG(ERR,
+ "minimal data inline requirements (%u) are not"
+ " satisfied (%u) on port %u, try the smaller"
+ " Tx queue size (%d)",
+ txq_ctrl->txq.inlen_mode, max_inline,
+ priv->dev_data->port_id,
+ priv->sh->device_attr.orig_attr.max_qp_wr);
+ goto error;
+ }
+ if (txq_ctrl->txq.inlen_send > max_inline &&
+ config->txq_inline_max != MLX5_ARG_UNSET &&
+ config->txq_inline_max > (int)max_inline) {
+ DRV_LOG(ERR,
+ "txq_inline_max requirements (%u) are not"
+ " satisfied (%u) on port %u, try the smaller"
+ " Tx queue size (%d)",
+ txq_ctrl->txq.inlen_send, max_inline,
+ priv->dev_data->port_id,
+ priv->sh->device_attr.orig_attr.max_qp_wr);
+ goto error;
+ }
+ if (txq_ctrl->txq.inlen_empw > max_inline &&
+ config->txq_inline_mpw != MLX5_ARG_UNSET &&
+ config->txq_inline_mpw > (int)max_inline) {
+ DRV_LOG(ERR,
+ "txq_inline_mpw requirements (%u) are not"
+ " satisfied (%u) on port %u, try the smaller"
+ " Tx queue size (%d)",
+ txq_ctrl->txq.inlen_empw, max_inline,
+ priv->dev_data->port_id,
+ priv->sh->device_attr.orig_attr.max_qp_wr);
+ goto error;
+ }
+ if (txq_ctrl->txq.tso_en && max_inline < MLX5_MAX_TSO_HEADER) {
+ DRV_LOG(ERR,
+ "tso header inline requirements (%u) are not"
+ " satisfied (%u) on port %u, try the smaller"
+ " Tx queue size (%d)",
+ MLX5_MAX_TSO_HEADER, max_inline,
+ priv->dev_data->port_id,
+ priv->sh->device_attr.orig_attr.max_qp_wr);
+ goto error;
+ }
+ if (txq_ctrl->txq.inlen_send > max_inline) {
+ DRV_LOG(WARNING,
+ "adjust txq_inline_max (%u->%u)"
+ " due to large Tx queue on port %u",
+ txq_ctrl->txq.inlen_send, max_inline,
+ priv->dev_data->port_id);
+ txq_ctrl->txq.inlen_send = max_inline;
+ }
+ if (txq_ctrl->txq.inlen_empw > max_inline) {
+ DRV_LOG(WARNING,
+ "adjust txq_inline_mpw (%u->%u)"
+ "due to large Tx queue on port %u",
+ txq_ctrl->txq.inlen_empw, max_inline,
+ priv->dev_data->port_id);
+ txq_ctrl->txq.inlen_empw = max_inline;
+ }
+ txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->txq.inlen_send,
+ txq_ctrl->txq.inlen_empw);
+ MLX5_ASSERT(txq_ctrl->max_inline_data <= max_inline);
+ MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= max_inline);
+ MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_send);
+ MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_empw ||
+ !txq_ctrl->txq.inlen_empw);
+ return 0;
+error:
+ rte_errno = ENOMEM;
+ return -ENOMEM;
+}
+
+/**
+ * Create a DPDK Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * TX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters.
+ *
+ * @return
+ * A DPDK queue object on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_txq_ctrl *
+mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_txconf *conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *tmpl;
+
+ tmpl = rte_calloc_socket("TXQ", 1,
+ sizeof(*tmpl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
+ if (!tmpl) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ if (mlx5_mr_btree_init(&tmpl->txq.mr_ctrl.cache_bh,
+ MLX5_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
+ goto error;
+ }
+ /* Save pointer of global generation number to check memory event. */
+ tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->share_cache.dev_gen;
+ MLX5_ASSERT(desc > MLX5_TX_COMP_THRESH);
+ tmpl->txq.offloads = conf->offloads |
+ dev->data->dev_conf.txmode.offloads;
+ tmpl->priv = priv;
+ tmpl->socket = socket;
+ tmpl->txq.elts_n = log2above(desc);
+ tmpl->txq.elts_s = desc;
+ tmpl->txq.elts_m = desc - 1;
+ tmpl->txq.port_id = dev->data->port_id;
+ tmpl->txq.idx = idx;
+ txq_set_params(tmpl);
+ if (txq_adjust_params(tmpl))
+ goto error;
+ if (txq_calc_wqebb_cnt(tmpl) >
+ priv->sh->device_attr.orig_attr.max_qp_wr) {
+ DRV_LOG(ERR,
+ "port %u Tx WQEBB count (%d) exceeds the limit (%d),"
+ " try smaller queue size",
+ dev->data->port_id, txq_calc_wqebb_cnt(tmpl),
+ priv->sh->device_attr.orig_attr.max_qp_wr);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ rte_atomic32_inc(&tmpl->refcnt);
+ tmpl->type = MLX5_TXQ_TYPE_STANDARD;
+ LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
+ return tmpl;
+error:
+ rte_free(tmpl);
+ return NULL;
+}
+
+/**
+ * Create a DPDK Tx hairpin queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * TX queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param hairpin_conf
+ * The hairpin configuration.
+ *
+ * @return
+ * A DPDK queue object on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_txq_ctrl *
+mlx5_txq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *tmpl;
+
+ tmpl = rte_calloc_socket("TXQ", 1,
+ sizeof(*tmpl), 0, SOCKET_ID_ANY);
+ if (!tmpl) {
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ tmpl->priv = priv;
+ tmpl->socket = SOCKET_ID_ANY;
+ tmpl->txq.elts_n = log2above(desc);
+ tmpl->txq.port_id = dev->data->port_id;
+ tmpl->txq.idx = idx;
+ tmpl->hairpin_conf = *hairpin_conf;
+ tmpl->type = MLX5_TXQ_TYPE_HAIRPIN;
+ rte_atomic32_inc(&tmpl->refcnt);
+ LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
+ return tmpl;
+}
+
+/**
+ * Get a Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * TX queue index.
+ *
+ * @return
+ * A pointer to the queue if it exists.
+ */
+struct mlx5_txq_ctrl *
+mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *ctrl = NULL;
+
+ if ((*priv->txqs)[idx]) {
+ ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl,
+ txq);
+ mlx5_txq_obj_get(dev, idx);
+ rte_atomic32_inc(&ctrl->refcnt);
+ }
+ return ctrl;
+}
+
+/**
+ * Release a Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * TX queue index.
+ *
+ * @return
+ * 1 while a reference on it exists, 0 when freed.
+ */
+int
+mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *txq;
+
+ if (!(*priv->txqs)[idx])
+ return 0;
+ txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+ if (txq->obj && !mlx5_txq_obj_release(txq->obj))
+ txq->obj = NULL;
+ if (rte_atomic32_dec_and_test(&txq->refcnt)) {
+ txq_free_elts(txq);
+ mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh);
+ LIST_REMOVE(txq, next);
+ rte_free(txq);
+ (*priv->txqs)[idx] = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Verify if the queue can be released.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * TX queue index.
+ *
+ * @return
+ * 1 if the queue can be released.
+ */
+int
+mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *txq;
+
+ if (!(*priv->txqs)[idx])
+ return -1;
+ txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+ return (rte_atomic32_read(&txq->refcnt) == 1);
+}
+
+/**
+ * Verify that the Tx queue list is empty.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * The number of objects not released.
+ */
+int
+mlx5_txq_verify(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_ctrl *txq_ctrl;
+ int ret = 0;
+
+ LIST_FOREACH(txq_ctrl, &priv->txqsctrl, next) {
+ DRV_LOG(DEBUG, "port %u Tx queue %u still referenced",
+ dev->data->port_id, txq_ctrl->txq.idx);
+ ++ret;
+ }
+ return ret;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c
new file mode 100644
index 000000000..d29fbcbc8
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c
@@ -0,0 +1,484 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <rte_malloc.h>
+#include <rte_hash_crc.h>
+
+#include "mlx5_utils.h"
+
+struct mlx5_hlist *
+mlx5_hlist_create(const char *name, uint32_t size)
+{
+ struct mlx5_hlist *h;
+ uint32_t act_size;
+ uint32_t alloc_size;
+
+ if (!size)
+ return NULL;
+ /* Align to the next power of 2; a 32-bit integer is enough for now. */
+ if (!rte_is_power_of_2(size)) {
+ act_size = rte_align32pow2(size);
+ DRV_LOG(WARNING, "Size 0x%" PRIX32 " is not power of 2, will "
+ "be aligned to 0x%" PRIX32 ".\n", size, act_size);
+ } else {
+ act_size = size;
+ }
+ alloc_size = sizeof(struct mlx5_hlist) +
+ sizeof(struct mlx5_hlist_head) * act_size;
+ /* Using zmalloc, then no need to initialize the heads. */
+ h = rte_zmalloc(name, alloc_size, RTE_CACHE_LINE_SIZE);
+ if (!h) {
+ DRV_LOG(ERR, "No memory for hash list %s creation\n",
+ name ? name : "None");
+ return NULL;
+ }
+ if (name)
+ snprintf(h->name, MLX5_HLIST_NAMESIZE, "%s", name);
+ h->table_sz = act_size;
+ h->mask = act_size - 1;
+ DRV_LOG(DEBUG, "Hash list with %s size 0x%" PRIX32 " is created.\n",
+ h->name, act_size);
+ return h;
+}
+
+struct mlx5_hlist_entry *
+mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key)
+{
+ uint32_t idx;
+ struct mlx5_hlist_head *first;
+ struct mlx5_hlist_entry *node;
+
+ MLX5_ASSERT(h);
+ idx = rte_hash_crc_8byte(key, 0) & h->mask;
+ first = &h->heads[idx];
+ LIST_FOREACH(node, first, next) {
+ if (node->key == key)
+ return node;
+ }
+ return NULL;
+}
+
+int
+mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry)
+{
+ uint32_t idx;
+ struct mlx5_hlist_head *first;
+ struct mlx5_hlist_entry *node;
+
+ MLX5_ASSERT(h && entry);
+ idx = rte_hash_crc_8byte(entry->key, 0) & h->mask;
+ first = &h->heads[idx];
+ /* No need to reuse the lookup function. */
+ LIST_FOREACH(node, first, next) {
+ if (node->key == entry->key)
+ return -EEXIST;
+ }
+ LIST_INSERT_HEAD(first, entry, next);
+ return 0;
+}
+
+void
+mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused,
+ struct mlx5_hlist_entry *entry)
+{
+ MLX5_ASSERT(entry && entry->next.le_prev);
+ LIST_REMOVE(entry, next);
+ /* Set to NULL to prevent the entry from being removed more than once. */
+ entry->next.le_prev = NULL;
+}
+
+void
+mlx5_hlist_destroy(struct mlx5_hlist *h,
+ mlx5_hlist_destroy_callback_fn cb, void *ctx)
+{
+ uint32_t idx;
+ struct mlx5_hlist_entry *entry;
+
+ MLX5_ASSERT(h);
+ for (idx = 0; idx < h->table_sz; ++idx) {
+ /* no LIST_FOREACH_SAFE, using while instead */
+ while (!LIST_EMPTY(&h->heads[idx])) {
+ entry = LIST_FIRST(&h->heads[idx]);
+ LIST_REMOVE(entry, next);
+ /*
+ * The user owns the whole element that contains the data
+ * entry, so it is the user's duty to do the cleanup and
+ * freeing, because the hlist entry may not be placed at the
+ * beginning of the element (although that is suggested).
+ * Otherwise the default free function is used.
+ */
+ if (cb)
+ cb(entry, ctx);
+ else
+ rte_free(entry);
+ }
+ }
+ rte_free(h);
+}
+
+static inline void
+mlx5_ipool_lock(struct mlx5_indexed_pool *pool)
+{
+ if (pool->cfg.need_lock)
+ rte_spinlock_lock(&pool->lock);
+}
+
+static inline void
+mlx5_ipool_unlock(struct mlx5_indexed_pool *pool)
+{
+ if (pool->cfg.need_lock)
+ rte_spinlock_unlock(&pool->lock);
+}
+
+static inline uint32_t
+mlx5_trunk_idx_get(struct mlx5_indexed_pool *pool, uint32_t entry_idx)
+{
+ struct mlx5_indexed_pool_config *cfg = &pool->cfg;
+ uint32_t trunk_idx = 0;
+ uint32_t i;
+
+ if (!cfg->grow_trunk)
+ return entry_idx / cfg->trunk_size;
+ if (entry_idx >= pool->grow_tbl[cfg->grow_trunk - 1]) {
+ trunk_idx = (entry_idx - pool->grow_tbl[cfg->grow_trunk - 1]) /
+ (cfg->trunk_size << (cfg->grow_shift *
+ cfg->grow_trunk)) + cfg->grow_trunk;
+ } else {
+ for (i = 0; i < cfg->grow_trunk; i++) {
+ if (entry_idx < pool->grow_tbl[i])
+ break;
+ }
+ trunk_idx = i;
+ }
+ return trunk_idx;
+}
+
+static inline uint32_t
+mlx5_trunk_size_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx)
+{
+ struct mlx5_indexed_pool_config *cfg = &pool->cfg;
+
+ return cfg->trunk_size << (cfg->grow_shift *
+ (trunk_idx > cfg->grow_trunk ? cfg->grow_trunk : trunk_idx));
+}
+
+static inline uint32_t
+mlx5_trunk_idx_offset_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx)
+{
+ struct mlx5_indexed_pool_config *cfg = &pool->cfg;
+ uint32_t offset = 0;
+
+ if (!trunk_idx)
+ return 0;
+ if (!cfg->grow_trunk)
+ return cfg->trunk_size * trunk_idx;
+ if (trunk_idx < cfg->grow_trunk)
+ offset = pool->grow_tbl[trunk_idx - 1];
+ else
+ offset = pool->grow_tbl[cfg->grow_trunk - 1] +
+ (cfg->trunk_size << (cfg->grow_shift *
+ cfg->grow_trunk)) * (trunk_idx - cfg->grow_trunk);
+ return offset;
+}
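+
+/*
+ * Worked example for the three helpers above, with a hypothetical
+ * configuration of trunk_size = 64, grow_shift = 2 and grow_trunk = 3:
+ * mlx5_ipool_create() builds grow_tbl = {64, 320, 1344} and the trunk
+ * sizes are 64, 256, 1024, 4096, 4096, ... Entry index 1000 maps to
+ * trunk 2 (offset 320, in-trunk index 680), while entry index 1500 is
+ * past the grow table and maps to trunk (1500 - 1344) / 4096 + 3 = 3.
+ */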
+
+struct mlx5_indexed_pool *
+mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg)
+{
+ struct mlx5_indexed_pool *pool;
+ uint32_t i;
+
+ if (!cfg || !cfg->size || (!cfg->malloc ^ !cfg->free) ||
+ (cfg->trunk_size && ((cfg->trunk_size & (cfg->trunk_size - 1)) ||
+ ((__builtin_ffs(cfg->trunk_size) + TRUNK_IDX_BITS) > 32))))
+ return NULL;
+ pool = rte_zmalloc("mlx5_ipool", sizeof(*pool) + cfg->grow_trunk *
+ sizeof(pool->grow_tbl[0]), RTE_CACHE_LINE_SIZE);
+ if (!pool)
+ return NULL;
+ pool->cfg = *cfg;
+ if (!pool->cfg.trunk_size)
+ pool->cfg.trunk_size = MLX5_IPOOL_DEFAULT_TRUNK_SIZE;
+ if (!cfg->malloc && !cfg->free) {
+ pool->cfg.malloc = rte_malloc_socket;
+ pool->cfg.free = rte_free;
+ }
+ pool->free_list = TRUNK_INVALID;
+ if (pool->cfg.need_lock)
+ rte_spinlock_init(&pool->lock);
+ /*
+ * Initialize the dynamic grow trunk size lookup table to have a quick
+ * lookup for the trunk entry index offset.
+ */
+ for (i = 0; i < cfg->grow_trunk; i++) {
+ pool->grow_tbl[i] = cfg->trunk_size << (cfg->grow_shift * i);
+ if (i > 0)
+ pool->grow_tbl[i] += pool->grow_tbl[i - 1];
+ }
+ return pool;
+}
+
+static int
+mlx5_ipool_grow(struct mlx5_indexed_pool *pool)
+{
+ struct mlx5_indexed_trunk *trunk;
+ struct mlx5_indexed_trunk **trunk_tmp;
+ struct mlx5_indexed_trunk **p;
+ size_t trunk_size = 0;
+ size_t data_size;
+ size_t bmp_size;
+ uint32_t idx;
+
+ if (pool->n_trunk_valid == TRUNK_MAX_IDX)
+ return -ENOMEM;
+ if (pool->n_trunk_valid == pool->n_trunk) {
+ /* No free trunk flags, expand trunk list. */
+ int n_grow = pool->n_trunk_valid ? pool->n_trunk :
+ RTE_CACHE_LINE_SIZE / sizeof(void *);
+
+ p = pool->cfg.malloc(pool->cfg.type,
+ (pool->n_trunk_valid + n_grow) *
+ sizeof(struct mlx5_indexed_trunk *),
+ RTE_CACHE_LINE_SIZE, rte_socket_id());
+ if (!p)
+ return -ENOMEM;
+ if (pool->trunks)
+ memcpy(p, pool->trunks, pool->n_trunk_valid *
+ sizeof(struct mlx5_indexed_trunk *));
+ memset(RTE_PTR_ADD(p, pool->n_trunk_valid * sizeof(void *)), 0,
+ n_grow * sizeof(void *));
+ trunk_tmp = pool->trunks;
+ pool->trunks = p;
+ if (trunk_tmp)
+ pool->cfg.free(trunk_tmp);
+ pool->n_trunk += n_grow;
+ }
+ if (!pool->cfg.release_mem_en) {
+ idx = pool->n_trunk_valid;
+ } else {
+ /* Find the first available slot in trunk list */
+ for (idx = 0; idx < pool->n_trunk; idx++)
+ if (pool->trunks[idx] == NULL)
+ break;
+ }
+ trunk_size += sizeof(*trunk);
+ data_size = mlx5_trunk_size_get(pool, idx);
+ bmp_size = rte_bitmap_get_memory_footprint(data_size);
+ /* rte_bitmap requires memory cacheline aligned. */
+ trunk_size += RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size);
+ trunk_size += bmp_size;
+ trunk = pool->cfg.malloc(pool->cfg.type, trunk_size,
+ RTE_CACHE_LINE_SIZE, rte_socket_id());
+ if (!trunk)
+ return -ENOMEM;
+ pool->trunks[idx] = trunk;
+ trunk->idx = idx;
+ trunk->free = data_size;
+ trunk->prev = TRUNK_INVALID;
+ trunk->next = TRUNK_INVALID;
+ MLX5_ASSERT(pool->free_list == TRUNK_INVALID);
+ pool->free_list = idx;
+ /* Mark all entries as available. */
+ trunk->bmp = rte_bitmap_init_with_all_set(data_size, &trunk->data
+ [RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size)],
+ bmp_size);
+ MLX5_ASSERT(trunk->bmp);
+ pool->n_trunk_valid++;
+#ifdef POOL_DEBUG
+ pool->trunk_new++;
+ pool->trunk_avail++;
+#endif
+ return 0;
+}
+
+void *
+mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx)
+{
+ struct mlx5_indexed_trunk *trunk;
+ uint64_t slab = 0;
+ uint32_t iidx = 0;
+ void *p;
+
+ mlx5_ipool_lock(pool);
+ if (pool->free_list == TRUNK_INVALID) {
+ /* If no available trunks, grow new. */
+ if (mlx5_ipool_grow(pool)) {
+ mlx5_ipool_unlock(pool);
+ return NULL;
+ }
+ }
+ MLX5_ASSERT(pool->free_list != TRUNK_INVALID);
+ trunk = pool->trunks[pool->free_list];
+ MLX5_ASSERT(trunk->free);
+ if (!rte_bitmap_scan(trunk->bmp, &iidx, &slab)) {
+ mlx5_ipool_unlock(pool);
+ return NULL;
+ }
+ MLX5_ASSERT(slab);
+ iidx += __builtin_ctzll(slab);
+ MLX5_ASSERT(iidx != UINT32_MAX);
+ MLX5_ASSERT(iidx < mlx5_trunk_size_get(pool, trunk->idx));
+ rte_bitmap_clear(trunk->bmp, iidx);
+ p = &trunk->data[iidx * pool->cfg.size];
+ iidx += mlx5_trunk_idx_offset_get(pool, trunk->idx);
+ iidx += 1; /* non-zero index. */
+ trunk->free--;
+#ifdef POOL_DEBUG
+ pool->n_entry++;
+#endif
+ if (!trunk->free) {
+ /* The now-full trunk is removed from the free list here, in malloc. */
+ MLX5_ASSERT(pool->free_list == trunk->idx);
+ pool->free_list = trunk->next;
+ if (trunk->next != TRUNK_INVALID)
+ pool->trunks[trunk->next]->prev = TRUNK_INVALID;
+ trunk->prev = TRUNK_INVALID;
+ trunk->next = TRUNK_INVALID;
+#ifdef POOL_DEBUG
+ pool->trunk_empty++;
+ pool->trunk_avail--;
+#endif
+ }
+ *idx = iidx;
+ mlx5_ipool_unlock(pool);
+ return p;
+}
+
+void *
+mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx)
+{
+ void *entry = mlx5_ipool_malloc(pool, idx);
+
+ if (entry)
+ memset(entry, 0, pool->cfg.size);
+ return entry;
+}
+
+void
+mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx)
+{
+ struct mlx5_indexed_trunk *trunk;
+ uint32_t trunk_idx;
+ uint32_t entry_idx;
+
+ if (!idx)
+ return;
+ idx -= 1;
+ mlx5_ipool_lock(pool);
+ trunk_idx = mlx5_trunk_idx_get(pool, idx);
+ if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) ||
+ (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk))
+ goto out;
+ trunk = pool->trunks[trunk_idx];
+ if (!trunk)
+ goto out;
+ entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx);
+ if (trunk_idx != trunk->idx ||
+ rte_bitmap_get(trunk->bmp, entry_idx))
+ goto out;
+ rte_bitmap_set(trunk->bmp, entry_idx);
+ trunk->free++;
+ if (pool->cfg.release_mem_en && trunk->free == mlx5_trunk_size_get
+ (pool, trunk->idx)) {
+ if (pool->free_list == trunk->idx)
+ pool->free_list = trunk->next;
+ if (trunk->next != TRUNK_INVALID)
+ pool->trunks[trunk->next]->prev = trunk->prev;
+ if (trunk->prev != TRUNK_INVALID)
+ pool->trunks[trunk->prev]->next = trunk->next;
+ pool->cfg.free(trunk);
+ pool->trunks[trunk_idx] = NULL;
+ pool->n_trunk_valid--;
+#ifdef POOL_DEBUG
+ pool->trunk_avail--;
+ pool->trunk_free++;
+#endif
+ if (pool->n_trunk_valid == 0) {
+ pool->cfg.free(pool->trunks);
+ pool->trunks = NULL;
+ pool->n_trunk = 0;
+ }
+ } else if (trunk->free == 1) {
+ /* Put into free trunk list head. */
+ MLX5_ASSERT(pool->free_list != trunk->idx);
+ trunk->next = pool->free_list;
+ trunk->prev = TRUNK_INVALID;
+ if (pool->free_list != TRUNK_INVALID)
+ pool->trunks[pool->free_list]->prev = trunk->idx;
+ pool->free_list = trunk->idx;
+#ifdef POOL_DEBUG
+ pool->trunk_empty--;
+ pool->trunk_avail++;
+#endif
+ }
+#ifdef POOL_DEBUG
+ pool->n_entry--;
+#endif
+out:
+ mlx5_ipool_unlock(pool);
+}
+
+void *
+mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx)
+{
+ struct mlx5_indexed_trunk *trunk;
+ void *p = NULL;
+ uint32_t trunk_idx;
+ uint32_t entry_idx;
+
+ if (!idx)
+ return NULL;
+ idx -= 1;
+ mlx5_ipool_lock(pool);
+ trunk_idx = mlx5_trunk_idx_get(pool, idx);
+ if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) ||
+ (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk))
+ goto out;
+ trunk = pool->trunks[trunk_idx];
+ if (!trunk)
+ goto out;
+ entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx);
+ if (trunk_idx != trunk->idx ||
+ rte_bitmap_get(trunk->bmp, entry_idx))
+ goto out;
+ p = &trunk->data[entry_idx * pool->cfg.size];
+out:
+ mlx5_ipool_unlock(pool);
+ return p;
+}
+
+int
+mlx5_ipool_destroy(struct mlx5_indexed_pool *pool)
+{
+ struct mlx5_indexed_trunk **trunks;
+ uint32_t i;
+
+ MLX5_ASSERT(pool);
+ mlx5_ipool_lock(pool);
+ trunks = pool->trunks;
+ for (i = 0; i < pool->n_trunk; i++) {
+ if (trunks[i])
+ pool->cfg.free(trunks[i]);
+ }
+ if (pool->trunks)
+ pool->cfg.free(pool->trunks);
+ mlx5_ipool_unlock(pool);
+ rte_free(pool);
+ return 0;
+}
+
+void
+mlx5_ipool_dump(struct mlx5_indexed_pool *pool)
+{
+ printf("Pool %s entry size %u, trunks %u, %d entry per trunk, "
+ "total: %d\n",
+ pool->cfg.type, pool->cfg.size, pool->n_trunk_valid,
+ pool->cfg.trunk_size, pool->n_trunk_valid);
+#ifdef POOL_DEBUG
+ printf("Pool %s entry %u, trunk alloc %u, empty: %u, "
+ "available %u free %u\n",
+ pool->cfg.type, pool->n_entry, pool->trunk_new,
+ pool->trunk_empty, pool->trunk_avail, pool->trunk_free);
+#endif
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h
new file mode 100644
index 000000000..f4ec15170
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h
@@ -0,0 +1,423 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_UTILS_H_
+#define RTE_PMD_MLX5_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <limits.h>
+#include <errno.h>
+
+#include <rte_spinlock.h>
+#include <rte_memory.h>
+#include <rte_bitmap.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_defs.h"
+
+
+/* Convert a bit number to the corresponding 64-bit mask */
+#define MLX5_BITSHIFT(v) (UINT64_C(1) << (v))
+
+/* Save and restore errno around argument evaluation. */
+#define ERRNO_SAFE(x) ((errno = (int []){ errno, ((x), 0) }[0]))
+
+extern int mlx5_logtype;
+
+/* Generic printf()-like logging macro with automatic line feed. */
+#define DRV_LOG(level, ...) \
+ PMD_DRV_LOG_(level, mlx5_logtype, MLX5_DRIVER_NAME, \
+ __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \
+ PMD_DRV_LOG_CPAREN)
+
+#define INFO(...) DRV_LOG(INFO, __VA_ARGS__)
+#define WARN(...) DRV_LOG(WARNING, __VA_ARGS__)
+#define ERROR(...) DRV_LOG(ERR, __VA_ARGS__)
+
+/* Convenience macros for accessing mbuf fields. */
+#define NEXT(m) ((m)->next)
+#define DATA_LEN(m) ((m)->data_len)
+#define PKT_LEN(m) ((m)->pkt_len)
+#define DATA_OFF(m) ((m)->data_off)
+#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
+#define NB_SEGS(m) ((m)->nb_segs)
+#define PORT(m) ((m)->port)
+
+/* Transpose flags. Useful to convert IBV to DPDK flags. */
+#define TRANSPOSE(val, from, to) \
+ (((from) >= (to)) ? \
+ (((val) & (from)) / ((from) / (to))) : \
+ (((val) & (from)) * ((to) / (from))))
+
+/*
+ * The indexed memory entry index is made up of the trunk index and the
+ * offset of the entry within the trunk. Since the entry index is 32 bits,
+ * a user who prefers small trunks can change the macro below to a larger
+ * number, which lets the pool contain more trunks with many entries
+ * allocated.
+ */
+#define TRUNK_IDX_BITS 16
+#define TRUNK_MAX_IDX ((1 << TRUNK_IDX_BITS) - 1)
+#define TRUNK_INVALID TRUNK_MAX_IDX
+#define MLX5_IPOOL_DEFAULT_TRUNK_SIZE (1 << (28 - TRUNK_IDX_BITS))
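+/*
+ * For reference: with TRUNK_IDX_BITS = 16, trunk indexes fit in 16 bits
+ * (TRUNK_MAX_IDX = 65535 is reserved as the invalid marker) and the
+ * default trunk holds 1 << (28 - 16) = 4096 entries.
+ */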
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+#define POOL_DEBUG 1
+#endif
+
+struct mlx5_indexed_pool_config {
+ uint32_t size; /* Pool entry size. */
+ uint32_t trunk_size:22;
+ /*
+ * Number of entries per trunk. Must be a power of 2. It can be
+ * increased if trunk growth is enabled: the entry number grows by
+ * left-shifting with grow_shift. Trunks with an index beyond
+ * grow_trunk keep the same entry number as the last grown trunk.
+ */
+ uint32_t grow_trunk:4;
+ /*
+	 * Number of trunks whose entry count grows. Set it to 0 to make
+	 * the pool work with a fixed trunk entry count. Effective only
+	 * when grow_shift is not 0.
+ */
+ uint32_t grow_shift:4;
+ /*
+	 * Left-shift value by which the trunk entry count grows; growth
+	 * stops after grow_trunk trunks. Effective only when grow_trunk
+	 * is not 0.
+ */
+ uint32_t need_lock:1;
+ /* Lock is needed for multiple thread usage. */
+	uint32_t release_mem_en:1; /* Release trunk memory when it is free. */
+ const char *type; /* Memory allocate type name. */
+ void *(*malloc)(const char *type, size_t size, unsigned int align,
+ int socket);
+ /* User defined memory allocator. */
+ void (*free)(void *addr); /* User defined memory release. */
+};
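+
+/*
+ * Growth sketch for the fields above, for illustration only (the values are
+ * made up, not defaults): with trunk_size = 64, grow_shift = 1 and
+ * grow_trunk = 3, consecutive trunks hold roughly 64, 128, 256, ... entries,
+ * and once grow_trunk trunks have grown, later trunks keep the entry count
+ * of the last growing trunk.
+ */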
+
+struct mlx5_indexed_trunk {
+ uint32_t idx; /* Trunk id. */
+ uint32_t prev; /* Previous free trunk in free list. */
+ uint32_t next; /* Next free trunk in free list. */
+ uint32_t free; /* Free entries available */
+ struct rte_bitmap *bmp;
+ uint8_t data[] __rte_cache_aligned; /* Entry data start. */
+};
+
+struct mlx5_indexed_pool {
+ struct mlx5_indexed_pool_config cfg; /* Indexed pool configuration. */
+ rte_spinlock_t lock; /* Pool lock for multiple thread usage. */
+ uint32_t n_trunk_valid; /* Trunks allocated. */
+	uint32_t n_trunk; /* Trunk pointer array size. */
+	struct mlx5_indexed_trunk **trunks; /* Trunk pointer array. */
+ uint32_t free_list; /* Index to first free trunk. */
+#ifdef POOL_DEBUG
+ uint32_t n_entry;
+ uint32_t trunk_new;
+ uint32_t trunk_avail;
+ uint32_t trunk_empty;
+ uint32_t trunk_free;
+#endif
+ uint32_t grow_tbl[]; /* Save the index offset for the grow trunks. */
+};
+
+/**
+ * Return logarithm of the nearest power of two above input value.
+ *
+ * @param v
+ * Input value.
+ *
+ * @return
+ * Logarithm of the nearest power of two above input value.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+ unsigned int l;
+ unsigned int r;
+
+ for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+ r |= (v & 1);
+ return l + r;
+}
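+
+/*
+ * For illustration: log2above(1) == 0, log2above(4) == 2 and
+ * log2above(5) == 3, since 5 rounds up to 8.
+ */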
+
+/** Maximum size of string for naming the hlist table. */
+#define MLX5_HLIST_NAMESIZE 32
+
+/**
+ * Structure of an entry in the hash list. Users should define their own
+ * structure embedding this one in order to store the data. The 'key' is
+ * currently 64 bits wide and it is the user's responsibility to guarantee
+ * that there is no collision.
+ */
+struct mlx5_hlist_entry {
+ LIST_ENTRY(mlx5_hlist_entry) next; /* entry pointers in the list. */
+ uint64_t key; /* user defined 'key', could be the hash signature. */
+};
+
+/** Structure for hash head. */
+LIST_HEAD(mlx5_hlist_head, mlx5_hlist_entry);
+
+/** Type of function that is used to handle the data before freeing. */
+typedef void (*mlx5_hlist_destroy_callback_fn)(void *p, void *ctx);
+
+/** hash list table structure */
+struct mlx5_hlist {
+ char name[MLX5_HLIST_NAMESIZE]; /**< Name of the hash list. */
+	uint32_t table_sz; /**< Number of heads, must be a power of 2. */
+	uint32_t mask; /**< Mask to get the index of the list heads. */
+ struct mlx5_hlist_head heads[]; /**< list head arrays. */
+};
+
+/**
+ * Create a hash list table. The user can specify the size of the list heads
+ * array of the table; it should be a power of 2 in order to get a better
+ * distribution of the entries. Each entry is part of a larger data element
+ * and the caller is responsible for allocating and freeing that data
+ * element. The key of each entry is hashed with CRC in order to generate a
+ * slightly fairer distribution.
+ *
+ * @param name
+ * Name of the hash list(optional).
+ * @param size
+ * Heads array size of the hash list.
+ *
+ * @return
+ * Pointer of the hash list table created, NULL on failure.
+ */
+struct mlx5_hlist *mlx5_hlist_create(const char *name, uint32_t size);
+
+/**
+ * Search an entry matching the key.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param key
+ * Key for the searching entry.
+ *
+ * @return
+ * Pointer of the hlist entry if found, NULL otherwise.
+ */
+struct mlx5_hlist_entry *mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key);
+
+/**
+ * Insert an entry into the hash list table. The entry is only part of a
+ * larger data element and a 64-bit key is used for matching. The user
+ * should construct the key, or provide a computed hash signature, and
+ * guarantee that there is no collision.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param entry
+ * Entry to be inserted into the hash list table.
+ *
+ * @return
+ * - zero for success.
+ * - -EEXIST if the entry is already inserted.
+ */
+int mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry);
+
+/**
+ * Remove an entry from the hash list table. User should guarantee the validity
+ * of the entry.
+ *
+ * @param h
+ *   Pointer to the hash list table (not used).
+ * @param entry
+ * Entry to be removed from the hash list table.
+ */
+void mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused,
+ struct mlx5_hlist_entry *entry);
+
+/**
+ * Destroy the hash list table. All the entries already inserted into the
+ * lists are handled by the callback function provided by the user (freed
+ * there if needed) before the table itself is freed.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param cb
+ * Callback function for each inserted entry when destroying the hash list.
+ * @param ctx
+ * Common context parameter used by callback function for each entry.
+ */
+void mlx5_hlist_destroy(struct mlx5_hlist *h,
+ mlx5_hlist_destroy_callback_fn cb, void *ctx);
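+
+/*
+ * Illustrative usage sketch of the hash list API above, not part of the API.
+ * The struct foo_data, the 'key' variable and the rte_zmalloc() allocation
+ * are assumptions made only for this example; the caller owns the allocation
+ * and cleanup of the embedding data element.
+ *
+ *	struct foo_data {
+ *		struct mlx5_hlist_entry entry;
+ *		uint32_t payload;
+ *	};
+ *
+ *	struct mlx5_hlist *h = mlx5_hlist_create("foo_tbl", 64);
+ *	struct foo_data *data = rte_zmalloc(NULL, sizeof(*data), 0);
+ *
+ *	data->entry.key = key;
+ *	if (h && data && !mlx5_hlist_insert(h, &data->entry)) {
+ *		struct mlx5_hlist_entry *e = mlx5_hlist_lookup(h, key);
+ *		...
+ *		mlx5_hlist_remove(h, e);
+ *	}
+ */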
+
+/**
+ * This function allocates non-initialized memory entry from pool.
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from memory trunk, no alignment.
+ *
+ * @param pool
+ * Pointer to indexed memory entry pool.
+ * No initialization required.
+ * @param[out] idx
+ * Pointer to memory to save allocated index.
+ * Memory index always positive value.
+ * @return
+ * - Pointer to the allocated memory entry.
+ * - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function allocates zero initialized memory entry from pool.
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from memory trunk, no alignment.
+ *
+ * @param pool
+ * Pointer to indexed memory pool.
+ * No initialization required.
+ * @param[out] idx
+ * Pointer to memory to save allocated index.
+ * Memory index always positive value.
+ * @return
+ *   - Pointer to the allocated memory entry.
+ * - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function frees indexed memory entry to pool.
+ * Caller has to make sure that the index is allocated from the same pool.
+ *
+ * @param pool
+ * Pointer to indexed memory pool.
+ * @param idx
+ * Allocated memory entry index.
+ */
+void mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function returns pointer of indexed memory entry from index.
+ * Caller has to make sure that the index is valid, and allocated
+ * from the same pool.
+ *
+ * @param pool
+ * Pointer to indexed memory pool.
+ * @param idx
+ * Allocated memory index.
+ * @return
+ * - Pointer to indexed memory entry.
+ */
+void *mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function creates an indexed memory pool.
+ * The caller has to fill in the configuration accordingly.
+ *
+ * @param cfg
+ *   Pointer to the indexed memory pool configuration.
+ * @return
+ *   - Pointer to the created pool.
+ *   - NULL on error (not enough memory or invalid configuration).
+ */
+struct mlx5_indexed_pool *
+mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg);
+
+/**
+ * This function releases all resources of pool.
+ * The caller has to make sure that no index or memory allocated
+ * from this pool is referenced anymore.
+ *
+ * @param pool
+ * Pointer to indexed memory pool.
+ * @return
+ * - non-zero value on error.
+ * - 0 on success.
+ */
+int mlx5_ipool_destroy(struct mlx5_indexed_pool *pool);
+
+/**
+ * This function dumps debug info of pool.
+ *
+ * @param pool
+ * Pointer to indexed memory pool.
+ */
+void mlx5_ipool_dump(struct mlx5_indexed_pool *pool);
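+
+/*
+ * Illustrative usage sketch of the indexed pool API above, not part of the
+ * API. The entry type struct foo_entry, the trunk size of 64 and the
+ * rte_malloc_socket()/rte_free() allocators are assumptions made only for
+ * this example.
+ *
+ *	struct mlx5_indexed_pool_config cfg = {
+ *		.size = sizeof(struct foo_entry),
+ *		.trunk_size = 64,
+ *		.need_lock = 1,
+ *		.release_mem_en = 1,
+ *		.malloc = rte_malloc_socket,
+ *		.free = rte_free,
+ *		.type = "foo_ipool",
+ *	};
+ *	struct mlx5_indexed_pool *pool = mlx5_ipool_create(&cfg);
+ *	uint32_t idx;
+ *	struct foo_entry *e = mlx5_ipool_zmalloc(pool, &idx);
+ *
+ *	...
+ *	e = mlx5_ipool_get(pool, idx);
+ *	mlx5_ipool_free(pool, idx);
+ *	mlx5_ipool_destroy(pool);
+ */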
+
+/*
+ * Macros for linked list based on indexed memory.
+ * Example data structure:
+ * struct Foo {
+ * ILIST_ENTRY(uint16_t) next;
+ * ...
+ * }
+ *
+ */
+#define ILIST_ENTRY(type) \
+struct { \
+ type prev; /* Index of previous element. */ \
+ type next; /* Index of next element. */ \
+}
+
+#define ILIST_INSERT(pool, head, idx, elem, field) \
+ do { \
+ typeof(elem) peer; \
+ MLX5_ASSERT((elem) && (idx)); \
+ (elem)->field.next = *(head); \
+ (elem)->field.prev = 0; \
+ if (*(head)) { \
+ (peer) = mlx5_ipool_get(pool, *(head)); \
+ if (peer) \
+ (peer)->field.prev = (idx); \
+ } \
+ *(head) = (idx); \
+ } while (0)
+
+#define ILIST_REMOVE(pool, head, idx, elem, field) \
+ do { \
+ typeof(elem) peer; \
+ MLX5_ASSERT(elem); \
+ MLX5_ASSERT(head); \
+ if ((elem)->field.prev) { \
+ (peer) = mlx5_ipool_get \
+ (pool, (elem)->field.prev); \
+ if (peer) \
+ (peer)->field.next = (elem)->field.next;\
+ } \
+ if ((elem)->field.next) { \
+ (peer) = mlx5_ipool_get \
+ (pool, (elem)->field.next); \
+ if (peer) \
+ (peer)->field.prev = (elem)->field.prev;\
+ } \
+ if (*(head) == (idx)) \
+ *(head) = (elem)->field.next; \
+ } while (0)
+
+#define ILIST_FOREACH(pool, head, idx, elem, field) \
+ for ((idx) = (head), (elem) = \
+ (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \
+ idx = (elem)->field.next, (elem) = \
+ (idx) ? mlx5_ipool_get(pool, idx) : NULL)
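+
+/*
+ * Illustrative sketch of the ILIST macros above, not part of the API. It
+ * assumes the struct Foo from the example above, allocated from an indexed
+ * pool 'pool', with 'head' holding the index of the first element and
+ * 'saved_idx' being a previously stored entry index.
+ *
+ *	uint16_t head = 0;
+ *	uint32_t idx;
+ *	struct Foo *elem = mlx5_ipool_zmalloc(pool, &idx);
+ *
+ *	ILIST_INSERT(pool, &head, idx, elem, next);
+ *	ILIST_FOREACH(pool, head, idx, elem, next)
+ *		...
+ *	elem = mlx5_ipool_get(pool, saved_idx);
+ *	ILIST_REMOVE(pool, &head, saved_idx, elem, next);
+ */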
+
+/* Single index list. */
+#define SILIST_ENTRY(type) \
+struct { \
+ type next; /* Index of next element. */ \
+}
+
+#define SILIST_INSERT(head, idx, elem, field) \
+ do { \
+ MLX5_ASSERT((elem) && (idx)); \
+ (elem)->field.next = *(head); \
+ *(head) = (idx); \
+ } while (0)
+
+#define SILIST_FOREACH(pool, head, idx, elem, field) \
+ for ((idx) = (head), (elem) = \
+ (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \
+ idx = (elem)->field.next, (elem) = \
+ (idx) ? mlx5_ipool_get(pool, idx) : NULL)
+
+#endif /* RTE_PMD_MLX5_UTILS_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c
new file mode 100644
index 000000000..f65e416da
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+
+
+/*
+ * Not needed by this file; included to work around the lack of off_t
+ * definition for mlx5dv.h with unpatched rdma-core versions.
+ */
+#include <sys/types.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx5dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev_driver.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_nl.h>
+
+#include "mlx5.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+/**
+ * DPDK callback to configure a VLAN filter.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param vlan_id
+ * VLAN ID to filter.
+ * @param on
+ * Toggle filter.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+
+ DRV_LOG(DEBUG, "port %u %s VLAN filter ID %" PRIu16,
+ dev->data->port_id, (on ? "enable" : "disable"), vlan_id);
+ MLX5_ASSERT(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter));
+ for (i = 0; (i != priv->vlan_filter_n); ++i)
+ if (priv->vlan_filter[i] == vlan_id)
+ break;
+ /* Check if there's room for another VLAN filter. */
+ if (i == RTE_DIM(priv->vlan_filter)) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ if (i < priv->vlan_filter_n) {
+ MLX5_ASSERT(priv->vlan_filter_n != 0);
+ /* Enabling an existing VLAN filter has no effect. */
+ if (on)
+ goto out;
+ /* Remove VLAN filter from list. */
+ --priv->vlan_filter_n;
+ memmove(&priv->vlan_filter[i],
+ &priv->vlan_filter[i + 1],
+ sizeof(priv->vlan_filter[i]) *
+ (priv->vlan_filter_n - i));
+ priv->vlan_filter[priv->vlan_filter_n] = 0;
+ } else {
+ MLX5_ASSERT(i == priv->vlan_filter_n);
+ /* Disabling an unknown VLAN filter has no effect. */
+ if (!on)
+ goto out;
+ /* Add new VLAN filter. */
+ priv->vlan_filter[priv->vlan_filter_n] = vlan_id;
+ ++priv->vlan_filter_n;
+ }
+out:
+ if (dev->data->dev_started)
+ return mlx5_traffic_restart(dev);
+ return 0;
+}
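+
+/*
+ * Illustrative sketch only: applications normally reach the callback above
+ * through the generic ethdev API rather than calling it directly; port_id
+ * and the VLAN ID 100 are assumptions made for the example.
+ *
+ *	ret = rte_eth_dev_vlan_filter(port_id, 100, 1);
+ */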
+
+/**
+ * Callback to set/reset VLAN stripping for a specific queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param queue
+ * RX queue index.
+ * @param on
+ * Enable/disable VLAN stripping.
+ */
+void
+mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[queue];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct ibv_wq_attr mod;
+ uint16_t vlan_offloads =
+ (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) |
+ 0;
+ int ret = 0;
+
+ /* Validate hw support */
+ if (!priv->config.hw_vlan_strip) {
+ DRV_LOG(ERR, "port %u VLAN stripping is not supported",
+ dev->data->port_id);
+ return;
+ }
+ /* Validate queue number */
+ if (queue >= priv->rxqs_n) {
+ DRV_LOG(ERR, "port %u VLAN stripping, invalid queue number %d",
+ dev->data->port_id, queue);
+ return;
+ }
+	DRV_LOG(DEBUG, "port %u set VLAN offloads 0x%x for port %u queue %d",
+ dev->data->port_id, vlan_offloads, rxq->port_id, queue);
+ if (!rxq_ctrl->obj) {
+ /* Update related bits in RX queue. */
+ rxq->vlan_strip = !!on;
+ return;
+ }
+ if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+ mod = (struct ibv_wq_attr){
+ .attr_mask = IBV_WQ_ATTR_FLAGS,
+ .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING,
+ .flags = vlan_offloads,
+ };
+ ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
+ } else if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) {
+ struct mlx5_devx_modify_rq_attr rq_attr;
+
+ memset(&rq_attr, 0, sizeof(rq_attr));
+ rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+ rq_attr.state = MLX5_RQC_STATE_RDY;
+ rq_attr.vsd = (on ? 0 : 1);
+ rq_attr.modify_bitmask = MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD;
+ ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
+ }
+ if (ret) {
+ DRV_LOG(ERR, "port %u failed to modify object %d stripping "
+ "mode: %s", dev->data->port_id,
+ rxq_ctrl->obj->type, strerror(rte_errno));
+ return;
+ }
+ /* Update related bits in RX queue. */
+ rxq->vlan_strip = !!on;
+}
+
+/**
+ * Callback to set/reset VLAN offloads for a port.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mask
+ * VLAN offload bit mask.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+
+ if (mask & ETH_VLAN_STRIP_MASK) {
+ int hw_vlan_strip = !!(dev->data->dev_conf.rxmode.offloads &
+ DEV_RX_OFFLOAD_VLAN_STRIP);
+
+ if (!priv->config.hw_vlan_strip) {
+ DRV_LOG(ERR, "port %u VLAN stripping is not supported",
+ dev->data->port_id);
+ return 0;
+ }
+ /* Run on every RX queue and set/reset VLAN stripping. */
+ for (i = 0; (i != priv->rxqs_n); i++)
+ mlx5_vlan_strip_queue_set(dev, i, hw_vlan_strip);
+ }
+ return 0;
+}
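+
+/*
+ * Illustrative sketch only: applications normally reach the two callbacks
+ * above through the generic ethdev API; port_id and queue 0 are assumptions
+ * made for the example, and passing only ETH_VLAN_STRIP_OFFLOAD requests
+ * stripping while leaving the other VLAN offload bits cleared.
+ *
+ *	rte_eth_dev_set_vlan_strip_on_queue(port_id, 0, 1);
+ *	rte_eth_dev_set_vlan_offload(port_id, ETH_VLAN_STRIP_OFFLOAD);
+ */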
+
+/*
+ * Release VLAN network device, created for VM workaround.
+ *
+ * @param[in] dev
+ * Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ * Object representing the network device to release.
+ */
+void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vlan)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context;
+ struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+ MLX5_ASSERT(vlan->created);
+ MLX5_ASSERT(priv->vmwa_context);
+ if (!vlan->created || !vmwa)
+ return;
+ vlan->created = 0;
+ MLX5_ASSERT(vlan_dev[vlan->tag].refcnt);
+ if (--vlan_dev[vlan->tag].refcnt == 0 &&
+ vlan_dev[vlan->tag].ifindex) {
+ mlx5_nl_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
+ vlan_dev[vlan->tag].ifindex = 0;
+ }
+}
+
+/**
+ * Acquire VLAN interface with specified tag for VM workaround.
+ *
+ * @param[in] dev
+ * Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ * Object representing the network device to acquire.
+ */
+void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vlan)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context;
+ struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+ MLX5_ASSERT(!vlan->created);
+ MLX5_ASSERT(priv->vmwa_context);
+ if (vlan->created || !vmwa)
+ return;
+ if (vlan_dev[vlan->tag].refcnt == 0) {
+ MLX5_ASSERT(!vlan_dev[vlan->tag].ifindex);
+ vlan_dev[vlan->tag].ifindex =
+ mlx5_nl_vlan_vmwa_create(vmwa, vmwa->vf_ifindex,
+ vlan->tag);
+ }
+ if (vlan_dev[vlan->tag].ifindex) {
+ vlan_dev[vlan->tag].refcnt++;
+ vlan->created = 1;
+ }
+}
+
+/*
+ * Create per ethernet device VLAN VM workaround context
+ */
+struct mlx5_nl_vlan_vmwa_context *
+mlx5_vlan_vmwa_init(struct rte_eth_dev *dev, uint32_t ifindex)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_config *config = &priv->config;
+ struct mlx5_nl_vlan_vmwa_context *vmwa;
+ enum rte_hypervisor hv_type;
+
+ /* Do not engage workaround over PF. */
+ if (!config->vf)
+ return NULL;
+	/* Check whether the desired virtual environment is present. */
+ hv_type = rte_hypervisor_get();
+ switch (hv_type) {
+ case RTE_HYPERVISOR_UNKNOWN:
+ case RTE_HYPERVISOR_VMWARE:
+ /*
+ * The "white list" of configurations
+ * to engage the workaround.
+ */
+ break;
+ default:
+ /*
+ * The configuration is not found in the "white list".
+ * We should not engage the VLAN workaround.
+ */
+ return NULL;
+ }
+ vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
+ if (!vmwa) {
+ DRV_LOG(WARNING,
+ "Can not allocate memory"
+ " for VLAN workaround context");
+ return NULL;
+ }
+ vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+ if (vmwa->nl_socket < 0) {
+ DRV_LOG(WARNING,
+ "Can not create Netlink socket"
+ " for VLAN workaround context");
+ rte_free(vmwa);
+ return NULL;
+ }
+ vmwa->vf_ifindex = ifindex;
+ /* Cleanup for existing VLAN devices. */
+ return vmwa;
+}
+
+/*
+ * Destroy per ethernet device VLAN VM workaround context
+ */
+void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *vmwa)
+{
+ unsigned int i;
+
+ /* Delete all remaining VLAN devices. */
+ for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
+ if (vmwa->vlan_dev[i].ifindex)
+ mlx5_nl_vlan_vmwa_delete(vmwa,
+ vmwa->vlan_dev[i].ifindex);
+ }
+ if (vmwa->nl_socket >= 0)
+ close(vmwa->nl_socket);
+ rte_free(vmwa);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h
new file mode 100644
index 000000000..8c6922835
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_PRIVATE_MLX5_H_
+#define RTE_PMD_PRIVATE_MLX5_H_
+
+/**
+ * @file
+ * MLX5 public header.
+ *
+ * This interface provides the ability to support private PMD
+ * dynamic flags.
+ */
+
+#define RTE_PMD_MLX5_FINE_GRANULARITY_INLINE "mlx5_fine_granularity_inline"
+
+/**
+ * Returns the dynamic flags name, that are supported.
+ *
+ * @param[out] names
+ * Array that is used to return the supported dynamic flags names.
+ * @param[in] n
+ * The number of elements in the names array.
+ *
+ * @return
+ * The number of dynamic flags that were copied if not negative.
+ * Otherwise:
+ * - ENOMEM - not enough entries in the array
+ * - EINVAL - invalid array entry
+ */
+__rte_experimental
+int rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n);
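+
+/*
+ * Illustrative call sketch, not part of this header. The 64-byte buffer size
+ * is an assumption for the example; each names[] entry must point to
+ * writable storage large enough for a flag name.
+ *
+ *	char buf[64];
+ *	char *names[] = { buf };
+ *	int n = rte_pmd_mlx5_get_dyn_flag_names(names, RTE_DIM(names));
+ *
+ *	if (n > 0)
+ *		... n dynamic flag names were copied into names[] ...
+ */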
+
+#endif
diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map
new file mode 100644
index 000000000..c8b1031b0
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map
@@ -0,0 +1,10 @@
+DPDK_20.0 {
+ local: *;
+};
+
+EXPERIMENTAL {
+ global:
+
+ # added in 20.02
+ rte_pmd_mlx5_get_dyn_flag_names;
+};