Diffstat (limited to 'src/spdk/dpdk/drivers/net/mlx5')
34 files changed, 45111 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/net/mlx5/Makefile b/src/spdk/dpdk/drivers/net/mlx5/Makefile new file mode 100644 index 000000000..2577ee5e5 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/Makefile @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2015 6WIND S.A. +# Copyright 2015 Mellanox Technologies, Ltd + +include $(RTE_SDK)/mk/rte.vars.mk + +# Library name. +LIB = librte_pmd_mlx5.a + +# Sources. +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c +ifneq ($(filter y,$(CONFIG_RTE_ARCH_X86_64) \ + $(CONFIG_RTE_ARCH_PPC_64) \ + $(CONFIG_RTE_ARCH_ARM64)),) +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxmode.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_meter.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mp.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_utils.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c + +# Basic CFLAGS. +CFLAGS += -O3 +CFLAGS += -std=c11 -Wall -Wextra +CFLAGS += -g +CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5 +CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5 +CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5 +CFLAGS += -D_BSD_SOURCE +CFLAGS += -D_DEFAULT_SOURCE +CFLAGS += -D_XOPEN_SOURCE=600 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -Wno-strict-prototypes +LDLIBS += -lrte_common_mlx5 +LDLIBS += -lm +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs +LDLIBS += -lrte_bus_pci + +# A few warnings cannot be avoided in external headers. +CFLAGS += -Wno-error=cast-qual + +EXPORT_MAP := rte_pmd_mlx5_version.map + +# DEBUG which is usually provided on the command-line may enable +# CONFIG_RTE_LIBRTE_MLX5_DEBUG. +ifeq ($(DEBUG),1) +CONFIG_RTE_LIBRTE_MLX5_DEBUG := y +endif + +# User-defined CFLAGS. +ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DEBUG),y) +CFLAGS += -pedantic +ifneq ($(CONFIG_RTE_TOOLCHAIN_ICC),y) +CFLAGS += -DPEDANTIC +endif +AUTO_CONFIG_CFLAGS += -Wno-pedantic +else +CFLAGS += -UPEDANTIC +endif + +include $(RTE_SDK)/mk/rte.lib.mk + diff --git a/src/spdk/dpdk/drivers/net/mlx5/meson.build b/src/spdk/dpdk/drivers/net/mlx5/meson.build new file mode 100644 index 000000000..928663af7 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/meson.build @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2018 6WIND S.A. 
+# Copyright 2018 Mellanox Technologies, Ltd + +if not is_linux + build = false + reason = 'only supported on Linux' + subdir_done() +endif + +deps += ['hash', 'common_mlx5'] +sources = files( + 'mlx5.c', + 'mlx5_ethdev.c', + 'mlx5_flow.c', + 'mlx5_flow_meter.c', + 'mlx5_flow_dv.c', + 'mlx5_flow_verbs.c', + 'mlx5_mac.c', + 'mlx5_mr.c', + 'mlx5_rss.c', + 'mlx5_rxmode.c', + 'mlx5_rxq.c', + 'mlx5_rxtx.c', + 'mlx5_mp.c', + 'mlx5_stats.c', + 'mlx5_trigger.c', + 'mlx5_txq.c', + 'mlx5_vlan.c', + 'mlx5_utils.c', + 'mlx5_socket.c', +) +if (dpdk_conf.has('RTE_ARCH_X86_64') + or dpdk_conf.has('RTE_ARCH_ARM64') + or dpdk_conf.has('RTE_ARCH_PPC_64')) + sources += files('mlx5_rxtx_vec.c') +endif +cflags_options = [ + '-std=c11', + '-Wno-strict-prototypes', + '-D_BSD_SOURCE', + '-D_DEFAULT_SOURCE', + '-D_XOPEN_SOURCE=600' +] +foreach option:cflags_options + if cc.has_argument(option) + cflags += option + endif +endforeach +if get_option('buildtype').contains('debug') + cflags += [ '-pedantic', '-DPEDANTIC' ] +else + cflags += [ '-UPEDANTIC' ] +endif diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c new file mode 100644 index 000000000..5589772eb --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.c @@ -0,0 +1,3814 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <unistd.h> +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <errno.h> +#include <net/if.h> +#include <sys/mman.h> +#include <linux/rtnetlink.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_ethdev_pci.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_common.h> +#include <rte_kvargs.h> +#include <rte_rwlock.h> +#include <rte_spinlock.h> +#include <rte_string_fns.h> +#include <rte_alarm.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> +#include <mlx5_common_mp.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" +#include "mlx5_mr.h" +#include "mlx5_flow.h" +#include "rte_pmd_mlx5.h" + +/* Device parameter to enable RX completion queue compression. */ +#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" + +/* Device parameter to enable RX completion entry padding to 128B. */ +#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en" + +/* Device parameter to enable padding Rx packet to cacheline size. */ +#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en" + +/* Device parameter to enable Multi-Packet Rx queue. */ +#define MLX5_RX_MPRQ_EN "mprq_en" + +/* Device parameter to configure log 2 of the number of strides for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" + +/* Device parameter to configure log 2 of the stride size for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size" + +/* Device parameter to limit the size of memcpy'd packet for MPRQ. */ +#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" + +/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ +#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" + +/* Device parameter to configure inline send. 
Deprecated, ignored.*/ +#define MLX5_TXQ_INLINE "txq_inline" + +/* Device parameter to limit packet size to inline with ordinary SEND. */ +#define MLX5_TXQ_INLINE_MAX "txq_inline_max" + +/* Device parameter to configure minimal data size to inline. */ +#define MLX5_TXQ_INLINE_MIN "txq_inline_min" + +/* Device parameter to limit packet size to inline with Enhanced MPW. */ +#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw" + +/* + * Device parameter to configure the number of TX queues threshold for + * enabling inline send. + */ +#define MLX5_TXQS_MIN_INLINE "txqs_min_inline" + +/* + * Device parameter to configure the number of TX queues threshold for + * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines). + */ +#define MLX5_TXQS_MAX_VEC "txqs_max_vec" + +/* Device parameter to enable multi-packet send WQEs. */ +#define MLX5_TXQ_MPW_EN "txq_mpw_en" + +/* + * Device parameter to force doorbell register mapping + * to non-cahed region eliminating the extra write memory barrier. + */ +#define MLX5_TX_DB_NC "tx_db_nc" + +/* + * Device parameter to include 2 dsegs in the title WQEBB. + * Deprecated, ignored. + */ +#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" + +/* + * Device parameter to limit the size of inlining packet. + * Deprecated, ignored. + */ +#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" + +/* + * Device parameter to enable hardware Tx vector. + * Deprecated, ignored (no vectorized Tx routines anymore). + */ +#define MLX5_TX_VEC_EN "tx_vec_en" + +/* Device parameter to enable hardware Rx vector. */ +#define MLX5_RX_VEC_EN "rx_vec_en" + +/* Allow L3 VXLAN flow creation. */ +#define MLX5_L3_VXLAN_EN "l3_vxlan_en" + +/* Activate DV E-Switch flow steering. */ +#define MLX5_DV_ESW_EN "dv_esw_en" + +/* Activate DV flow steering. */ +#define MLX5_DV_FLOW_EN "dv_flow_en" + +/* Enable extensive flow metadata support. */ +#define MLX5_DV_XMETA_EN "dv_xmeta_en" + +/* Activate Netlink support in VF mode. */ +#define MLX5_VF_NL_EN "vf_nl_en" + +/* Enable extending memsegs when creating a MR. */ +#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en" + +/* Select port representors to instantiate. */ +#define MLX5_REPRESENTOR "representor" + +/* Device parameter to configure the maximum number of dump files per queue. */ +#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num" + +/* Configure timeout of LRO session (in microseconds). */ +#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec" + +/* + * Device parameter to configure the total data buffer size for a single + * hairpin queue (logarithm value). + */ +#define MLX5_HP_BUF_SIZE "hp_buf_log_sz" + +#ifndef HAVE_IBV_MLX5_MOD_MPW +#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) +#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) +#endif + +#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP +#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) +#endif + +static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data *mlx5_shared_data; + +/* Spinlock for mlx5_shared_data allocation. */ +static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/* Process local data for secondary processes. */ +static struct mlx5_local_data mlx5_local_data; + +/** Driver-specific log messages type. */ +int mlx5_logtype; + +/** Data associated with devices to spawn. */ +struct mlx5_dev_spawn_data { + uint32_t ifindex; /**< Network interface index. */ + uint32_t max_port; /**< IB device maximal port index. 
*/ + uint32_t ibv_port; /**< IB device physical port index. */ + int pf_bond; /**< bonding device PF index. < 0 - no bonding */ + struct mlx5_switch_info info; /**< Switch information. */ + struct ibv_device *ibv_dev; /**< Associated IB device. */ + struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ + struct rte_pci_device *pci_dev; /**< Backend PCI device. */ +}; + +static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); +static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + { + .size = sizeof(struct mlx5_flow_dv_encap_decap_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_encap_decap_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_push_vlan_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_tag_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_tag_ipool", + }, + { + .size = sizeof(struct mlx5_flow_dv_port_id_action_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_port_id_ipool", + }, + { + .size = sizeof(struct mlx5_flow_tbl_data_entry), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_jump_ipool", + }, +#endif + { + .size = sizeof(struct mlx5_flow_meter), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_meter_ipool", + }, + { + .size = sizeof(struct mlx5_flow_mreg_copy_resource), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_mcp_ipool", + }, + { + .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_hrxq_ipool", + }, + { + .size = sizeof(struct mlx5_flow_handle), + .trunk_size = 64, + .grow_trunk = 3, + .grow_shift = 2, + .need_lock = 0, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "mlx5_flow_handle_ipool", + }, + { + .size = sizeof(struct rte_flow), + .trunk_size = 4096, + .need_lock = 1, + .release_mem_en = 1, + .malloc = rte_malloc_socket, + .free = rte_free, + .type = "rte_flow_ipool", + }, +}; + + +#define MLX5_FLOW_MIN_ID_POOL_SIZE 512 +#define MLX5_ID_GENERATION_ARRAY_FACTOR 16 + +#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096 +#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192 + +/** + * Allocate ID pool structure. + * + * @param[in] max_id + * The maximum id can be allocated from the pool. + * + * @return + * Pointer to pool object, NULL value otherwise. 
+ */ +struct mlx5_flow_id_pool * +mlx5_flow_id_pool_alloc(uint32_t max_id) +{ + struct mlx5_flow_id_pool *pool; + void *mem; + + pool = rte_zmalloc("id pool allocation", sizeof(*pool), + RTE_CACHE_LINE_SIZE); + if (!pool) { + DRV_LOG(ERR, "can't allocate id pool"); + rte_errno = ENOMEM; + return NULL; + } + mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t), + RTE_CACHE_LINE_SIZE); + if (!mem) { + DRV_LOG(ERR, "can't allocate mem for id pool"); + rte_errno = ENOMEM; + goto error; + } + pool->free_arr = mem; + pool->curr = pool->free_arr; + pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE; + pool->base_index = 0; + pool->max_id = max_id; + return pool; +error: + rte_free(pool); + return NULL; +} + +/** + * Release ID pool structure. + * + * @param[in] pool + * Pointer to flow id pool object to free. + */ +void +mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool) +{ + rte_free(pool->free_arr); + rte_free(pool); +} + +/** + * Generate ID. + * + * @param[in] pool + * Pointer to flow id pool. + * @param[out] id + * The generated ID. + * + * @return + * 0 on success, error value otherwise. + */ +uint32_t +mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id) +{ + if (pool->curr == pool->free_arr) { + if (pool->base_index == pool->max_id) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "no free id"); + return -rte_errno; + } + *id = ++pool->base_index; + return 0; + } + *id = *(--pool->curr); + return 0; +} + +/** + * Release ID. + * + * @param[in] pool + * Pointer to flow id pool. + * @param[out] id + * The generated ID. + * + * @return + * 0 on success, error value otherwise. + */ +uint32_t +mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id) +{ + uint32_t size; + uint32_t size2; + void *mem; + + if (pool->curr == pool->last) { + size = pool->curr - pool->free_arr; + size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR; + MLX5_ASSERT(size2 > size); + mem = rte_malloc("", size2 * sizeof(uint32_t), 0); + if (!mem) { + DRV_LOG(ERR, "can't allocate mem for id pool"); + rte_errno = ENOMEM; + return -rte_errno; + } + memcpy(mem, pool->free_arr, size * sizeof(uint32_t)); + rte_free(pool->free_arr); + pool->free_arr = mem; + pool->curr = pool->free_arr + size; + pool->last = pool->free_arr + size2; + } + *pool->curr = id; + pool->curr++; + return 0; +} + +/** + * Initialize the shared aging list information per port. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +static void +mlx5_flow_aging_init(struct mlx5_ibv_shared *sh) +{ + uint32_t i; + struct mlx5_age_info *age_info; + + for (i = 0; i < sh->max_port; i++) { + age_info = &sh->port[i].age_info; + age_info->flags = 0; + TAILQ_INIT(&age_info->aged_counters); + rte_spinlock_init(&age_info->aged_sl); + MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER); + } +} + +/** + * Initialize the counters management structure. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free + */ +static void +mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh) +{ + int i; + + memset(&sh->cmng, 0, sizeof(sh->cmng)); + TAILQ_INIT(&sh->cmng.flow_counters); + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) { + TAILQ_INIT(&sh->cmng.ccont[i].pool_list); + rte_spinlock_init(&sh->cmng.ccont[i].resize_sl); + } +} + +/** + * Destroy all the resources allocated for a counter memory management. + * + * @param[in] mng + * Pointer to the memory management structure. 
+ */ +static void +mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng) +{ + uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data; + + LIST_REMOVE(mng, next); + claim_zero(mlx5_devx_cmd_destroy(mng->dm)); + claim_zero(mlx5_glue->devx_umem_dereg(mng->umem)); + rte_free(mem); +} + +/** + * Close and release all the resources of the counters management. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free. + */ +static void +mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh) +{ + struct mlx5_counter_stats_mem_mng *mng; + int i; + int j; + int retries = 1024; + + rte_errno = 0; + while (--retries) { + rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh); + if (rte_errno != EINPROGRESS) + break; + rte_pause(); + } + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) { + struct mlx5_flow_counter_pool *pool; + uint32_t batch = !!(i > 1); + + if (!sh->cmng.ccont[i].pools) + continue; + pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); + while (pool) { + if (batch && pool->min_dcs) + claim_zero(mlx5_devx_cmd_destroy + (pool->min_dcs)); + for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) { + if (MLX5_POOL_GET_CNT(pool, j)->action) + claim_zero + (mlx5_glue->destroy_flow_action + (MLX5_POOL_GET_CNT + (pool, j)->action)); + if (!batch && MLX5_GET_POOL_CNT_EXT + (pool, j)->dcs) + claim_zero(mlx5_devx_cmd_destroy + (MLX5_GET_POOL_CNT_EXT + (pool, j)->dcs)); + } + TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next); + rte_free(pool); + pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list); + } + rte_free(sh->cmng.ccont[i].pools); + } + mng = LIST_FIRST(&sh->cmng.mem_mngs); + while (mng) { + mlx5_flow_destroy_counter_stat_mem_mng(mng); + mng = LIST_FIRST(&sh->cmng.mem_mngs); + } + memset(&sh->cmng, 0, sizeof(sh->cmng)); +} + +/** + * Initialize the flow resources' indexed mempool. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * @param[in] sh + * Pointer to user dev config. + */ +static void +mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh, + const struct mlx5_dev_config *config __rte_unused) +{ + uint8_t i; + +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* + * While DV is supported, user chooses the verbs mode, + * the mlx5 flow handle size is different with the + * MLX5_FLOW_HANDLE_VERBS_SIZE. + */ + if (!config->dv_flow_en) + mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size = + MLX5_FLOW_HANDLE_VERBS_SIZE; +#endif + for (i = 0; i < MLX5_IPOOL_MAX; ++i) + sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]); +} + +/** + * Release the flow resources' indexed mempool. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +static void +mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh) +{ + uint8_t i; + + for (i = 0; i < MLX5_IPOOL_MAX; ++i) + mlx5_ipool_destroy(sh->ipool[i]); +} + +/** + * Extract pdn of PD object using DV API. + * + * @param[in] pd + * Pointer to the verbs PD object. + * @param[out] pdn + * Pointer to the PD object number variable. + * + * @return + * 0 on success, error value otherwise. 
+ */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +static int +mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused) +{ + struct mlx5dv_obj obj; + struct mlx5dv_pd pd_info; + int ret = 0; + + obj.pd.in = pd; + obj.pd.out = &pd_info; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD); + if (ret) { + DRV_LOG(DEBUG, "Fail to get PD object info"); + return ret; + } + *pdn = pd_info.pdn; + return 0; +} +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ + +static int +mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config) +{ + char *env; + int value; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* Get environment variable to store. */ + env = getenv(MLX5_SHUT_UP_BF); + value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; + if (config->dbnc == MLX5_ARG_UNSET) + setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1); + else + setenv(MLX5_SHUT_UP_BF, + config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1); + return value; +} + +static void +mlx5_restore_doorbell_mapping_env(int value) +{ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* Restore the original environment variable state. */ + if (value == MLX5_ARG_UNSET) + unsetenv(MLX5_SHUT_UP_BF); + else + setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); +} + +/** + * Allocate shared IB device context. If there is multiport device the + * master and representors will share this context, if there is single + * port dedicated IB device, the context will be used by only given + * port due to unification. + * + * Routine first searches the context for the specified IB device name, + * if found the shared context assumed and reference counter is incremented. + * If no context found the new one is created and initialized with specified + * IB device context and parameters. + * + * @param[in] spawn + * Pointer to the IB device attributes (name, port, etc). + * @param[in] config + * Pointer to device configuration structure. + * + * @return + * Pointer to mlx5_ibv_shared object on success, + * otherwise NULL and rte_errno is set. + */ +static struct mlx5_ibv_shared * +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn, + const struct mlx5_dev_config *config) +{ + struct mlx5_ibv_shared *sh; + int dbmap_env; + int err = 0; + uint32_t i; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_devx_tis_attr tis_attr = { 0 }; +#endif + + MLX5_ASSERT(spawn); + /* Secondary process should not create the shared context. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + pthread_mutex_lock(&mlx5_ibv_list_mutex); + /* Search for IB context by device name. */ + LIST_FOREACH(sh, &mlx5_ibv_list, next) { + if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) { + sh->refcnt++; + goto exit; + } + } + /* No device found, we have to create new shared context. */ + MLX5_ASSERT(spawn->max_port); + sh = rte_zmalloc("ethdev shared ib context", + sizeof(struct mlx5_ibv_shared) + + spawn->max_port * + sizeof(struct mlx5_ibv_shared_port), + RTE_CACHE_LINE_SIZE); + if (!sh) { + DRV_LOG(ERR, "shared context allocation failure"); + rte_errno = ENOMEM; + goto exit; + } + /* + * Configure environment variable "MLX5_BF_SHUT_UP" + * before the device creation. The rdma_core library + * checks the variable at device creation and + * stores the result internally. + */ + dbmap_env = mlx5_config_doorbell_mapping_env(config); + /* Try to open IB device with DV first, then usual Verbs. 
*/ + errno = 0; + sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev); + if (sh->ctx) { + sh->devx = 1; + DRV_LOG(DEBUG, "DevX is supported"); + /* The device is created, no need for environment. */ + mlx5_restore_doorbell_mapping_env(dbmap_env); + } else { + /* The environment variable is still configured. */ + sh->ctx = mlx5_glue->open_device(spawn->ibv_dev); + err = errno ? errno : ENODEV; + /* + * The environment variable is not needed anymore, + * all device creation attempts are completed. + */ + mlx5_restore_doorbell_mapping_env(dbmap_env); + if (!sh->ctx) + goto error; + DRV_LOG(DEBUG, "DevX is NOT supported"); + } + err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr); + if (err) { + DRV_LOG(DEBUG, "ibv_query_device_ex() failed"); + goto error; + } + sh->refcnt = 1; + sh->max_port = spawn->max_port; + strncpy(sh->ibdev_name, sh->ctx->device->name, + sizeof(sh->ibdev_name)); + strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path, + sizeof(sh->ibdev_path)); + pthread_mutex_init(&sh->intr_mutex, NULL); + /* + * Setting port_id to max unallowed value means + * there is no interrupt subhandler installed for + * the given port index i. + */ + for (i = 0; i < sh->max_port; i++) { + sh->port[i].ih_port_id = RTE_MAX_ETHPORTS; + sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS; + } + sh->pd = mlx5_glue->alloc_pd(sh->ctx); + if (sh->pd == NULL) { + DRV_LOG(ERR, "PD allocation failure"); + err = ENOMEM; + goto error; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (sh->devx) { + err = mlx5_get_pdn(sh->pd, &sh->pdn); + if (err) { + DRV_LOG(ERR, "Fail to extract pdn from PD"); + goto error; + } + sh->td = mlx5_devx_cmd_create_td(sh->ctx); + if (!sh->td) { + DRV_LOG(ERR, "TD allocation failure"); + err = ENOMEM; + goto error; + } + tis_attr.transport_domain = sh->td->id; + sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr); + if (!sh->tis) { + DRV_LOG(ERR, "TIS allocation failure"); + err = ENOMEM; + goto error; + } + } + sh->flow_id_pool = mlx5_flow_id_pool_alloc + ((1 << HAIRPIN_FLOW_ID_BITS) - 1); + if (!sh->flow_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; + goto error; + } +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + * + * At this point the device is not added to the memory + * event list yet, context is just being created. + */ + err = mlx5_mr_btree_init(&sh->share_cache.cache, + MLX5_MR_BTREE_CACHE_N * 2, + spawn->pci_dev->device.numa_node); + if (err) { + err = rte_errno; + goto error; + } + mlx5_flow_aging_init(sh); + mlx5_flow_counters_mng_init(sh); + mlx5_flow_ipool_create(sh, config); + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, + sh, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + /* Add context to the global device list. 
*/ + LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next); +exit: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); + return sh; +error: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); + MLX5_ASSERT(sh); + if (sh->tis) + claim_zero(mlx5_devx_cmd_destroy(sh->tis)); + if (sh->td) + claim_zero(mlx5_devx_cmd_destroy(sh->td)); + if (sh->pd) + claim_zero(mlx5_glue->dealloc_pd(sh->pd)); + if (sh->ctx) + claim_zero(mlx5_glue->close_device(sh->ctx)); + if (sh->flow_id_pool) + mlx5_flow_id_pool_release(sh->flow_id_pool); + rte_free(sh); + MLX5_ASSERT(err > 0); + rte_errno = err; + return NULL; +} + +/** + * Free shared IB device context. Decrement counter and if zero free + * all allocated resources and close handles. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object to free + */ +static void +mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh) +{ + pthread_mutex_lock(&mlx5_ibv_list_mutex); +#ifdef RTE_LIBRTE_MLX5_DEBUG + /* Check the object presence in the list. */ + struct mlx5_ibv_shared *lctx; + + LIST_FOREACH(lctx, &mlx5_ibv_list, next) + if (lctx == sh) + break; + MLX5_ASSERT(lctx); + if (lctx != sh) { + DRV_LOG(ERR, "Freeing non-existing shared IB context"); + goto exit; + } +#endif + MLX5_ASSERT(sh); + MLX5_ASSERT(sh->refcnt); + /* Secondary process should not free the shared context. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (--sh->refcnt) + goto exit; + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(sh, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + /* Release created Memory Regions. */ + mlx5_mr_release_cache(&sh->share_cache); + /* Remove context from the global device list. */ + LIST_REMOVE(sh, next); + /* + * Ensure there is no async event handler installed. + * Only primary process handles async device events. + **/ + mlx5_flow_counters_mng_close(sh); + mlx5_flow_ipool_destroy(sh); + MLX5_ASSERT(!sh->intr_cnt); + if (sh->intr_cnt) + mlx5_intr_callback_unregister + (&sh->intr_handle, mlx5_dev_interrupt_handler, sh); +#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT + if (sh->devx_intr_cnt) { + if (sh->intr_handle_devx.fd) + rte_intr_callback_unregister(&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, sh); + if (sh->devx_comp) + mlx5dv_devx_destroy_cmd_comp(sh->devx_comp); + } +#endif + pthread_mutex_destroy(&sh->intr_mutex); + if (sh->pd) + claim_zero(mlx5_glue->dealloc_pd(sh->pd)); + if (sh->tis) + claim_zero(mlx5_devx_cmd_destroy(sh->tis)); + if (sh->td) + claim_zero(mlx5_devx_cmd_destroy(sh->td)); + if (sh->ctx) + claim_zero(mlx5_glue->close_device(sh->ctx)); + if (sh->flow_id_pool) + mlx5_flow_id_pool_release(sh->flow_id_pool); + rte_free(sh); +exit: + pthread_mutex_unlock(&mlx5_ibv_list_mutex); +} + +/** + * Destroy table hash list and all the root entries per domain. + * + * @param[in] priv + * Pointer to the private device data structure. 
+ */ +static void +mlx5_free_table_hash_list(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_data_entry *tbl_data; + union mlx5_flow_tbl_key table_key = { + { + .table_id = 0, + .reserved = 0, + .domain = 0, + .direction = 0, + } + }; + struct mlx5_hlist_entry *pos; + + if (!sh->flow_tbls) + return; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + table_key.direction = 1; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + table_key.direction = 0; + table_key.domain = 1; + pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64); + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + MLX5_ASSERT(tbl_data); + mlx5_hlist_remove(sh->flow_tbls, pos); + rte_free(tbl_data); + } + mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL); +} + +/** + * Initialize flow table hash list and create the root tables entry + * for each domain. + * + * @param[in] priv + * Pointer to the private device data structure. + * + * @return + * Zero on success, positive error code otherwise. + */ +static int +mlx5_alloc_table_hash_list(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + char s[MLX5_HLIST_NAMESIZE]; + int err = 0; + + MLX5_ASSERT(sh); + snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name); + sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE); + if (!sh->flow_tbls) { + DRV_LOG(ERR, "flow tables with hash creation failed.\n"); + err = ENOMEM; + return err; + } +#ifndef HAVE_MLX5DV_DR + /* + * In case we have not DR support, the zero tables should be created + * because DV expect to see them even if they cannot be created by + * RDMA-CORE. + */ + union mlx5_flow_tbl_key table_key = { + { + .table_id = 0, + .reserved = 0, + .domain = 0, + .direction = 0, + } + }; + struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL, + sizeof(*tbl_data), 0); + + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + table_key.direction = 1; + tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0); + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + table_key.direction = 0; + table_key.domain = 1; + tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0); + if (!tbl_data) { + err = ENOMEM; + goto error; + } + tbl_data->entry.key = table_key.v64; + err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry); + if (err) + goto error; + rte_atomic32_init(&tbl_data->tbl.refcnt); + rte_atomic32_inc(&tbl_data->tbl.refcnt); + return err; +error: + mlx5_free_table_hash_list(priv); +#endif /* HAVE_MLX5DV_DR */ + return err; +} + +/** + * Initialize DR related data within private structure. + * Routine checks the reference counter and does actual + * resources creation/initialization only if counter is zero. 
+ * + * @param[in] priv + * Pointer to the private device data structure. + * + * @return + * Zero on success, positive error code otherwise. + */ +static int +mlx5_alloc_shared_dr(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh = priv->sh; + char s[MLX5_HLIST_NAMESIZE]; + int err = 0; + + if (!sh->flow_tbls) + err = mlx5_alloc_table_hash_list(priv); + else + DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n", + (void *)sh->flow_tbls); + if (err) + return err; + /* Create tags hash list table. */ + snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name); + sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE); + if (!sh->tag_table) { + DRV_LOG(ERR, "tags with hash creation failed.\n"); + err = ENOMEM; + goto error; + } +#ifdef HAVE_MLX5DV_DR + void *domain; + + if (sh->dv_refcnt) { + /* Shared DV/DR structures is already initialized. */ + sh->dv_refcnt++; + priv->dr_shared = 1; + return 0; + } + /* Reference counter is zero, we should initialize structures. */ + domain = mlx5_glue->dr_create_domain(sh->ctx, + MLX5DV_DR_DOMAIN_TYPE_NIC_RX); + if (!domain) { + DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + sh->rx_domain = domain; + domain = mlx5_glue->dr_create_domain(sh->ctx, + MLX5DV_DR_DOMAIN_TYPE_NIC_TX); + if (!domain) { + DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + pthread_mutex_init(&sh->dv_mutex, NULL); + sh->tx_domain = domain; +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (priv->config.dv_esw_en) { + domain = mlx5_glue->dr_create_domain + (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB); + if (!domain) { + DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed"); + err = errno; + goto error; + } + sh->fdb_domain = domain; + sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop(); + } +#endif + sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan(); +#endif /* HAVE_MLX5DV_DR */ + sh->dv_refcnt++; + priv->dr_shared = 1; + return 0; +error: + /* Rollback the created objects. */ + if (sh->rx_domain) { + mlx5_glue->dr_destroy_domain(sh->rx_domain); + sh->rx_domain = NULL; + } + if (sh->tx_domain) { + mlx5_glue->dr_destroy_domain(sh->tx_domain); + sh->tx_domain = NULL; + } + if (sh->fdb_domain) { + mlx5_glue->dr_destroy_domain(sh->fdb_domain); + sh->fdb_domain = NULL; + } + if (sh->esw_drop_action) { + mlx5_glue->destroy_flow_action(sh->esw_drop_action); + sh->esw_drop_action = NULL; + } + if (sh->pop_vlan_action) { + mlx5_glue->destroy_flow_action(sh->pop_vlan_action); + sh->pop_vlan_action = NULL; + } + if (sh->tag_table) { + /* tags should be destroyed with flow before. */ + mlx5_hlist_destroy(sh->tag_table, NULL, NULL); + sh->tag_table = NULL; + } + mlx5_free_table_hash_list(priv); + return err; +} + +/** + * Destroy DR related data within private structure. + * + * @param[in] priv + * Pointer to the private device data structure. 
+ */ +static void +mlx5_free_shared_dr(struct mlx5_priv *priv) +{ + struct mlx5_ibv_shared *sh; + + if (!priv->dr_shared) + return; + priv->dr_shared = 0; + sh = priv->sh; + MLX5_ASSERT(sh); +#ifdef HAVE_MLX5DV_DR + MLX5_ASSERT(sh->dv_refcnt); + if (sh->dv_refcnt && --sh->dv_refcnt) + return; + if (sh->rx_domain) { + mlx5_glue->dr_destroy_domain(sh->rx_domain); + sh->rx_domain = NULL; + } + if (sh->tx_domain) { + mlx5_glue->dr_destroy_domain(sh->tx_domain); + sh->tx_domain = NULL; + } +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (sh->fdb_domain) { + mlx5_glue->dr_destroy_domain(sh->fdb_domain); + sh->fdb_domain = NULL; + } + if (sh->esw_drop_action) { + mlx5_glue->destroy_flow_action(sh->esw_drop_action); + sh->esw_drop_action = NULL; + } +#endif + if (sh->pop_vlan_action) { + mlx5_glue->destroy_flow_action(sh->pop_vlan_action); + sh->pop_vlan_action = NULL; + } + pthread_mutex_destroy(&sh->dv_mutex); +#endif /* HAVE_MLX5DV_DR */ + if (sh->tag_table) { + /* tags should be destroyed with flow before. */ + mlx5_hlist_destroy(sh->tag_table, NULL, NULL); + sh->tag_table = NULL; + } + mlx5_free_table_hash_list(priv); +} + +/** + * Initialize shared data between primary and secondary process. + * + * A memzone is reserved by primary process and secondary processes attach to + * the memzone. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_init_shared_data(void) +{ + const struct rte_memzone *mz; + int ret = 0; + + rte_spinlock_lock(&mlx5_shared_data_lock); + if (mlx5_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate shared memory. */ + mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, + sizeof(*mlx5_shared_data), + SOCKET_ID_ANY, 0); + if (mz == NULL) { + DRV_LOG(ERR, + "Cannot allocate mlx5 shared data"); + ret = -rte_errno; + goto error; + } + mlx5_shared_data = mz->addr; + memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data)); + rte_spinlock_init(&mlx5_shared_data->lock); + } else { + /* Lookup allocated shared memory. */ + mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); + if (mz == NULL) { + DRV_LOG(ERR, + "Cannot attach mlx5 shared data"); + ret = -rte_errno; + goto error; + } + mlx5_shared_data = mz->addr; + memset(&mlx5_local_data, 0, sizeof(mlx5_local_data)); + } + } +error: + rte_spinlock_unlock(&mlx5_shared_data_lock); + return ret; +} + +/** + * Retrieve integer value from environment variable. + * + * @param[in] name + * Environment variable name. + * + * @return + * Integer value, 0 if the variable is not set. + */ +int +mlx5_getenv_int(const char *name) +{ + const char *val = getenv(name); + + if (val == NULL) + return 0; + return atoi(val); +} + +/** + * Verbs callback to allocate a memory. This function should allocate the space + * according to the size provided residing inside a huge page. + * Please note that all allocation must respect the alignment from libmlx5 + * (i.e. currently sysconf(_SC_PAGESIZE)). + * + * @param[in] size + * The size in bytes of the memory to allocate. + * @param[in] data + * A pointer to the callback data. + * + * @return + * Allocated buffer, NULL otherwise and rte_errno is set. 
+ */ +static void * +mlx5_alloc_verbs_buf(size_t size, void *data) +{ + struct mlx5_priv *priv = data; + void *ret; + size_t alignment = sysconf(_SC_PAGESIZE); + unsigned int socket = SOCKET_ID_ANY; + + if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) { + const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; + + socket = ctrl->socket; + } else if (priv->verbs_alloc_ctx.type == + MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) { + const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj; + + socket = ctrl->socket; + } + MLX5_ASSERT(data != NULL); + ret = rte_malloc_socket(__func__, size, alignment, socket); + if (!ret && size) + rte_errno = ENOMEM; + return ret; +} + +/** + * Verbs callback to free a memory. + * + * @param[in] ptr + * A pointer to the memory to free. + * @param[in] data + * A pointer to the callback data. + */ +static void +mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) +{ + MLX5_ASSERT(data != NULL); + rte_free(ptr); +} + +/** + * DPDK callback to add udp tunnel port + * + * @param[in] dev + * A pointer to eth_dev + * @param[in] udp_tunnel + * A pointer to udp tunnel + * + * @return + * 0 on valid udp ports and tunnels, -ENOTSUP otherwise. + */ +int +mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_udp_tunnel *udp_tunnel) +{ + MLX5_ASSERT(udp_tunnel != NULL); + if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN && + udp_tunnel->udp_port == 4789) + return 0; + if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE && + udp_tunnel->udp_port == 4790) + return 0; + return -ENOTSUP; +} + +/** + * Initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_proc_priv_init(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_proc_priv *ppriv; + size_t ppriv_size; + + /* + * UAR register table follows the process private structure. BlueFlame + * registers for Tx queues are stored in the table. + */ + ppriv_size = + sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *); + ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size, + RTE_CACHE_LINE_SIZE, dev->device->numa_node); + if (!ppriv) { + rte_errno = ENOMEM; + return -rte_errno; + } + ppriv->uar_table_sz = ppriv_size; + dev->process_private = ppriv; + return 0; +} + +/** + * Un-initialize process private data structure. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_proc_priv_uninit(struct rte_eth_dev *dev) +{ + if (!dev->process_private) + return; + rte_free(dev->process_private); + dev->process_private = NULL; +} + +/** + * DPDK callback to close the device. + * + * Destroy all queues and objects, free memory. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_dev_close(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret; + + DRV_LOG(DEBUG, "port %u closing device \"%s\"", + dev->data->port_id, + ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : "")); + /* In case mlx5_dev_stop() has not been called. */ + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_dev_interrupt_handler_devx_uninstall(dev); + /* + * If default mreg copy action is removed at the stop stage, + * the search will return none and nothing will be done anymore. 
+ */ + mlx5_flow_stop_default(dev); + mlx5_traffic_disable(dev); + /* + * If all the flows are already flushed in the device stop stage, + * then this will return directly without any action. + */ + mlx5_flow_list_flush(dev, &priv->flows, true); + mlx5_flow_meter_flush(dev, NULL); + /* Free the intermediate buffers for flow creation. */ + mlx5_flow_free_intermediate(dev); + /* Prevent crashes when queues are still in use. */ + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx5_mp_req_stop_rxtx(dev); + if (priv->rxqs != NULL) { + /* XXX race condition if mlx5_rx_burst() is still running. */ + usleep(1000); + for (i = 0; (i != priv->rxqs_n); ++i) + mlx5_rxq_release(dev, i); + priv->rxqs_n = 0; + priv->rxqs = NULL; + } + if (priv->txqs != NULL) { + /* XXX race condition if mlx5_tx_burst() is still running. */ + usleep(1000); + for (i = 0; (i != priv->txqs_n); ++i) + mlx5_txq_release(dev, i); + priv->txqs_n = 0; + priv->txqs = NULL; + } + mlx5_proc_priv_uninit(dev); + if (priv->mreg_cp_tbl) + mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); + mlx5_mprq_free_mp(dev); + mlx5_free_shared_dr(priv); + if (priv->rss_conf.rss_key != NULL) + rte_free(priv->rss_conf.rss_key); + if (priv->reta_idx != NULL) + rte_free(priv->reta_idx); + if (priv->config.vf) + mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev), + dev->data->mac_addrs, + MLX5_MAX_MAC_ADDRESSES, priv->mac_own); + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->vmwa_context) + mlx5_vlan_vmwa_exit(priv->vmwa_context); + ret = mlx5_hrxq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some hash Rx queue still remain", + dev->data->port_id); + ret = mlx5_ind_table_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some indirection table still remain", + dev->data->port_id); + ret = mlx5_rxq_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Rx queue objects still remain", + dev->data->port_id); + ret = mlx5_rxq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Rx queues still remain", + dev->data->port_id); + ret = mlx5_txq_obj_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", + dev->data->port_id); + ret = mlx5_txq_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some Tx queues still remain", + dev->data->port_id); + ret = mlx5_flow_verify(dev); + if (ret) + DRV_LOG(WARNING, "port %u some flows still remain", + dev->data->port_id); + if (priv->sh) { + /* + * Free the shared context in last turn, because the cleanup + * routines above may use some shared fields, like + * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieveing + * ifindex if Netlink fails. + */ + mlx5_free_shared_ibctx(priv->sh); + priv->sh = NULL; + } + if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + unsigned int c = 0; + uint16_t port_id; + + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->domain_id != priv->domain_id || + &rte_eth_devices[port_id] == dev) + continue; + ++c; + break; + } + if (!c) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + } + memset(priv, 0, sizeof(*priv)); + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + /* + * Reset mac_addrs to NULL such that it is not freed as part of + * rte_eth_dev_release_port(). 
mac_addrs is part of dev_private so + * it is freed when dev_private is freed. + */ + dev->data->mac_addrs = NULL; +} + +const struct eth_dev_ops mlx5_dev_ops = { + .dev_configure = mlx5_dev_configure, + .dev_start = mlx5_dev_start, + .dev_stop = mlx5_dev_stop, + .dev_set_link_down = mlx5_set_link_down, + .dev_set_link_up = mlx5_set_link_up, + .dev_close = mlx5_dev_close, + .promiscuous_enable = mlx5_promiscuous_enable, + .promiscuous_disable = mlx5_promiscuous_disable, + .allmulticast_enable = mlx5_allmulticast_enable, + .allmulticast_disable = mlx5_allmulticast_disable, + .link_update = mlx5_link_update, + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .read_clock = mlx5_read_clock, + .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, + .vlan_filter_set = mlx5_vlan_filter_set, + .rx_queue_setup = mlx5_rx_queue_setup, + .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, + .tx_queue_setup = mlx5_tx_queue_setup, + .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, + .rx_queue_release = mlx5_rx_queue_release, + .tx_queue_release = mlx5_tx_queue_release, + .flow_ctrl_get = mlx5_dev_get_flow_ctrl, + .flow_ctrl_set = mlx5_dev_set_flow_ctrl, + .mac_addr_remove = mlx5_mac_addr_remove, + .mac_addr_add = mlx5_mac_addr_add, + .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, + .mtu_set = mlx5_dev_set_mtu, + .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, + .vlan_offload_set = mlx5_vlan_offload_set, + .reta_update = mlx5_dev_rss_reta_update, + .reta_query = mlx5_dev_rss_reta_query, + .rss_hash_update = mlx5_rss_hash_update, + .rss_hash_conf_get = mlx5_rss_hash_conf_get, + .filter_ctrl = mlx5_dev_filter_ctrl, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .rx_queue_count = mlx5_rx_queue_count, + .rx_queue_intr_enable = mlx5_rx_intr_enable, + .rx_queue_intr_disable = mlx5_rx_intr_disable, + .is_removed = mlx5_is_removed, + .udp_tunnel_port_add = mlx5_udp_tunnel_port_add, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, + .hairpin_cap_get = mlx5_hairpin_cap_get, + .mtr_ops_get = mlx5_flow_meter_ops_get, +}; + +/* Available operations from secondary process. */ +static const struct eth_dev_ops mlx5_dev_sec_ops = { + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, +}; + +/* Available operations in flow isolated mode. 
*/ +const struct eth_dev_ops mlx5_dev_ops_isolate = { + .dev_configure = mlx5_dev_configure, + .dev_start = mlx5_dev_start, + .dev_stop = mlx5_dev_stop, + .dev_set_link_down = mlx5_set_link_down, + .dev_set_link_up = mlx5_set_link_up, + .dev_close = mlx5_dev_close, + .promiscuous_enable = mlx5_promiscuous_enable, + .promiscuous_disable = mlx5_promiscuous_disable, + .allmulticast_enable = mlx5_allmulticast_enable, + .allmulticast_disable = mlx5_allmulticast_disable, + .link_update = mlx5_link_update, + .stats_get = mlx5_stats_get, + .stats_reset = mlx5_stats_reset, + .xstats_get = mlx5_xstats_get, + .xstats_reset = mlx5_xstats_reset, + .xstats_get_names = mlx5_xstats_get_names, + .fw_version_get = mlx5_fw_version_get, + .dev_infos_get = mlx5_dev_infos_get, + .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get, + .vlan_filter_set = mlx5_vlan_filter_set, + .rx_queue_setup = mlx5_rx_queue_setup, + .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup, + .tx_queue_setup = mlx5_tx_queue_setup, + .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup, + .rx_queue_release = mlx5_rx_queue_release, + .tx_queue_release = mlx5_tx_queue_release, + .flow_ctrl_get = mlx5_dev_get_flow_ctrl, + .flow_ctrl_set = mlx5_dev_set_flow_ctrl, + .mac_addr_remove = mlx5_mac_addr_remove, + .mac_addr_add = mlx5_mac_addr_add, + .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, + .mtu_set = mlx5_dev_set_mtu, + .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, + .vlan_offload_set = mlx5_vlan_offload_set, + .filter_ctrl = mlx5_dev_filter_ctrl, + .rx_descriptor_status = mlx5_rx_descriptor_status, + .tx_descriptor_status = mlx5_tx_descriptor_status, + .rxq_info_get = mlx5_rxq_info_get, + .txq_info_get = mlx5_txq_info_get, + .rx_burst_mode_get = mlx5_rx_burst_mode_get, + .tx_burst_mode_get = mlx5_tx_burst_mode_get, + .rx_queue_intr_enable = mlx5_rx_intr_enable, + .rx_queue_intr_disable = mlx5_rx_intr_disable, + .is_removed = mlx5_is_removed, + .get_module_info = mlx5_get_module_info, + .get_module_eeprom = mlx5_get_module_eeprom, + .hairpin_cap_get = mlx5_hairpin_cap_get, + .mtr_ops_get = mlx5_flow_meter_ops_get, +}; + +/** + * Verify and store value for device argument. + * + * @param[in] key + * Key argument to verify. + * @param[in] val + * Value associated with key. + * @param opaque + * User data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_args_check(const char *key, const char *val, void *opaque) +{ + struct mlx5_dev_config *config = opaque; + unsigned long tmp; + + /* No-op, port representors are processed in mlx5_dev_spawn(). 
*/ + if (!strcmp(MLX5_REPRESENTOR, key)) + return 0; + errno = 0; + tmp = strtoul(val, NULL, 0); + if (errno) { + rte_errno = errno; + DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); + return -rte_errno; + } + if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { + config->cqe_comp = !!tmp; + } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) { + config->cqe_pad = !!tmp; + } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) { + config->hw_padding = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { + config->mprq.enabled = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { + config->mprq.stride_num_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) { + config->mprq.stride_size_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { + config->mprq.max_memcpy_len = tmp; + } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { + config->mprq.min_rxqs_num = tmp; + } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter," + " converted to txq_inline_max", key); + config->txq_inline_max = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) { + config->txq_inline_max = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) { + config->txq_inline_min = tmp; + } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) { + config->txq_inline_mpw = tmp; + } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { + config->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { + config->mps = !!tmp; + } else if (strcmp(MLX5_TX_DB_NC, key) == 0) { + if (tmp != MLX5_TXDB_CACHED && + tmp != MLX5_TXDB_NCACHED && + tmp != MLX5_TXDB_HEURISTIC) { + DRV_LOG(ERR, "invalid Tx doorbell " + "mapping parameter"); + rte_errno = EINVAL; + return -rte_errno; + } + config->dbnc = tmp; + } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter," + " converted to txq_inline_mpw", key); + config->txq_inline_mpw = tmp; + } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { + DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key); + } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { + config->rx_vec_en = !!tmp; + } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { + config->l3_vxlan_en = !!tmp; + } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { + config->vf_nl_en = !!tmp; + } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) { + config->dv_esw_en = !!tmp; + } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) { + config->dv_flow_en = !!tmp; + } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) { + if (tmp != MLX5_XMETA_MODE_LEGACY && + tmp != MLX5_XMETA_MODE_META16 && + tmp != MLX5_XMETA_MODE_META32) { + DRV_LOG(ERR, "invalid extensive " + "metadata parameter"); + rte_errno = EINVAL; + return -rte_errno; + } + config->dv_xmeta_en = tmp; + } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) { + config->mr_ext_memseg_en = !!tmp; + } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) { + config->max_dump_files_num = tmp; + } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) { + config->lro.timeout = tmp; + } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) { + DRV_LOG(DEBUG, "class argument is %s.", val); + } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) { + config->log_hp_size = tmp; + } else { + DRV_LOG(WARNING, "%s: unknown parameter", key); + rte_errno = EINVAL; + return 
-rte_errno; + } + return 0; +} + +/** + * Parse device parameters. + * + * @param config + * Pointer to device configuration structure. + * @param devargs + * Device arguments structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) +{ + const char **params = (const char *[]){ + MLX5_RXQ_CQE_COMP_EN, + MLX5_RXQ_CQE_PAD_EN, + MLX5_RXQ_PKT_PAD_EN, + MLX5_RX_MPRQ_EN, + MLX5_RX_MPRQ_LOG_STRIDE_NUM, + MLX5_RX_MPRQ_LOG_STRIDE_SIZE, + MLX5_RX_MPRQ_MAX_MEMCPY_LEN, + MLX5_RXQS_MIN_MPRQ, + MLX5_TXQ_INLINE, + MLX5_TXQ_INLINE_MIN, + MLX5_TXQ_INLINE_MAX, + MLX5_TXQ_INLINE_MPW, + MLX5_TXQS_MIN_INLINE, + MLX5_TXQS_MAX_VEC, + MLX5_TXQ_MPW_EN, + MLX5_TXQ_MPW_HDR_DSEG_EN, + MLX5_TXQ_MAX_INLINE_LEN, + MLX5_TX_DB_NC, + MLX5_TX_VEC_EN, + MLX5_RX_VEC_EN, + MLX5_L3_VXLAN_EN, + MLX5_VF_NL_EN, + MLX5_DV_ESW_EN, + MLX5_DV_FLOW_EN, + MLX5_DV_XMETA_EN, + MLX5_MR_EXT_MEMSEG_EN, + MLX5_REPRESENTOR, + MLX5_MAX_DUMP_FILES_NUM, + MLX5_LRO_TIMEOUT_USEC, + MLX5_CLASS_ARG_NAME, + MLX5_HP_BUF_SIZE, + NULL, + }; + struct rte_kvargs *kvlist; + int ret = 0; + int i; + + if (devargs == NULL) + return 0; + /* Following UGLY cast is done to pass checkpatch. */ + kvlist = rte_kvargs_parse(devargs->args, params); + if (kvlist == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + /* Process parameters. */ + for (i = 0; (params[i] != NULL); ++i) { + if (rte_kvargs_count(kvlist, params[i])) { + ret = rte_kvargs_process(kvlist, params[i], + mlx5_args_check, config); + if (ret) { + rte_errno = EINVAL; + rte_kvargs_free(kvlist); + return -rte_errno; + } + } + } + rte_kvargs_free(kvlist); + return 0; +} + +static struct rte_pci_driver mlx5_driver; + +/** + * PMD global initialization. + * + * Independent from individual device, this function initializes global + * per-PMD data structures distinguishing primary and secondary processes. + * Hence, each initialization is called once per a process. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_init_once(void) +{ + struct mlx5_shared_data *sd; + struct mlx5_local_data *ld = &mlx5_local_data; + int ret = 0; + + if (mlx5_init_shared_data()) + return -rte_errno; + sd = mlx5_shared_data; + MLX5_ASSERT(sd); + rte_spinlock_lock(&sd->lock); + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + if (sd->init_done) + break; + LIST_INIT(&sd->mem_event_cb_list); + rte_rwlock_init(&sd->mem_event_rwlock); + rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", + mlx5_mr_mem_event_cb, NULL); + ret = mlx5_mp_init_primary(MLX5_MP_NAME, + mlx5_mp_primary_handle); + if (ret) + goto out; + sd->init_done = true; + break; + case RTE_PROC_SECONDARY: + if (ld->init_done) + break; + ret = mlx5_mp_init_secondary(MLX5_MP_NAME, + mlx5_mp_secondary_handle); + if (ret) + goto out; + ++sd->secondary_cnt; + ld->init_done = true; + break; + default: + break; + } +out: + rte_spinlock_unlock(&sd->lock); + return ret; +} + +/** + * Configures the minimal amount of data to inline into WQE + * while sending packets. + * + * - the txq_inline_min has the maximal priority, if this + * key is specified in devargs + * - if DevX is enabled the inline mode is queried from the + * device (HCA attributes and NIC vport context if needed). + * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx + * and none (0 bytes) for other NICs + * + * @param spawn + * Verbs device parameters (name, port, switch_info) to spawn. 
+ * @param config + * Device configuration parameters. + */ +static void +mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn, + struct mlx5_dev_config *config) +{ + if (config->txq_inline_min != MLX5_ARG_UNSET) { + /* Application defines size of inlined data explicitly. */ + switch (spawn->pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + if (config->txq_inline_min < + (int)MLX5_INLINE_HSIZE_L2) { + DRV_LOG(DEBUG, + "txq_inline_min aligned to minimal" + " ConnectX-4 required value %d", + (int)MLX5_INLINE_HSIZE_L2); + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + } + break; + } + goto exit; + } + if (config->hca_attr.eth_net_offloads) { + /* We have DevX enabled, inline mode queried successfully. */ + switch (config->hca_attr.wqe_inline_mode) { + case MLX5_CAP_INLINE_MODE_L2: + /* outer L2 header must be inlined. */ + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + goto exit; + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + /* No inline data are required by NIC. */ + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + config->hw_vlan_insert = + config->hca_attr.wqe_vlan_insert; + DRV_LOG(DEBUG, "Tx VLAN insertion is supported"); + goto exit; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + /* inline mode is defined by NIC vport context. */ + if (!config->hca_attr.eth_virt) + break; + switch (config->hca_attr.vport_inline_mode) { + case MLX5_INLINE_MODE_NONE: + config->txq_inline_min = + MLX5_INLINE_HSIZE_NONE; + goto exit; + case MLX5_INLINE_MODE_L2: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L2; + goto exit; + case MLX5_INLINE_MODE_IP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L3; + goto exit; + case MLX5_INLINE_MODE_TCP_UDP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_L4; + goto exit; + case MLX5_INLINE_MODE_INNER_L2: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L2; + goto exit; + case MLX5_INLINE_MODE_INNER_IP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L3; + goto exit; + case MLX5_INLINE_MODE_INNER_TCP_UDP: + config->txq_inline_min = + MLX5_INLINE_HSIZE_INNER_L4; + goto exit; + } + } + } + /* + * We get here if we are unable to deduce + * inline data size with DevX. Try PCI ID + * to determine old NICs. + */ + switch (spawn->pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: + config->txq_inline_min = MLX5_INLINE_HSIZE_L2; + config->hw_vlan_insert = 0; + break; + case PCI_DEVICE_ID_MELLANOX_CONNECTX5: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + /* + * These NICs support VLAN insertion from WQE and + * report the wqe_vlan_insert flag. But there is a bug + * and PFC control may be broken, so disable the feature. + */ + config->hw_vlan_insert = 0; + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + break; + default: + config->txq_inline_min = MLX5_INLINE_HSIZE_NONE; + break; + } +exit: + DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min); +} + +/** + * Configures the metadata mask fields in the shared context. + * + * @param [in] dev + * Pointer to Ethernet device. 
+ */ +static void +mlx5_set_metadata_mask(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + uint32_t meta, mark, reg_c0; + + reg_c0 = ~priv->vport_meta_mask; + switch (priv->config.dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + meta = UINT32_MAX; + mark = MLX5_FLOW_MARK_MASK; + break; + case MLX5_XMETA_MODE_META16: + meta = reg_c0 >> rte_bsf32(reg_c0); + mark = MLX5_FLOW_MARK_MASK; + break; + case MLX5_XMETA_MODE_META32: + meta = UINT32_MAX; + mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK; + break; + default: + meta = 0; + mark = 0; + MLX5_ASSERT(false); + break; + } + if (sh->dv_mark_mask && sh->dv_mark_mask != mark) + DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X", + sh->dv_mark_mask, mark); + else + sh->dv_mark_mask = mark; + if (sh->dv_meta_mask && sh->dv_meta_mask != meta) + DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X", + sh->dv_meta_mask, meta); + else + sh->dv_meta_mask = meta; + if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0) + DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X", + sh->dv_regc0_mask, reg_c0); + else + sh->dv_regc0_mask = reg_c0; + DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en); + DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask); + DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask); + DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask); +} + +/** + * Allocate page of door-bells and register it using DevX API. + * + * @param [in] dev + * Pointer to Ethernet device. + * + * @return + * Pointer to new page on success, NULL otherwise. + */ +static struct mlx5_devx_dbr_page * +mlx5_alloc_dbr_page(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page; + + /* Allocate space for door-bell page and management data. */ + page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page), + RTE_CACHE_LINE_SIZE, dev->device->numa_node); + if (!page) { + DRV_LOG(ERR, "port %u cannot allocate dbr page", + dev->data->port_id); + return NULL; + } + /* Register allocated memory. */ + page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs, + MLX5_DBR_PAGE_SIZE, 0); + if (!page->umem) { + DRV_LOG(ERR, "port %u cannot umem reg dbr page", + dev->data->port_id); + rte_free(page); + return NULL; + } + return page; +} + +/** + * Find the next available door-bell, allocate new page if needed. + * + * @param [in] dev + * Pointer to Ethernet device. + * @param [out] dbr_page + * Door-bell page containing the page data. + * + * @return + * Door-bell address offset on success, a negative error value otherwise. + */ +int64_t +mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page = NULL; + uint32_t i, j; + + LIST_FOREACH(page, &priv->dbrpgs, next) + if (page->dbr_count < MLX5_DBR_PER_PAGE) + break; + if (!page) { /* No page with free door-bell exists. */ + page = mlx5_alloc_dbr_page(dev); + if (!page) /* Failed to allocate new page. */ + return (-1); + LIST_INSERT_HEAD(&priv->dbrpgs, page, next); + } + /* Loop to find bitmap part with clear bit. */ + for (i = 0; + i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX; + i++) + ; /* Empty. */ + /* Find the first clear bit. 
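Each 64-bit word of dbr_bitmap tracks 64 door-bell records; the offset returned below is ((i * 64) + j) * sizeof(uint64_t) bytes into the page. 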
*/ + MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE); + j = rte_bsf64(~page->dbr_bitmap[i]); + page->dbr_bitmap[i] |= (UINT64_C(1) << j); + page->dbr_count++; + *dbr_page = page; + return (((i * 64) + j) * sizeof(uint64_t)); +} + +/** + * Release a door-bell record. + * + * @param [in] dev + * Pointer to Ethernet device. + * @param [in] umem_id + * UMEM ID of page containing the door-bell record to release. + * @param [in] offset + * Offset of door-bell record in page. + * + * @return + * 0 on success, a negative error value otherwise. + */ +int32_t +mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_dbr_page *page = NULL; + int ret = 0; + + LIST_FOREACH(page, &priv->dbrpgs, next) + /* Find the page this address belongs to. */ + if (page->umem->umem_id == umem_id) + break; + if (!page) + return -EINVAL; + page->dbr_count--; + if (!page->dbr_count) { + /* Page not used, free it and remove from list. */ + LIST_REMOVE(page, next); + if (page->umem) + ret = -mlx5_glue->devx_umem_dereg(page->umem); + rte_free(page); + } else { + /* Mark in bitmap that this door-bell is not in use. */ + offset /= MLX5_DBR_SIZE; + int i = offset / 64; + int j = offset % 64; + + page->dbr_bitmap[i] &= ~(UINT64_C(1) << j); + } + return ret; +} + +int +rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n) +{ + static const char *const dynf_names[] = { + RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, + RTE_MBUF_DYNFLAG_METADATA_NAME + }; + unsigned int i; + + if (n < RTE_DIM(dynf_names)) + return -ENOMEM; + for (i = 0; i < RTE_DIM(dynf_names); i++) { + if (names[i] == NULL) + return -EINVAL; + strcpy(names[i], dynf_names[i]); + } + return RTE_DIM(dynf_names); +} + +/** + * Check sibling device configurations. + * + * Sibling devices sharing the Infiniband device context + * should have compatible configurations. This regards + * representors and bonding slaves. + * + * @param priv + * Private device descriptor. + * @param config + * Configuration of the device is going to be created. + * + * @return + * 0 on success, EINVAL otherwise + */ +static int +mlx5_dev_check_sibling_config(struct mlx5_priv *priv, + struct mlx5_dev_config *config) +{ + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_dev_config *sh_conf = NULL; + uint16_t port_id; + + MLX5_ASSERT(sh); + /* Nothing to compare for the single/first device. */ + if (sh->refcnt == 1) + return 0; + /* Find the device with shared context. */ + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (opriv && opriv != priv && opriv->sh == sh) { + sh_conf = &opriv->config; + break; + } + } + if (!sh_conf) + return 0; + if (sh_conf->dv_flow_en ^ config->dv_flow_en) { + DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch" + " for shared %s context", sh->ibdev_name); + rte_errno = EINVAL; + return rte_errno; + } + if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) { + DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch" + " for shared %s context", sh->ibdev_name); + rte_errno = EINVAL; + return rte_errno; + } + return 0; +} +/** + * Spawn an Ethernet device from Verbs information. + * + * @param dpdk_dev + * Backing DPDK device. + * @param spawn + * Verbs device parameters (name, port, switch_info) to spawn. + * @param config + * Device configuration parameters. + * + * @return + * A valid Ethernet device object on success, NULL otherwise and rte_errno + * is set. 
The following errors are defined: + * + * EBUSY: device is not supposed to be spawned. + * EEXIST: device is already spawned + */ +static struct rte_eth_dev * +mlx5_dev_spawn(struct rte_device *dpdk_dev, + struct mlx5_dev_spawn_data *spawn, + struct mlx5_dev_config config) +{ + const struct mlx5_switch_info *switch_info = &spawn->info; + struct mlx5_ibv_shared *sh = NULL; + struct ibv_port_attr port_attr; + struct mlx5dv_context dv_attr = { .comp_mask = 0 }; + struct rte_eth_dev *eth_dev = NULL; + struct mlx5_priv *priv = NULL; + int err = 0; + unsigned int hw_padding = 0; + unsigned int mps; + unsigned int cqe_comp; + unsigned int cqe_pad = 0; + unsigned int tunnel_en = 0; + unsigned int mpls_en = 0; + unsigned int swp = 0; + unsigned int mprq = 0; + unsigned int mprq_min_stride_size_n = 0; + unsigned int mprq_max_stride_size_n = 0; + unsigned int mprq_min_stride_num_n = 0; + unsigned int mprq_max_stride_num_n = 0; + struct rte_ether_addr mac; + char name[RTE_ETH_NAME_MAX_LEN]; + int own_domain_id = 0; + uint16_t port_id; + unsigned int i; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + struct mlx5dv_devx_port devx_port = { .comp_mask = 0 }; +#endif + + /* Determine if this port representor is supposed to be spawned. */ + if (switch_info->representor && dpdk_dev->devargs) { + struct rte_eth_devargs eth_da; + + err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); + if (err) { + rte_errno = -err; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + return NULL; + } + for (i = 0; i < eth_da.nb_representor_ports; ++i) + if (eth_da.representor_ports[i] == + (uint16_t)switch_info->port_name) + break; + if (i == eth_da.nb_representor_ports) { + rte_errno = EBUSY; + return NULL; + } + } + /* Build device name. */ + if (spawn->pf_bond < 0) { + /* Single device. */ + if (!switch_info->representor) + strlcpy(name, dpdk_dev->name, sizeof(name)); + else + snprintf(name, sizeof(name), "%s_representor_%u", + dpdk_dev->name, switch_info->port_name); + } else { + /* Bonding device. */ + if (!switch_info->representor) + snprintf(name, sizeof(name), "%s_%s", + dpdk_dev->name, spawn->ibv_dev->name); + else + snprintf(name, sizeof(name), "%s_%s_representor_%u", + dpdk_dev->name, spawn->ibv_dev->name, + switch_info->port_name); + } + /* check if the device is already spawned */ + if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { + rte_errno = EEXIST; + return NULL; + } + DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + struct mlx5_mp_id mp_id; + + eth_dev = rte_eth_dev_attach_secondary(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not attach rte ethdev"); + rte_errno = ENOMEM; + return NULL; + } + eth_dev->device = dpdk_dev; + eth_dev->dev_ops = &mlx5_dev_sec_ops; + err = mlx5_proc_priv_init(eth_dev); + if (err) + return NULL; + mp_id.port_id = eth_dev->data->port_id; + strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); + /* Receive command fd from primary process */ + err = mlx5_mp_req_verbs_cmd_fd(&mp_id); + if (err < 0) + return NULL; + /* Remap UAR for Tx queues. */ + err = mlx5_tx_uar_init_secondary(eth_dev, err); + if (err) + return NULL; + /* + * Ethdev pointer is still required as input since + * the primary device is not accessible from the + * secondary process. 
+ */ + eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); + eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); + return eth_dev; + } + /* + * Some parameters ("tx_db_nc" in particularly) are needed in + * advance to create dv/verbs device context. We proceed the + * devargs here to get ones, and later proceed devargs again + * to override some hardware settings. + */ + err = mlx5_args(&config, dpdk_dev->devargs); + if (err) { + err = rte_errno; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + goto error; + } + sh = mlx5_alloc_shared_ibctx(spawn, &config); + if (!sh) + return NULL; + config.devx = sh->devx; +#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR + config.dest_tir = 1; +#endif +#ifdef HAVE_IBV_MLX5_MOD_SWP + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; +#endif + /* + * Multi-packet send is supported by ConnectX-4 Lx PF as well + * as all ConnectX-5 devices. + */ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; +#endif + mlx5_glue->dv_query_device(sh->ctx, &dv_attr); + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { + DRV_LOG(DEBUG, "enhanced MPW is supported"); + mps = MLX5_MPW_ENHANCED; + } else { + DRV_LOG(DEBUG, "MPW is supported"); + mps = MLX5_MPW; + } + } else { + DRV_LOG(DEBUG, "MPW isn't supported"); + mps = MLX5_MPW_DISABLED; + } +#ifdef HAVE_IBV_MLX5_MOD_SWP + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) + swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; + DRV_LOG(DEBUG, "SWP support: %u", swp); +#endif + config.swp = !!swp; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { + struct mlx5dv_striding_rq_caps mprq_caps = + dv_attr.striding_rq_caps; + + DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", + mprq_caps.min_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", + mprq_caps.max_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", + mprq_caps.min_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", + mprq_caps.max_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tsupported_qpts: %d", + mprq_caps.supported_qpts); + DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); + mprq = 1; + mprq_min_stride_size_n = + mprq_caps.min_single_stride_log_num_of_bytes; + mprq_max_stride_size_n = + mprq_caps.max_single_stride_log_num_of_bytes; + mprq_min_stride_num_n = + mprq_caps.min_single_wqe_log_num_of_strides; + mprq_max_stride_num_n = + mprq_caps.max_single_wqe_log_num_of_strides; + } +#endif + if (RTE_CACHE_LINE_SIZE == 128 && + !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) + cqe_comp = 0; + else + cqe_comp = 1; + config.cqe_comp = cqe_comp; +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + /* Whether device supports 128B Rx CQE padding. 
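Padding is only used when the cache line size is 128B and the device reports the CQE_128B_PAD flag. 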
*/ + cqe_pad = RTE_CACHE_LINE_SIZE == 128 && + (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); +#endif +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { + tunnel_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE)); + } + DRV_LOG(DEBUG, "tunnel offloading is %ssupported", + tunnel_en ? "" : "not "); +#else + DRV_LOG(WARNING, + "tunnel offloading disabled due to old OFED/rdma-core version"); +#endif + config.tunnel_en = tunnel_en; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + mpls_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); + DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", + mpls_en ? "" : "not "); +#else + DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" + " old OFED/rdma-core version or firmware configuration"); +#endif + config.mpls_en = mpls_en; + /* Check port status. */ + err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); + if (err) { + DRV_LOG(ERR, "port query failed: %s", strerror(err)); + goto error; + } + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + DRV_LOG(ERR, "port is not configured in Ethernet mode"); + err = EINVAL; + goto error; + } + if (port_attr.state != IBV_PORT_ACTIVE) + DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", + mlx5_glue->port_state_str(port_attr.state), + port_attr.state); + /* Allocate private eth device data. */ + priv = rte_zmalloc("ethdev private structure", + sizeof(*priv), + RTE_CACHE_LINE_SIZE); + if (priv == NULL) { + DRV_LOG(ERR, "priv allocation failure"); + err = ENOMEM; + goto error; + } + priv->sh = sh; + priv->ibv_port = spawn->ibv_port; + priv->pci_dev = spawn->pci_dev; + priv->mtu = RTE_ETHER_MTU; + priv->mp_id.port_id = port_id; + strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN); +#ifndef RTE_ARCH_64 + /* Initialize UAR access locks for 32bit implementations. */ + rte_spinlock_init(&priv->uar_lock_cq); + for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) + rte_spinlock_init(&priv->uar_lock[i]); +#endif + /* Some internal functions rely on Netlink sockets, open them now. */ + priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); + priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); + priv->representor = !!switch_info->representor; + priv->master = !!switch_info->master; + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + priv->vport_meta_tag = 0; + priv->vport_meta_mask = 0; + priv->pf_bond = spawn->pf_bond; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + /* + * The DevX port query API is implemented. E-Switch may use + * either vport or reg_c[0] metadata register to match on + * vport index. The engaged part of metadata register is + * defined by mask. 
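+ * A zero reg_c_0 mask reported for a representor or master port is rejected below as unsupported. 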
+ */ + if (switch_info->representor || switch_info->master) { + devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | + MLX5DV_DEVX_PORT_MATCH_REG_C_0; + err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, + &devx_port); + if (err) { + DRV_LOG(WARNING, + "can't query devx port %d on device %s", + spawn->ibv_port, spawn->ibv_dev->name); + devx_port.comp_mask = 0; + } + } + if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { + priv->vport_meta_tag = devx_port.reg_c_0.value; + priv->vport_meta_mask = devx_port.reg_c_0.mask; + if (!priv->vport_meta_mask) { + DRV_LOG(ERR, "vport zero mask for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + if (priv->vport_meta_tag & ~priv->vport_meta_mask) { + DRV_LOG(ERR, "invalid vport tag for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + } + if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { + priv->vport_id = devx_port.vport_num; + } else if (spawn->pf_bond >= 0) { + DRV_LOG(ERR, "can't deduce vport index for port %d" + " on bonding device %s", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } else { + /* Suppose vport index in compatible way. */ + priv->vport_id = switch_info->representor ? + switch_info->port_name + 1 : -1; + } +#else + /* + * Kernel/rdma_core support single E-Switch per PF configurations + * only and vport_id field contains the vport index for + * associated VF, which is deduced from representor port name. + * For example, let's have the IB device port 10, it has + * attached network device eth0, which has port name attribute + * pf0vf2, we can deduce the VF number as 2, and set vport index + * as 3 (2+1). This assigning schema should be changed if the + * multiple E-Switch instances per PF configurations or/and PCI + * subfunctions are added. + */ + priv->vport_id = switch_info->representor ? + switch_info->port_name + 1 : -1; +#endif + /* representor_id field keeps the unmodified VF index. */ + priv->representor_id = switch_info->representor ? + switch_info->port_name : -1; + /* + * Look for sibling devices in order to reuse their switch domain + * if any, otherwise allocate one. + */ + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + const struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->sh != priv->sh || + opriv->domain_id == + RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) + continue; + priv->domain_id = opriv->domain_id; + break; + } + if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + err = rte_eth_switch_domain_alloc(&priv->domain_id); + if (err) { + err = rte_errno; + DRV_LOG(ERR, "unable to allocate switch domain: %s", + strerror(rte_errno)); + goto error; + } + own_domain_id = 1; + } + /* Override some values set by hardware configuration. */ + mlx5_args(&config, dpdk_dev->devargs); + err = mlx5_dev_check_sibling_config(priv, &config); + if (err) + goto error; + config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & + IBV_DEVICE_RAW_IP_CSUM); + DRV_LOG(DEBUG, "checksum offloading is %ssupported", + (config.hw_csum ? 
"" : "not ")); +#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ + !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + DRV_LOG(DEBUG, "counters are not supported"); +#endif +#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR) + if (config.dv_flow_en) { + DRV_LOG(WARNING, "DV flow is not supported"); + config.dv_flow_en = 0; + } +#endif + config.ind_table_max_size = + sh->device_attr.rss_caps.max_rwq_indirection_table_size; + /* + * Remove this check once DPDK supports larger/variable + * indirection tables. + */ + if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) + config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; + DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", + config.ind_table_max_size); + config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); + DRV_LOG(DEBUG, "VLAN stripping is %ssupported", + (config.hw_vlan_strip ? "" : "not ")); + config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_SCATTER_FCS); + DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", + (config.hw_fcs_strip ? "" : "not ")); +#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) + hw_padding = !!sh->device_attr.rx_pad_end_addr_align; +#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) + hw_padding = !!(sh->device_attr.device_cap_flags_ex & + IBV_DEVICE_PCI_WRITE_END_PADDING); +#endif + if (config.hw_padding && !hw_padding) { + DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); + config.hw_padding = 0; + } else if (config.hw_padding) { + DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); + } + config.tso = (sh->device_attr.tso_caps.max_tso > 0 && + (sh->device_attr.tso_caps.supported_qpts & + (1 << IBV_QPT_RAW_PACKET))); + if (config.tso) + config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; + /* + * MPW is disabled by default, while the Enhanced MPW is enabled + * by default. + */ + if (config.mps == MLX5_ARG_UNSET) + config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : + MLX5_MPW_DISABLED; + else + config.mps = config.mps ? mps : MLX5_MPW_DISABLED; + DRV_LOG(INFO, "%sMPS is %s", + config.mps == MLX5_MPW_ENHANCED ? "enhanced " : + config.mps == MLX5_MPW ? "legacy " : "", + config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); + if (config.cqe_comp && !cqe_comp) { + DRV_LOG(WARNING, "Rx CQE compression isn't supported"); + config.cqe_comp = 0; + } + if (config.cqe_pad && !cqe_pad) { + DRV_LOG(WARNING, "Rx CQE padding isn't supported"); + config.cqe_pad = 0; + } else if (config.cqe_pad) { + DRV_LOG(INFO, "Rx CQE padding is enabled"); + } + if (config.devx) { + priv->counter_fallback = 0; + err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); + if (err) { + err = -err; + goto error; + } + if (!config.hca_attr.flow_counters_dump) + priv->counter_fallback = 1; +#ifndef HAVE_IBV_DEVX_ASYNC + priv->counter_fallback = 1; +#endif + if (priv->counter_fallback) + DRV_LOG(INFO, "Use fall-back DV counter management"); + /* Check for LRO support. */ + if (config.dest_tir && config.hca_attr.lro_cap && + config.dv_flow_en) { + /* TBD check tunnel lro caps. */ + config.lro.supported = config.hca_attr.lro_cap; + DRV_LOG(DEBUG, "Device supports LRO"); + /* + * If LRO timeout is not configured by application, + * use the minimal supported value. 
+ */ + if (!config.lro.timeout) + config.lro.timeout = + config.hca_attr.lro_timer_supported_periods[0]; + DRV_LOG(DEBUG, "LRO session timeout set to %d usec", + config.lro.timeout); + } +#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) + if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup && + config.dv_flow_en) { + uint8_t reg_c_mask = + config.hca_attr.qos.flow_meter_reg_c_ids; + /* + * Meter needs two REG_C's for color match and pre-sfx + * flow match. Here get the REG_C for color match. + * REG_C_0 and REG_C_1 is reserved for metadata feature. + */ + reg_c_mask &= 0xfc; + if (__builtin_popcount(reg_c_mask) < 1) { + priv->mtr_en = 0; + DRV_LOG(WARNING, "No available register for" + " meter."); + } else { + priv->mtr_color_reg = ffs(reg_c_mask) - 1 + + REG_C_0; + priv->mtr_en = 1; + priv->mtr_reg_share = + config.hca_attr.qos.flow_meter_reg_share; + DRV_LOG(DEBUG, "The REG_C meter uses is %d", + priv->mtr_color_reg); + } + } +#endif + } + if (config.mprq.enabled && mprq) { + if (config.mprq.stride_num_n && + (config.mprq.stride_num_n > mprq_max_stride_num_n || + config.mprq.stride_num_n < mprq_min_stride_num_n)) { + config.mprq.stride_num_n = + RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n), + mprq_max_stride_num_n); + DRV_LOG(WARNING, + "the number of strides" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_num_n); + } + if (config.mprq.stride_size_n && + (config.mprq.stride_size_n > mprq_max_stride_size_n || + config.mprq.stride_size_n < mprq_min_stride_size_n)) { + config.mprq.stride_size_n = + RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, + mprq_min_stride_size_n), + mprq_max_stride_size_n); + DRV_LOG(WARNING, + "the size of a stride" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_size_n); + } + config.mprq.min_stride_size_n = mprq_min_stride_size_n; + config.mprq.max_stride_size_n = mprq_max_stride_size_n; + } else if (config.mprq.enabled && !mprq) { + DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); + config.mprq.enabled = 0; + } + if (config.max_dump_files_num == 0) + config.max_dump_files_num = 128; + eth_dev = rte_eth_dev_allocate(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not allocate rte ethdev"); + err = ENOMEM; + goto error; + } + /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ + eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; + if (priv->representor) { + eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; + eth_dev->data->representor_id = priv->representor_id; + } + /* + * Store associated network device interface index. This index + * is permanent throughout the lifetime of device. So, we may store + * the ifindex here and use the cached value further. + */ + MLX5_ASSERT(spawn->ifindex); + priv->if_index = spawn->ifindex; + eth_dev->data->dev_private = priv; + priv->dev_data = eth_dev->data; + eth_dev->data->mac_addrs = priv->mac; + eth_dev->device = dpdk_dev; + /* Configure the first MAC address by default. */ + if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { + DRV_LOG(ERR, + "port %u cannot get MAC address, is mlx5_en" + " loaded? 
(errno: %s)", + eth_dev->data->port_id, strerror(rte_errno)); + err = ENODEV; + goto error; + } + DRV_LOG(INFO, + "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", + eth_dev->data->port_id, + mac.addr_bytes[0], mac.addr_bytes[1], + mac.addr_bytes[2], mac.addr_bytes[3], + mac.addr_bytes[4], mac.addr_bytes[5]); +#ifdef RTE_LIBRTE_MLX5_DEBUG + { + char ifname[IF_NAMESIZE]; + + if (mlx5_get_ifname(eth_dev, &ifname) == 0) + DRV_LOG(DEBUG, "port %u ifname is \"%s\"", + eth_dev->data->port_id, ifname); + else + DRV_LOG(DEBUG, "port %u ifname is unknown", + eth_dev->data->port_id); + } +#endif + /* Get actual MTU if possible. */ + err = mlx5_get_mtu(eth_dev, &priv->mtu); + if (err) { + err = rte_errno; + goto error; + } + DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, + priv->mtu); + /* Initialize burst functions to prevent crashes before link-up. */ + eth_dev->rx_pkt_burst = removed_rx_burst; + eth_dev->tx_pkt_burst = removed_tx_burst; + eth_dev->dev_ops = &mlx5_dev_ops; + /* Register MAC address. */ + claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); + if (config.vf && config.vf_nl_en) + mlx5_nl_mac_addr_sync(priv->nl_socket_route, + mlx5_ifindex(eth_dev), + eth_dev->data->mac_addrs, + MLX5_MAX_MAC_ADDRESSES); + priv->flows = 0; + priv->ctrl_flows = 0; + TAILQ_INIT(&priv->flow_meters); + TAILQ_INIT(&priv->flow_meter_profiles); + /* Hint libmlx5 to use PMD allocator for data plane resources */ + struct mlx5dv_ctx_allocators alctr = { + .alloc = &mlx5_alloc_verbs_buf, + .free = &mlx5_free_verbs_buf, + .data = priv, + }; + mlx5_glue->dv_set_context_attr(sh->ctx, + MLX5DV_CTX_ATTR_BUF_ALLOCATORS, + (void *)((uintptr_t)&alctr)); + /* Bring Ethernet device up. */ + DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", + eth_dev->data->port_id); + mlx5_set_link_up(eth_dev); + /* + * Even though the interrupt handler is not installed yet, + * interrupts will still trigger on the async_fd from + * Verbs context returned by ibv_open_device(). + */ + mlx5_link_update(eth_dev, 0); +#ifdef HAVE_MLX5DV_DR_ESWITCH + if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && + (switch_info->representor || switch_info->master))) + config.dv_esw_en = 0; +#else + config.dv_esw_en = 0; +#endif + /* Detect minimal data bytes to inline. */ + mlx5_set_min_inline(spawn, &config); + /* Store device configuration on private structure. */ + priv->config = config; + /* Create context for virtual machine VLAN workaround. */ + priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); + if (config.dv_flow_en) { + err = mlx5_alloc_shared_dr(priv); + if (err) + goto error; + /* + * RSS id is shared with meter flow id. Meter flow id can only + * use the 24 MSB of the register. + */ + priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> + MLX5_MTR_COLOR_BITS); + if (!priv->qrss_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; + goto error; + } + } + /* Supported Verbs flow priority number detection. 
*/ + err = mlx5_flow_discover_priorities(eth_dev); + if (err < 0) { + err = -err; + goto error; + } + priv->config.flow_prio = err; + if (!priv->config.dv_esw_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + DRV_LOG(WARNING, "metadata mode %u is not supported " + "(no E-Switch)", priv->config.dv_xmeta_en); + priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY; + } + mlx5_set_metadata_mask(eth_dev); + if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + !priv->sh->dv_regc0_mask) { + DRV_LOG(ERR, "metadata mode %u is not supported " + "(no metadata reg_c[0] is available)", + priv->config.dv_xmeta_en); + err = ENOTSUP; + goto error; + } + /* + * Allocate the buffer for flow creation, just once. + * The allocation must be done before any flow is created. + */ + mlx5_flow_alloc_intermediate(eth_dev); + /* Query availability of metadata reg_c's. */ + err = mlx5_flow_discover_mreg_c(eth_dev); + if (err < 0) { + err = -err; + goto error; + } + if (!mlx5_flow_ext_mreg_supported(eth_dev)) { + DRV_LOG(DEBUG, + "port %u extensive metadata register is not supported", + eth_dev->data->port_id); + if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + DRV_LOG(ERR, "metadata mode %u is not supported " + "(no metadata registers available)", + priv->config.dv_xmeta_en); + err = ENOTSUP; + goto error; + } + } + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(eth_dev) && + priv->sh->dv_regc0_mask) { + priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME, + MLX5_FLOW_MREG_HTABLE_SZ); + if (!priv->mreg_cp_tbl) { + err = ENOMEM; + goto error; + } + } + return eth_dev; +error: + if (priv) { + if (priv->mreg_cp_tbl) + mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL); + if (priv->sh) + mlx5_free_shared_dr(priv); + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->vmwa_context) + mlx5_vlan_vmwa_exit(priv->vmwa_context); + if (priv->qrss_id_pool) + mlx5_flow_id_pool_release(priv->qrss_id_pool); + if (own_domain_id) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + rte_free(priv); + if (eth_dev != NULL) + eth_dev->data->dev_private = NULL; + } + if (eth_dev != NULL) { + /* mac_addrs must not be freed alone because it is part of dev_private */ + eth_dev->data->mac_addrs = NULL; + rte_eth_dev_release_port(eth_dev); + } + if (sh) + mlx5_free_shared_ibctx(sh); + MLX5_ASSERT(err > 0); + rte_errno = err; + return NULL; +} + +/** + * Comparison callback to sort device data. + * + * This is meant to be used with qsort(). + * + * @param a[in] + * Pointer to pointer to first data object. + * @param b[in] + * Pointer to pointer to second data object. + * + * @return + * 0 if both objects are equal, less than 0 if the first argument is less + * than the second, greater than 0 otherwise. + */ +static int +mlx5_dev_spawn_data_cmp(const void *a, const void *b) +{ + const struct mlx5_switch_info *si_a = + &((const struct mlx5_dev_spawn_data *)a)->info; + const struct mlx5_switch_info *si_b = + &((const struct mlx5_dev_spawn_data *)b)->info; + int ret; + + /* Master device first. */ + ret = si_b->master - si_a->master; + if (ret) + return ret; + /* Then representor devices. */ + ret = si_b->representor - si_a->representor; + if (ret) + return ret; + /* Unidentified devices come last in no specific order. */ + if (!si_a->representor) + return 0; + /* Order representors by name. 
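The numeric port_name values are compared, so lower representor indices sort first. 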
*/ + return si_a->port_name - si_b->port_name; +} + +/** + * Match PCI information for possible slaves of bonding device. + * + * @param[in] ibv_dev + * Pointer to Infiniband device structure. + * @param[in] pci_dev + * Pointer to PCI device structure to match PCI address. + * @param[in] nl_rdma + * Netlink RDMA group socket handle. + * + * @return + * negative value if no bonding device found, otherwise + * positive index of slave PF in bonding. + */ +static int +mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, + const struct rte_pci_device *pci_dev, + int nl_rdma) +{ + char ifname[IF_NAMESIZE + 1]; + unsigned int ifindex; + unsigned int np, i; + FILE *file = NULL; + int pf = -1; + + /* + * Try to get the master device name. If something goes + * wrong, assume there is no kernel support and no + * bonding devices. + */ + if (nl_rdma < 0) + return -1; + if (!strstr(ibv_dev->name, "bond")) + return -1; + np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); + if (!np) + return -1; + /* + * The master device might not be on the predefined + * port (port index 1 is not guaranteed), so we have + * to scan all Infiniband device ports and find the + * master. + */ + for (i = 1; i <= np; ++i) { + /* Check whether Infiniband port is populated. */ + ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); + if (!ifindex) + continue; + if (!if_indextoname(ifindex, ifname)) + continue; + /* Try to read bonding slave names from sysfs. */ + MKSTR(slaves, + "/sys/class/net/%s/master/bonding/slaves", ifname); + file = fopen(slaves, "r"); + if (file) + break; + } + if (!file) + return -1; + /* Use safe format to check maximal buffer length. */ + MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); + while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { + char tmp_str[IF_NAMESIZE + 32]; + struct rte_pci_addr pci_addr; + struct mlx5_switch_info info; + + /* Process slave interface names in the loop. */ + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s", ifname); + if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { + DRV_LOG(WARNING, "can not get PCI address" + " for netdev \"%s\"", ifname); + continue; + } + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + /* Slave interface PCI address match found. */ + fclose(file); + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s/phys_port_name", ifname); + file = fopen(tmp_str, "rb"); + if (!file) + break; + info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; + if (fscanf(file, "%32s", tmp_str) == 1) + mlx5_translate_port_name(tmp_str, &info); + if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || + info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) + pf = info.port_name; + break; + } + if (file) + fclose(file); + return pf; +} + +/** + * DPDK callback to register a PCI device. + * + * This function spawns Ethernet devices out of a given PCI device. + * + * @param[in] pci_drv + * PCI driver structure (mlx5_driver). + * @param[in] pci_dev + * PCI device information. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + struct ibv_device **ibv_list; + /* + * Number of found IB Devices matching with requested PCI BDF. + * nd != 1 means there are multiple IB devices over the same + * PCI device and we have representors and master. 
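+ * nd == 0 means no matching device was found and probing fails with ENOENT below. 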
+ */ + unsigned int nd = 0; + /* + * Number of found IB device Ports. nd = 1 and np = 1..n means + * we have the single multiport IB device, and there may be + * representors attached to some of found ports. + */ + unsigned int np = 0; + /* + * Number of DPDK ethernet devices to Spawn - either over + * multiple IB devices or multiple ports of single IB device. + * Actually this is the number of iterations to spawn. + */ + unsigned int ns = 0; + /* + * Bonding device + * < 0 - no bonding device (single one) + * >= 0 - bonding device (value is slave PF index) + */ + int bd = -1; + struct mlx5_dev_spawn_data *list = NULL; + struct mlx5_dev_config dev_config; + int ret; + + if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) { + DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5" + " driver."); + return 1; + } + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + mlx5_pmd_socket_init(); + ret = mlx5_init_once(); + if (ret) { + DRV_LOG(ERR, "unable to init PMD global data: %s", + strerror(rte_errno)); + return -rte_errno; + } + MLX5_ASSERT(pci_drv == &mlx5_driver); + errno = 0; + ibv_list = mlx5_glue->get_device_list(&ret); + if (!ibv_list) { + rte_errno = errno ? errno : ENOSYS; + DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); + return -rte_errno; + } + /* + * First scan the list of all Infiniband devices to find + * matching ones, gathering into the list. + */ + struct ibv_device *ibv_match[ret + 1]; + int nl_route = mlx5_nl_init(NETLINK_ROUTE); + int nl_rdma = mlx5_nl_init(NETLINK_RDMA); + unsigned int i; + + while (ret-- > 0) { + struct rte_pci_addr pci_addr; + + DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); + bd = mlx5_device_bond_pci_match + (ibv_list[ret], pci_dev, nl_rdma); + if (bd >= 0) { + /* + * Bonding device detected. Only one match is allowed, + * the bonding is supported over multi-port IB device, + * there should be no matches on representor PCI + * functions or non VF LAG bonding devices with + * specified address. + */ + if (nd) { + DRV_LOG(ERR, + "multiple PCI match on bonding device" + "\"%s\" found", ibv_list[ret]->name); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + DRV_LOG(INFO, "PCI information matches for" + " slave %d bonding device \"%s\"", + bd, ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + break; + } + if (mlx5_dev_to_pci_addr + (ibv_list[ret]->ibdev_path, &pci_addr)) + continue; + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + DRV_LOG(INFO, "PCI information matches for device \"%s\"", + ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + } + ibv_match[nd] = NULL; + if (!nd) { + /* No device matches, just complain and bail out. */ + DRV_LOG(WARNING, + "no Verbs device matches PCI device " PCI_PRI_FMT "," + " are kernel drivers loaded?", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + if (nd == 1) { + /* + * Found single matching device may have multiple ports. + * Each port may be representor, we have to check the port + * number and check the representors existence. 
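+ * The port count is queried over the Netlink RDMA socket below. 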
+ */ + if (nl_rdma >= 0) + np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); + if (!np) + DRV_LOG(WARNING, "can not get IB device \"%s\"" + " ports number", ibv_match[0]->name); + if (bd >= 0 && !np) { + DRV_LOG(ERR, "can not get ports" + " for bonding device"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } +#ifndef HAVE_MLX5DV_DR_DEVX_PORT + if (bd >= 0) { + /* + * This may happen if there is VF LAG kernel support and + * application is compiled with older rdma_core library. + */ + DRV_LOG(ERR, + "No kernel/verbs support for VF LAG bonding found."); + rte_errno = ENOTSUP; + ret = -rte_errno; + goto exit; + } +#endif + /* + * Now we can determine the maximal + * amount of devices to be spawned. + */ + list = rte_zmalloc("device spawn data", + sizeof(struct mlx5_dev_spawn_data) * + (np ? np : nd), + RTE_CACHE_LINE_SIZE); + if (!list) { + DRV_LOG(ERR, "spawn data array allocation failure"); + rte_errno = ENOMEM; + ret = -rte_errno; + goto exit; + } + if (bd >= 0 || np > 1) { + /* + * Single IB device with multiple ports found, + * it may be E-Switch master device and representors. + * We have to perform identification through the ports. + */ + MLX5_ASSERT(nl_rdma >= 0); + MLX5_ASSERT(ns == 0); + MLX5_ASSERT(nd == 1); + MLX5_ASSERT(np); + for (i = 1; i <= np; ++i) { + list[ns].max_port = np; + list[ns].ibv_port = i; + list[ns].ibv_dev = ibv_match[0]; + list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; + list[ns].pf_bond = bd; + list[ns].ifindex = mlx5_nl_ifindex + (nl_rdma, list[ns].ibv_dev->name, i); + if (!list[ns].ifindex) { + /* + * No network interface index found for the + * specified port, it means there is no + * representor on this port. It's OK, + * there can be disabled ports, for example + * if sriov_numvfs < sriov_totalvfs. + */ + continue; + } + ret = -1; + if (nl_route >= 0) + ret = mlx5_nl_switch_info + (nl_route, + list[ns].ifindex, + &list[ns].info); + if (ret || (!list[ns].info.representor && + !list[ns].info.master)) { + /* + * We failed to recognize representors with + * Netlink, let's try to perform the task + * with sysfs. + */ + ret = mlx5_sysfs_switch_info + (list[ns].ifindex, + &list[ns].info); + } + if (!ret && bd >= 0) { + switch (list[ns].info.name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + if (list[ns].info.port_name == bd) + ns++; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + if (list[ns].info.pf_num == bd) + ns++; + break; + default: + break; + } + continue; + } + if (!ret && (list[ns].info.representor ^ + list[ns].info.master)) + ns++; + } + if (!ns) { + DRV_LOG(ERR, + "unable to recognize master/representors" + " on the IB device with multiple ports"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } else { + /* + * The existence of several matching entries (nd > 1) means + * port representors have been instantiated. No existing Verbs + * call nor sysfs entries can tell them apart, this can only + * be done through Netlink calls assuming kernel drivers are + * recent enough to support them. + * + * In the event of identification failure through Netlink, + * try again through sysfs, then: + * + * 1. A single IB device matches (nd == 1) with single + * port (np=0/1) and is not a representor, assume + * no switch support. + * + * 2. Otherwise no safe assumptions can be made; + * complain louder and bail out. 
+ */ + np = 1; + for (i = 0; i != nd; ++i) { + memset(&list[ns].info, 0, sizeof(list[ns].info)); + list[ns].max_port = 1; + list[ns].ibv_port = 1; + list[ns].ibv_dev = ibv_match[i]; + list[ns].eth_dev = NULL; + list[ns].pci_dev = pci_dev; + list[ns].pf_bond = -1; + list[ns].ifindex = 0; + if (nl_rdma >= 0) + list[ns].ifindex = mlx5_nl_ifindex + (nl_rdma, list[ns].ibv_dev->name, 1); + if (!list[ns].ifindex) { + char ifname[IF_NAMESIZE]; + + /* + * Netlink failed, it may happen with old + * ib_core kernel driver (before 4.16). + * We can assume there is old driver because + * here we are processing single ports IB + * devices. Let's try sysfs to retrieve + * the ifindex. The method works for + * master device only. + */ + if (nd > 1) { + /* + * Multiple devices found, assume + * representors, can not distinguish + * master/representor and retrieve + * ifindex via sysfs. + */ + continue; + } + ret = mlx5_get_master_ifname + (ibv_match[i]->ibdev_path, &ifname); + if (!ret) + list[ns].ifindex = + if_nametoindex(ifname); + if (!list[ns].ifindex) { + /* + * No network interface index found + * for the specified device, it means + * there it is neither representor + * nor master. + */ + continue; + } + } + ret = -1; + if (nl_route >= 0) + ret = mlx5_nl_switch_info + (nl_route, + list[ns].ifindex, + &list[ns].info); + if (ret || (!list[ns].info.representor && + !list[ns].info.master)) { + /* + * We failed to recognize representors with + * Netlink, let's try to perform the task + * with sysfs. + */ + ret = mlx5_sysfs_switch_info + (list[ns].ifindex, + &list[ns].info); + } + if (!ret && (list[ns].info.representor ^ + list[ns].info.master)) { + ns++; + } else if ((nd == 1) && + !list[ns].info.representor && + !list[ns].info.master) { + /* + * Single IB device with + * one physical port and + * attached network device. + * May be SRIOV is not enabled + * or there is no representors. + */ + DRV_LOG(INFO, "no E-Switch support detected"); + ns++; + break; + } + } + if (!ns) { + DRV_LOG(ERR, + "unable to recognize master/representors" + " on the multiple IB devices"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } + MLX5_ASSERT(ns); + /* + * Sort list to probe devices in natural order for users convenience + * (i.e. master first, then representors from lowest to highest ID). + */ + qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); + /* Default configuration. */ + dev_config = (struct mlx5_dev_config){ + .hw_padding = 0, + .mps = MLX5_ARG_UNSET, + .dbnc = MLX5_ARG_UNSET, + .rx_vec_en = 1, + .txq_inline_max = MLX5_ARG_UNSET, + .txq_inline_min = MLX5_ARG_UNSET, + .txq_inline_mpw = MLX5_ARG_UNSET, + .txqs_inline = MLX5_ARG_UNSET, + .vf_nl_en = 1, + .mr_ext_memseg_en = 1, + .mprq = { + .enabled = 0, /* Disabled by default. */ + .stride_num_n = 0, + .stride_size_n = 0, + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, + .dv_esw_en = 1, + .dv_flow_en = 1, + .log_hp_size = MLX5_ARG_UNSET, + }; + /* Device specific configuration. 
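Only the VF device IDs set dev_config.vf below; physical function devices keep the defaults. 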
*/ + switch (pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF: + dev_config.vf = 1; + break; + default: + break; + } + for (i = 0; i != ns; ++i) { + uint32_t restore; + + list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, + &list[i], + dev_config); + if (!list[i].eth_dev) { + if (rte_errno != EBUSY && rte_errno != EEXIST) + break; + /* Device is disabled or already spawned. Ignore it. */ + continue; + } + restore = list[i].eth_dev->data->dev_flags; + rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); + /* Restore non-PCI flags cleared by the above call. */ + list[i].eth_dev->data->dev_flags |= restore; + mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev); + rte_eth_dev_probing_finish(list[i].eth_dev); + } + if (i != ns) { + DRV_LOG(ERR, + "probe of PCI device " PCI_PRI_FMT " aborted after" + " encountering an error: %s", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function, + strerror(rte_errno)); + ret = -rte_errno; + /* Roll back. */ + while (i--) { + if (!list[i].eth_dev) + continue; + mlx5_dev_close(list[i].eth_dev); + /* mac_addrs must not be freed because it is part of dev_private */ + list[i].eth_dev->data->mac_addrs = NULL; + claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); + } + /* Restore original error. */ + rte_errno = -ret; + } else { + ret = 0; + } +exit: + /* + * Do the routine cleanup: + * - close opened Netlink sockets + * - free allocated spawn data array + * - free the Infiniband device list + */ + if (nl_rdma >= 0) + close(nl_rdma); + if (nl_route >= 0) + close(nl_route); + if (list) + rte_free(list); + MLX5_ASSERT(ibv_list); + mlx5_glue->free_device_list(ibv_list); + return ret; +} + +/** + * Look for the Ethernet device belonging to the mlx5 driver. + * + * @param[in] port_id + * port_id to start looking for the device. + * @param[in] pci_dev + * Pointer to the hint PCI device. When the device is being probed, + * its siblings (master and preceding representors) might not have + * a driver assigned yet (because mlx5_pci_probe() is not completed + * yet); in this case matching on the hint PCI device may be used to + * detect a sibling device. + * + * @return + * port_id of the found device, RTE_MAX_ETHPORTS if not found. + */ +uint16_t +mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev) +{ + while (port_id < RTE_MAX_ETHPORTS) { + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + + if (dev->state != RTE_ETH_DEV_UNUSED && + dev->device && + (dev->device == &pci_dev->device || + (dev->device->driver && + dev->device->driver->name && + !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME)))) + break; + port_id++; + } + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + return port_id; +} + +/** + * DPDK callback to remove a PCI device. + * + * This function removes all Ethernet devices belonging to a given PCI device. + * + * @param[in] pci_dev + * Pointer to the PCI device. + * + * @return + * 0 on success, the function cannot fail. 
+ */ +static int +mlx5_pci_remove(struct rte_pci_device *pci_dev) +{ + uint16_t port_id; + + RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) + rte_eth_dev_close(port_id); + return 0; +} + +static const struct rte_pci_id mlx5_pci_id_map[] = { + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DX) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF) + }, + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF) + }, + { + .vendor_id = 0 + } +}; + +static struct rte_pci_driver mlx5_driver = { + .driver = { + .name = MLX5_DRIVER_NAME + }, + .id_table = mlx5_pci_id_map, + .probe = mlx5_pci_probe, + .remove = mlx5_pci_remove, + .dma_map = mlx5_dma_map, + .dma_unmap = mlx5_dma_unmap, + .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | + RTE_PCI_DRV_PROBE_AGAIN, +}; + +/** + * Driver initialization routine. + */ +RTE_INIT(rte_mlx5_pmd_init) +{ + /* Initialize driver log type. */ + mlx5_logtype = rte_log_register("pmd.net.mlx5"); + if (mlx5_logtype >= 0) + rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); + + /* Build the static tables for Verbs conversion. */ + mlx5_set_ptype_table(); + mlx5_set_cksum_table(); + mlx5_set_swp_types_table(); + if (mlx5_glue) + rte_pci_register(&mlx5_driver); +} + +RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h new file mode 100644 index 000000000..d9f5d816f --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5.h @@ -0,0 +1,848 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_H_ +#define RTE_PMD_MLX5_H_ + +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> +#include <limits.h> +#include <net/if.h> +#include <netinet/in.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_pci.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_rwlock.h> +#include <rte_interrupts.h> +#include <rte_errno.h> +#include <rte_flow.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> +#include <mlx5_nl.h> +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5_autoconf.h" + + +enum mlx5_ipool_index { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + MLX5_IPOOL_DECAP_ENCAP = 0, /* Pool for encap/decap resource. */ + MLX5_IPOOL_PUSH_VLAN, /* Pool for push vlan resource. */ + MLX5_IPOOL_TAG, /* Pool for tag resource. */ + MLX5_IPOOL_PORT_ID, /* Pool for port id resource. */ + MLX5_IPOOL_JUMP, /* Pool for jump resource. */ +#endif + MLX5_IPOOL_MTR, /* Pool for meter resource. */ + MLX5_IPOOL_MCP, /* Pool for metadata resource. */ + MLX5_IPOOL_HRXQ, /* Pool for hrxq resource. */ + MLX5_IPOOL_MLX5_FLOW, /* Pool for mlx5 flow handle. */ + MLX5_IPOOL_RTE_FLOW, /* Pool for rte_flow. */ + MLX5_IPOOL_MAX, +}; + +/** Key string for IPC. */ +#define MLX5_MP_NAME "net_mlx5_mp" + + +LIST_HEAD(mlx5_dev_list, mlx5_ibv_shared); + +/* Shared data between primary and secondary processes. */ +struct mlx5_shared_data { + rte_spinlock_t lock; + /* Global spinlock for primary and secondary processes. */ + int init_done; /* Whether primary has done initialization. */ + unsigned int secondary_cnt; /* Number of secondary processes init'd. */ + struct mlx5_dev_list mem_event_cb_list; + rte_rwlock_t mem_event_rwlock; +}; + +/* Per-process data structure, not visible to other processes. */ +struct mlx5_local_data { + int init_done; /* Whether a secondary has done initialization. */ +}; + +extern struct mlx5_shared_data *mlx5_shared_data; + +struct mlx5_counter_ctrl { + /* Name of the counter. */ + char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; + /* Name of the counter on the device table. */ + char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; + uint32_t ib:1; /**< Nonzero for IB counters. */ +}; + +struct mlx5_xstats_ctrl { + /* Number of device stats. */ + uint16_t stats_n; + /* Number of device stats identified by PMD. */ + uint16_t mlx5_stats_n; + /* Index in the device counters table. */ + uint16_t dev_table_idx[MLX5_MAX_XSTATS]; + uint64_t base[MLX5_MAX_XSTATS]; + uint64_t xstats[MLX5_MAX_XSTATS]; + uint64_t hw_stats[MLX5_MAX_XSTATS]; + struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS]; +}; + +struct mlx5_stats_ctrl { + /* Base for imissed counter. */ + uint64_t imissed_base; + uint64_t imissed; +}; + +/* Default PMD specific parameter value. */ +#define MLX5_ARG_UNSET (-1) + +#define MLX5_LRO_SUPPORTED(dev) \ + (((struct mlx5_priv *)((dev)->data->dev_private))->config.lro.supported) + +/* Maximal size of coalesced segment for LRO is set in chunks of 256 Bytes. */ +#define MLX5_LRO_SEG_CHUNK_SIZE 256u + +/* Maximal size of aggregated LRO packet. */ +#define MLX5_MAX_LRO_SIZE (UINT8_MAX * MLX5_LRO_SEG_CHUNK_SIZE) + +/* LRO configurations structure. */ +struct mlx5_lro_config { + uint32_t supported:1; /* Whether LRO is supported. */ + uint32_t timeout; /* User configuration. */ +}; + +/* + * Device configuration structure. + * + * Merged configuration from: + * + * - Device capabilities, + * - User device parameters disabled features. + */ +struct mlx5_dev_config { + unsigned int hw_csum:1; /* Checksum offload is supported. 
*/ + unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ + unsigned int hw_vlan_insert:1; /* VLAN insertion in WQE is supported. */ + unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ + unsigned int hw_padding:1; /* End alignment padding is supported. */ + unsigned int vf:1; /* This is a VF. */ + unsigned int tunnel_en:1; + /* Whether tunnel stateless offloads are supported. */ + unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ + unsigned int cqe_comp:1; /* CQE compression is enabled. */ + unsigned int cqe_pad:1; /* CQE padding is enabled. */ + unsigned int tso:1; /* Whether TSO is supported. */ + unsigned int rx_vec_en:1; /* Rx vector is enabled. */ + unsigned int mr_ext_memseg_en:1; + /* Whether memseg should be extended for MR creation. */ + unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */ + unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */ + unsigned int dv_esw_en:1; /* Enable E-Switch DV flow. */ + unsigned int dv_flow_en:1; /* Enable DV flow. */ + unsigned int dv_xmeta_en:2; /* Enable extensive flow metadata. */ + unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */ + unsigned int devx:1; /* Whether devx interface is available or not. */ + unsigned int dest_tir:1; /* Whether advanced DR API is available. */ + struct { + unsigned int enabled:1; /* Whether MPRQ is enabled. */ + unsigned int stride_num_n; /* Number of strides. */ + unsigned int stride_size_n; /* Size of a stride. */ + unsigned int min_stride_size_n; /* Min size of a stride. */ + unsigned int max_stride_size_n; /* Max size of a stride. */ + unsigned int max_memcpy_len; + /* Maximum packet size to memcpy Rx packets. */ + unsigned int min_rxqs_num; + /* Rx queue count threshold to enable MPRQ. */ + } mprq; /* Configurations for Multi-Packet RQ. */ + int mps; /* Multi-packet send supported mode. */ + int dbnc; /* Skip doorbell register write barrier. */ + unsigned int flow_prio; /* Number of flow priorities. */ + enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM]; + /* Availibility of mreg_c's. */ + unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */ + unsigned int ind_table_max_size; /* Maximum indirection table size. */ + unsigned int max_dump_files_num; /* Maximum dump files per queue. */ + unsigned int log_hp_size; /* Single hairpin queue data size in total. */ + int txqs_inline; /* Queue number threshold for inlining. */ + int txq_inline_min; /* Minimal amount of data bytes to inline. */ + int txq_inline_max; /* Max packet size for inlining with SEND. */ + int txq_inline_mpw; /* Max packet size for inlining with eMPW. */ + struct mlx5_hca_attr hca_attr; /* HCA attributes. */ + struct mlx5_lro_config lro; /* LRO configuration. */ +}; + + +/** + * Type of object being allocated. + */ +enum mlx5_verbs_alloc_type { + MLX5_VERBS_ALLOC_TYPE_NONE, + MLX5_VERBS_ALLOC_TYPE_TX_QUEUE, + MLX5_VERBS_ALLOC_TYPE_RX_QUEUE, +}; + +/* Structure for VF VLAN workaround. */ +struct mlx5_vf_vlan { + uint32_t tag:12; + uint32_t created:1; +}; + +/** + * Verbs allocator needs a context to know in the callback which kind of + * resources it is allocating. + */ +struct mlx5_verbs_alloc_ctx { + enum mlx5_verbs_alloc_type type; /* Kind of object being allocated. */ + const void *obj; /* Pointer to the DPDK object. */ +}; + +/* Flow drop context necessary due to Verbs API. */ +struct mlx5_drop { + struct mlx5_hrxq *hrxq; /* Hash Rx queue queue. */ + struct mlx5_rxq_obj *rxq; /* Rx queue object. 
*/ +}; + +#define MLX5_COUNTERS_PER_POOL 512 +#define MLX5_MAX_PENDING_QUERIES 4 +#define MLX5_CNT_CONTAINER_RESIZE 64 +#define MLX5_CNT_AGE_OFFSET 0x80000000 +#define CNT_SIZE (sizeof(struct mlx5_flow_counter)) +#define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext)) +#define AGE_SIZE (sizeof(struct mlx5_age_param)) +#define MLX5_AGING_TIME_DELAY 7 +#define CNT_POOL_TYPE_EXT (1 << 0) +#define CNT_POOL_TYPE_AGE (1 << 1) +#define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT) +#define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE) +#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0) +#define MLX5_CNT_LEN(pool) \ + (CNT_SIZE + \ + (IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \ + (IS_EXT_POOL(pool) ? CNTEXT_SIZE : 0)) +#define MLX5_POOL_GET_CNT(pool, index) \ + ((struct mlx5_flow_counter *) \ + ((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool)))) +#define MLX5_CNT_ARRAY_IDX(pool, cnt) \ + ((int)(((uint8_t *)(cnt) - (uint8_t *)((pool) + 1)) / \ + MLX5_CNT_LEN(pool))) +/* + * The pool index and offset of counter in the pool array makes up the + * counter index. In case the counter is from pool 0 and offset 0, it + * should plus 1 to avoid index 0, since 0 means invalid counter index + * currently. + */ +#define MLX5_MAKE_CNT_IDX(pi, offset) \ + ((pi) * MLX5_COUNTERS_PER_POOL + (offset) + 1) +#define MLX5_CNT_TO_CNT_EXT(pool, cnt) \ + ((struct mlx5_flow_counter_ext *)\ + ((uint8_t *)((cnt) + 1) + \ + (IS_AGE_POOL(pool) ? AGE_SIZE : 0))) +#define MLX5_GET_POOL_CNT_EXT(pool, offset) \ + MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset))) +#define MLX5_CNT_TO_AGE(cnt) \ + ((struct mlx5_age_param *)((cnt) + 1)) + +struct mlx5_flow_counter_pool; + +/*age status*/ +enum { + AGE_FREE, /* Initialized state. */ + AGE_CANDIDATE, /* Counter assigned to flows. */ + AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */ +}; + +#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \ + [(batch) * 2 + (age)]) + +enum { + MLX5_CCONT_TYPE_SINGLE, + MLX5_CCONT_TYPE_SINGLE_FOR_AGE, + MLX5_CCONT_TYPE_BATCH, + MLX5_CCONT_TYPE_BATCH_FOR_AGE, + MLX5_CCONT_TYPE_MAX, +}; + +/* Counter age parameter. */ +struct mlx5_age_param { + rte_atomic16_t state; /**< Age state. */ + uint16_t port_id; /**< Port id of the counter. */ + uint32_t timeout:15; /**< Age timeout in unit of 0.1sec. */ + uint32_t expire:16; /**< Expire time(0.1sec) in the future. */ + void *context; /**< Flow counter age context. */ +}; + +struct flow_counter_stats { + uint64_t hits; + uint64_t bytes; +}; + +/* Generic counters information. */ +struct mlx5_flow_counter { + TAILQ_ENTRY(mlx5_flow_counter) next; + /**< Pointer to the next flow counter structure. */ + union { + uint64_t hits; /**< Reset value of hits packets. */ + int64_t query_gen; /**< Generation of the last release. */ + }; + uint64_t bytes; /**< Reset value of bytes. */ + void *action; /**< Pointer to the dv action. */ +}; + +/* Extend counters information for none batch counters. */ +struct mlx5_flow_counter_ext { + uint32_t shared:1; /**< Share counter ID with other flow rules. */ + uint32_t batch: 1; + /**< Whether the counter was allocated by batch command. */ + uint32_t ref_cnt:30; /**< Reference counter. */ + uint32_t id; /**< User counter ID. */ + union { /**< Holds the counters for the rule. */ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_counter_set *cs; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct ibv_counters *cs; +#endif + struct mlx5_devx_obj *dcs; /**< Counter Devx object. 
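A minimal sketch (hypothetical helper, not taken from this header) of how a counter index built by MLX5_MAKE_CNT_IDX() above can be decoded back into a pool index and an in-pool offset; the aging flag is assumed to ride in the MLX5_CNT_AGE_OFFSET bit, as MLX_CNT_IS_AGE() implies:

static inline void
mlx5_example_decode_cnt_idx(uint32_t cnt_idx, uint32_t *pool_idx,
			    uint32_t *offset)
{
	uint32_t idx = cnt_idx & ~MLX5_CNT_AGE_OFFSET; /* Drop the age flag. */

	idx -= 1; /* MLX5_MAKE_CNT_IDX() adds 1 so that 0 stays invalid. */
	*pool_idx = idx / MLX5_COUNTERS_PER_POOL;
	*offset = idx % MLX5_COUNTERS_PER_POOL;
}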
*/ + }; +}; + +TAILQ_HEAD(mlx5_counters, mlx5_flow_counter); + +/* Generic counter pool structure - query is in pool resolution. */ +struct mlx5_flow_counter_pool { + TAILQ_ENTRY(mlx5_flow_counter_pool) next; + struct mlx5_counters counters; /* Free counter list. */ + union { + struct mlx5_devx_obj *min_dcs; + rte_atomic64_t a64_dcs; + }; + /* The devx object of the minimum counter ID. */ + rte_atomic64_t start_query_gen; /* Query start round. */ + rte_atomic64_t end_query_gen; /* Query end round. */ + uint32_t index; /* Pool index in container. */ + uint8_t type; /* Memory type behind the counter array. */ + rte_spinlock_t sl; /* The pool lock. */ + struct mlx5_counter_stats_raw *raw; + struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */ +}; + +struct mlx5_counter_stats_raw; + +/* Memory management structure for group of counter statistics raws. */ +struct mlx5_counter_stats_mem_mng { + LIST_ENTRY(mlx5_counter_stats_mem_mng) next; + struct mlx5_counter_stats_raw *raws; + struct mlx5_devx_obj *dm; + struct mlx5dv_devx_umem *umem; +}; + +/* Raw memory structure for the counter statistics values of a pool. */ +struct mlx5_counter_stats_raw { + LIST_ENTRY(mlx5_counter_stats_raw) next; + int min_dcs_id; + struct mlx5_counter_stats_mem_mng *mem_mng; + volatile struct flow_counter_stats *data; +}; + +TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool); + +/* Container structure for counter pools. */ +struct mlx5_pools_container { + rte_atomic16_t n_valid; /* Number of valid pools. */ + uint16_t n; /* Number of pools. */ + rte_spinlock_t resize_sl; /* The resize lock. */ + struct mlx5_counter_pools pool_list; /* Counter pool list. */ + struct mlx5_flow_counter_pool **pools; /* Counter pool array. */ + struct mlx5_counter_stats_mem_mng *mem_mng; + /* Hold the memory management for the next allocated pools raws. */ +}; + +/* Counter global management structure. */ +struct mlx5_flow_counter_mng { + struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX]; + struct mlx5_counters flow_counters; /* Legacy flow counter list. */ + uint8_t pending_queries; + uint8_t batch; + uint16_t pool_index; + uint8_t age; + uint8_t query_thread_on; + LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs; + LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws; +}; + +#define MLX5_AGE_EVENT_NEW 1 +#define MLX5_AGE_TRIGGER 2 +#define MLX5_AGE_SET(age_info, BIT) \ + ((age_info)->flags |= (1 << (BIT))) +#define MLX5_AGE_GET(age_info, BIT) \ + ((age_info)->flags & (1 << (BIT))) +#define GET_PORT_AGE_INFO(priv) \ + (&((priv)->sh->port[(priv)->ibv_port - 1].age_info)) + +/* Aging information for per port. */ +struct mlx5_age_info { + uint8_t flags; /*Indicate if is new event or need be trigered*/ + struct mlx5_counters aged_counters; /* Aged flow counter list. */ + rte_spinlock_t aged_sl; /* Aged flow counter list lock. */ +}; + +/* Per port data of shared IB device. */ +struct mlx5_ibv_shared_port { + uint32_t ih_port_id; + uint32_t devx_ih_port_id; + /* + * Interrupt handler port_id. Used by shared interrupt + * handler to find the corresponding rte_eth device + * by IB port index. If value is equal or greater + * RTE_MAX_ETHPORTS it means there is no subhandler + * installed for specified IB port index. + */ + struct mlx5_age_info age_info; + /* Aging information for per port. */ +}; + +/* Table key of the hash organization. */ +union mlx5_flow_tbl_key { + struct { + /* Table ID should be at the lowest address. */ + uint32_t table_id; /**< ID of the table. 
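To make the container selection above concrete, MLX5_CNT_CONTAINER() picks slot batch * 2 + age of mlx5_flow_counter_mng::ccont[], which matches the MLX5_CCONT_TYPE_* ordering (worked illustration only):

/*
 *   (batch 0, age 0) -> ccont[0]  MLX5_CCONT_TYPE_SINGLE
 *   (batch 0, age 1) -> ccont[1]  MLX5_CCONT_TYPE_SINGLE_FOR_AGE
 *   (batch 1, age 0) -> ccont[2]  MLX5_CCONT_TYPE_BATCH
 *   (batch 1, age 1) -> ccont[3]  MLX5_CCONT_TYPE_BATCH_FOR_AGE
 * e.g. MLX5_CNT_CONTAINER(sh, 1, 0) yields the batch, non-aging container.
 */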
*/ + uint16_t reserved; /**< must be zero for comparison. */ + uint8_t domain; /**< 1 - FDB, 0 - NIC TX/RX. */ + uint8_t direction; /**< 1 - egress, 0 - ingress. */ + }; + uint64_t v64; /**< full 64bits value of key */ +}; + +/* Table structure. */ +struct mlx5_flow_tbl_resource { + void *obj; /**< Pointer to DR table object. */ + rte_atomic32_t refcnt; /**< Reference counter. */ +}; + +#define MLX5_MAX_TABLES UINT16_MAX +#define MLX5_FLOW_TABLE_LEVEL_METER (UINT16_MAX - 3) +#define MLX5_FLOW_TABLE_LEVEL_SUFFIX (UINT16_MAX - 2) +#define MLX5_HAIRPIN_TX_TABLE (UINT16_MAX - 1) +/* Reserve the last two tables for metadata register copy. */ +#define MLX5_FLOW_MREG_ACT_TABLE_GROUP (MLX5_MAX_TABLES - 1) +#define MLX5_FLOW_MREG_CP_TABLE_GROUP (MLX5_MAX_TABLES - 2) +/* Tables for metering splits should be added here. */ +#define MLX5_MAX_TABLES_EXTERNAL (MLX5_MAX_TABLES - 3) +#define MLX5_MAX_TABLES_FDB UINT16_MAX + +#define MLX5_DBR_PAGE_SIZE 4096 /* Must be >= 512. */ +#define MLX5_DBR_SIZE 8 +#define MLX5_DBR_PER_PAGE (MLX5_DBR_PAGE_SIZE / MLX5_DBR_SIZE) +#define MLX5_DBR_BITMAP_SIZE (MLX5_DBR_PER_PAGE / 64) + +struct mlx5_devx_dbr_page { + /* Door-bell records, must be first member in structure. */ + uint8_t dbrs[MLX5_DBR_PAGE_SIZE]; + LIST_ENTRY(mlx5_devx_dbr_page) next; /* Pointer to the next element. */ + struct mlx5dv_devx_umem *umem; + uint32_t dbr_count; /* Number of door-bell records in use. */ + /* 1 bit marks matching door-bell is in use. */ + uint64_t dbr_bitmap[MLX5_DBR_BITMAP_SIZE]; +}; + +/* ID generation structure. */ +struct mlx5_flow_id_pool { + uint32_t *free_arr; /**< Pointer to the a array of free values. */ + uint32_t base_index; + /**< The next index that can be used without any free elements. */ + uint32_t *curr; /**< Pointer to the index to pop. */ + uint32_t *last; /**< Pointer to the last element in the empty arrray. */ + uint32_t max_id; /**< Maximum id can be allocated from the pool. */ +}; + +/* + * Shared Infiniband device context for Master/Representors + * which belong to same IB device with multiple IB ports. + **/ +struct mlx5_ibv_shared { + LIST_ENTRY(mlx5_ibv_shared) next; + uint32_t refcnt; + uint32_t devx:1; /* Opened with DV. */ + uint32_t max_port; /* Maximal IB device port index. */ + struct ibv_context *ctx; /* Verbs/DV context. */ + struct ibv_pd *pd; /* Protection Domain. */ + uint32_t pdn; /* Protection Domain number. */ + uint32_t tdn; /* Transport Domain number. */ + char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ + struct ibv_device_attr_ex device_attr; /* Device properties. */ + LIST_ENTRY(mlx5_ibv_shared) mem_event_cb; + /**< Called by memory event callback. */ + struct mlx5_mr_share_cache share_cache; + /* Shared DV/DR flow data section. */ + pthread_mutex_t dv_mutex; /* DV context mutex. */ + uint32_t dv_meta_mask; /* flow META metadata supported mask. */ + uint32_t dv_mark_mask; /* flow MARK metadata supported mask. */ + uint32_t dv_regc0_mask; /* available bits of metatada reg_c[0]. */ + uint32_t dv_refcnt; /* DV/DR data reference counter. */ + void *fdb_domain; /* FDB Direct Rules name space handle. */ + void *rx_domain; /* RX Direct Rules name space handle. */ + void *tx_domain; /* TX Direct Rules name space handle. */ + struct mlx5_hlist *flow_tbls; + /* Direct Rules tables for FDB, NIC TX+RX */ + void *esw_drop_action; /* Pointer to DR E-Switch drop action. */ + void *pop_vlan_action; /* Pointer to DR pop VLAN action. 
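With the door-bell constants above, one page carries MLX5_DBR_PAGE_SIZE / MLX5_DBR_SIZE = 4096 / 8 = 512 records tracked by a 512 / 64 = 8-word bitmap. A hypothetical helper, assuming record i maps to bit i % 64 of word i / 64 as the dbr_bitmap comment suggests, could test one slot like this:

static inline int
mlx5_example_dbr_busy(const struct mlx5_devx_dbr_page *page, unsigned int i)
{
	return (page->dbr_bitmap[i / 64] >> (i % 64)) & 1;
}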
*/ + uint32_t encaps_decaps; /* Encap/decap action indexed memory list. */ + LIST_HEAD(modify_cmd, mlx5_flow_dv_modify_hdr_resource) modify_cmds; + struct mlx5_hlist *tag_table; + uint32_t port_id_action_list; /* List of port ID actions. */ + uint32_t push_vlan_action_list; /* List of push VLAN actions. */ + struct mlx5_flow_counter_mng cmng; /* Counters management structure. */ + struct mlx5_indexed_pool *ipool[MLX5_IPOOL_MAX]; + /* Memory Pool for mlx5 flow resources. */ + /* Shared interrupt handler section. */ + pthread_mutex_t intr_mutex; /* Interrupt config mutex. */ + uint32_t intr_cnt; /* Interrupt handler reference counter. */ + struct rte_intr_handle intr_handle; /* Interrupt handler for device. */ + uint32_t devx_intr_cnt; /* Devx interrupt handler reference counter. */ + struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */ + struct mlx5dv_devx_cmd_comp *devx_comp; /* DEVX async comp obj. */ + struct mlx5_devx_obj *tis; /* TIS object. */ + struct mlx5_devx_obj *td; /* Transport domain. */ + struct mlx5_flow_id_pool *flow_id_pool; /* Flow ID pool. */ + struct mlx5_ibv_shared_port port[]; /* per device port data array. */ +}; + +/* Per-process private structure. */ +struct mlx5_proc_priv { + size_t uar_table_sz; + /* Size of UAR register table. */ + void *uar_table[]; + /* Table of UAR registers for each process. */ +}; + +/* MTR profile list. */ +TAILQ_HEAD(mlx5_mtr_profiles, mlx5_flow_meter_profile); +/* MTR list. */ +TAILQ_HEAD(mlx5_flow_meters, mlx5_flow_meter); + +#define MLX5_PROC_PRIV(port_id) \ + ((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private) + +struct mlx5_priv { + struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ + struct mlx5_ibv_shared *sh; /* Shared IB device context. */ + uint32_t ibv_port; /* IB device port number. */ + struct rte_pci_device *pci_dev; /* Backend PCI device. */ + struct rte_ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */ + BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES); + /* Bit-field of MAC addresses owned by the PMD. */ + uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */ + unsigned int vlan_filter_n; /* Number of configured VLAN filters. */ + /* Device properties. */ + uint16_t mtu; /* Configured MTU. */ + unsigned int isolated:1; /* Whether isolated mode is enabled. */ + unsigned int representor:1; /* Device is a port representor. */ + unsigned int master:1; /* Device is a E-Switch master. */ + unsigned int dr_shared:1; /* DV/DR data is shared. */ + unsigned int counter_fallback:1; /* Use counter fallback management. */ + unsigned int mtr_en:1; /* Whether support meter. */ + unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */ + uint16_t domain_id; /* Switch domain identifier. */ + uint16_t vport_id; /* Associated VF vport index (if any). */ + uint32_t vport_meta_tag; /* Used for vport index match ove VF LAG. */ + uint32_t vport_meta_mask; /* Used for vport index field match mask. */ + int32_t representor_id; /* Port representor identifier. */ + int32_t pf_bond; /* >=0 means PF index in bonding configuration. */ + unsigned int if_index; /* Associated kernel network device index. */ + /* RX/TX queues. */ + unsigned int rxqs_n; /* RX queues array size. */ + unsigned int txqs_n; /* TX queues array size. */ + struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */ + struct mlx5_txq_data *(*txqs)[]; /* TX queues. */ + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct rte_eth_rss_conf rss_conf; /* RSS configuration. 
*/ + unsigned int (*reta_idx)[]; /* RETA index table. */ + unsigned int reta_idx_n; /* RETA index size. */ + struct mlx5_drop drop_queue; /* Flow drop queues. */ + uint32_t flows; /* RTE Flow rules. */ + uint32_t ctrl_flows; /* Control flow rules. */ + void *inter_flows; /* Intermediate resources for flow creation. */ + void *rss_desc; /* Intermediate rss description resources. */ + int flow_idx; /* Intermediate device flow index. */ + int flow_nested_idx; /* Intermediate device flow index, nested. */ + LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ + LIST_HEAD(rxqobj, mlx5_rxq_obj) rxqsobj; /* Verbs/DevX Rx queues. */ + uint32_t hrxqs; /* Verbs Hash Rx queues. */ + LIST_HEAD(txq, mlx5_txq_ctrl) txqsctrl; /* DPDK Tx queues. */ + LIST_HEAD(txqobj, mlx5_txq_obj) txqsobj; /* Verbs/DevX Tx queues. */ + /* Indirection tables. */ + LIST_HEAD(ind_tables, mlx5_ind_table_obj) ind_tbls; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + struct ibv_flow_action *verbs_action; + /**< Verbs modify header action object. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + uint8_t max_lro_msg_size; + /* Tags resources cache. */ + uint32_t link_speed_capa; /* Link speed capabilities. */ + struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ + struct mlx5_stats_ctrl stats_ctrl; /* Stats control. */ + struct mlx5_dev_config config; /* Device configuration. */ + struct mlx5_verbs_alloc_ctx verbs_alloc_ctx; + /* Context for Verbs allocator. */ + int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */ + int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */ + LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */ + struct mlx5_nl_vlan_vmwa_context *vmwa_context; /* VLAN WA context. */ + struct mlx5_flow_id_pool *qrss_id_pool; + struct mlx5_hlist *mreg_cp_tbl; + /* Hash table of Rx metadata register copy table. */ + uint8_t mtr_sfx_reg; /* Meter prefix-suffix flow match REG_C. */ + uint8_t mtr_color_reg; /* Meter color match REG_C. */ + struct mlx5_mtr_profiles flow_meter_profiles; /* MTR profile list. */ + struct mlx5_flow_meters flow_meters; /* MTR list. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */ + rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; + /* UAR same-page access control required in 32bit implementations. */ +#endif + uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */ + uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */ + struct mlx5_mp_id mp_id; /* ID of a multi-process process */ + LIST_HEAD(fdir, mlx5_fdir_flow) fdir_flows; /* fdir flows. */ +}; + +#define PORT_ID(priv) ((priv)->dev_data->port_id) +#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)]) + +/* mlx5.c */ + +int mlx5_getenv_int(const char *); +int mlx5_proc_priv_init(struct rte_eth_dev *dev); +int64_t mlx5_get_dbr(struct rte_eth_dev *dev, + struct mlx5_devx_dbr_page **dbr_page); +int32_t mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, + uint64_t offset); +int mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev, + struct rte_eth_udp_tunnel *udp_tunnel); +uint16_t mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev); + +/* Macro to iterate over all valid ports for mlx5 driver. 
*/ +#define MLX5_ETH_FOREACH_DEV(port_id, pci_dev) \ + for (port_id = mlx5_eth_find_next(0, pci_dev); \ + port_id < RTE_MAX_ETHPORTS; \ + port_id = mlx5_eth_find_next(port_id + 1, pci_dev)) + +/* mlx5_ethdev.c */ + +int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]); +int mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]); +unsigned int mlx5_ifindex(const struct rte_eth_dev *dev); +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr); +int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu); +int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, + unsigned int flags); +int mlx5_dev_configure(struct rte_eth_dev *dev); +int mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info); +int mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock); +int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size); +const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev); +int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status); +int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu); +int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +void mlx5_dev_link_status_handler(void *arg); +void mlx5_dev_interrupt_handler(void *arg); +void mlx5_dev_interrupt_handler_devx(void *arg); +void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev); +int mlx5_set_link_down(struct rte_eth_dev *dev); +int mlx5_set_link_up(struct rte_eth_dev *dev); +int mlx5_is_removed(struct rte_eth_dev *dev); +eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev); +eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev); +struct mlx5_priv *mlx5_port_to_eswitch_info(uint16_t port, bool valid); +struct mlx5_priv *mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev); +int mlx5_sysfs_switch_info(unsigned int ifindex, + struct mlx5_switch_info *info); +void mlx5_sysfs_check_switch_info(bool device_dir, + struct mlx5_switch_info *switch_info); +void mlx5_translate_port_name(const char *port_name_in, + struct mlx5_switch_info *port_info_out); +void mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb_fn, void *cb_arg); +int mlx5_get_module_info(struct rte_eth_dev *dev, + struct rte_eth_dev_module_info *modinfo); +int mlx5_get_module_eeprom(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info); +int mlx5_hairpin_cap_get(struct rte_eth_dev *dev, + struct rte_eth_hairpin_cap *cap); +int mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev); + +/* mlx5_mac.c */ + +int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]); +void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index, uint32_t vmdq); +struct mlx5_nl_vlan_vmwa_context *mlx5_vlan_vmwa_init + (struct rte_eth_dev *dev, uint32_t ifindex); +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr); +int mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct rte_ether_addr *mc_addr_set, + uint32_t nb_mc_addr); + +/* 
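A usage sketch for the MLX5_ETH_FOREACH_DEV() iterator defined above; pci_dev is a hypothetical struct rte_pci_device pointer and the loop visits every ethdev port this PMD instantiated for that device:

uint16_t port_id;

MLX5_ETH_FOREACH_DEV(port_id, pci_dev) {
	struct mlx5_priv *p = rte_eth_devices[port_id].data->dev_private;

	/* 'p' is the mlx5 private data of one port bound to pci_dev. */
	(void)p;
}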
mlx5_rss.c */ + +int mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size); +int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); + +/* mlx5_rxmode.c */ + +int mlx5_promiscuous_enable(struct rte_eth_dev *dev); +int mlx5_promiscuous_disable(struct rte_eth_dev *dev); +int mlx5_allmulticast_enable(struct rte_eth_dev *dev); +int mlx5_allmulticast_disable(struct rte_eth_dev *dev); + +/* mlx5_stats.c */ + +void mlx5_stats_init(struct rte_eth_dev *dev); +int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +int mlx5_stats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n); +int mlx5_xstats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, + unsigned int n); + +/* mlx5_vlan.c */ + +int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on); +int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask); +void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *ctx); +void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vf_vlan); +void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vf_vlan); + +/* mlx5_trigger.c */ + +int mlx5_dev_start(struct rte_eth_dev *dev); +void mlx5_dev_stop(struct rte_eth_dev *dev); +int mlx5_traffic_enable(struct rte_eth_dev *dev); +void mlx5_traffic_disable(struct rte_eth_dev *dev); +int mlx5_traffic_restart(struct rte_eth_dev *dev); + +/* mlx5_flow.c */ + +int mlx5_flow_discover_mreg_c(struct rte_eth_dev *eth_dev); +bool mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev); +int mlx5_flow_discover_priorities(struct rte_eth_dev *dev); +void mlx5_flow_print(struct rte_flow *flow); +int mlx5_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +void mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active); +int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); +int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow, + const struct rte_flow_action *action, void *data, + struct rte_flow_error *error); +int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable, + struct rte_flow_error *error); +int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +int mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list); +void mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list); +int mlx5_flow_start_default(struct rte_eth_dev *dev); +void mlx5_flow_stop_default(struct rte_eth_dev *dev); +void 
mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev); +void mlx5_flow_free_intermediate(struct rte_eth_dev *dev); +int mlx5_flow_verify(struct rte_eth_dev *dev); +int mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, uint32_t queue); +int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask); +int mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask); +struct rte_flow *mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev); +int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + uint64_t async_id, int status); +void mlx5_set_query_alarm(struct mlx5_ibv_shared *sh); +void mlx5_flow_query_alarm(void *arg); +uint32_t mlx5_counter_alloc(struct rte_eth_dev *dev); +void mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt); +int mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt, + bool clear, uint64_t *pkts, uint64_t *bytes); +int mlx5_flow_dev_dump(struct rte_eth_dev *dev, FILE *file, + struct rte_flow_error *error); +void mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev); +int mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts, + uint32_t nb_contexts, struct rte_flow_error *error); + +/* mlx5_mp.c */ +int mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer); +int mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer); +void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev); +void mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev); + +/* mlx5_socket.c */ + +int mlx5_pmd_socket_init(void); + +/* mlx5_flow_meter.c */ + +int mlx5_flow_meter_ops_get(struct rte_eth_dev *dev, void *arg); +struct mlx5_flow_meter *mlx5_flow_meter_find(struct mlx5_priv *priv, + uint32_t meter_id); +struct mlx5_flow_meter *mlx5_flow_meter_attach + (struct mlx5_priv *priv, + uint32_t meter_id, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +void mlx5_flow_meter_detach(struct mlx5_flow_meter *fm); + +#endif /* RTE_PMD_MLX5_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h new file mode 100644 index 000000000..260f58429 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_defs.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_DEFS_H_ +#define RTE_PMD_MLX5_DEFS_H_ + +#include <rte_ethdev_driver.h> +#include <rte_vxlan.h> + +#include "mlx5_autoconf.h" + +/* Reported driver name. */ +#define MLX5_DRIVER_NAME "net_mlx5" + +/* Maximum number of simultaneous VLAN filters. */ +#define MLX5_MAX_VLAN_IDS 128 + +/* + * Request TX completion every time descriptors reach this threshold since + * the previous request. Must be a power of two for performance reasons. + */ +#define MLX5_TX_COMP_THRESH 32u + +/* + * Request TX completion every time the total number of WQEBBs used for inlining + * packets exceeds the size of WQ divided by this divisor. Better to be power of + * two for performance. + */ +#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) + +/* + * Maximal amount of normal completion CQEs + * processed in one call of tx_burst() routine. 
+ */ +#define MLX5_TX_COMP_MAX_CQE 2u + + +/* Size of per-queue MR cache array for linear search. */ +#define MLX5_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX5_MR_BTREE_CACHE_N 256 + +/* + * If defined, only use software counters. The PMD will never ask the hardware + * for these, and many of them won't be available. + */ +#ifndef MLX5_PMD_SOFT_COUNTERS +#define MLX5_PMD_SOFT_COUNTERS 1 +#endif + +/* Switch port ID parameters for bonding configurations. */ +#define MLX5_PORT_ID_BONDING_PF_MASK 0xf +#define MLX5_PORT_ID_BONDING_PF_SHIFT 0xf + +/* Alarm timeout. */ +#define MLX5_ALARM_TIMEOUT_US 100000 + +/* Maximum number of extended statistics counters. */ +#define MLX5_MAX_XSTATS 32 + +/* Maximum Packet headers size (L2+L3+L4) for TSO. */ +#define MLX5_MAX_TSO_HEADER (128u + 34u) + +/* Inline data size required by NICs. */ +#define MLX5_INLINE_HSIZE_NONE 0 +#define MLX5_INLINE_HSIZE_L2 (sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr)) +#define MLX5_INLINE_HSIZE_L3 (MLX5_INLINE_HSIZE_L2 + \ + sizeof(struct rte_ipv6_hdr)) +#define MLX5_INLINE_HSIZE_L4 (MLX5_INLINE_HSIZE_L3 + \ + sizeof(struct rte_tcp_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L2 (MLX5_INLINE_HSIZE_L3 + \ + sizeof(struct rte_udp_hdr) + \ + sizeof(struct rte_vxlan_hdr) + \ + sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L3 (MLX5_INLINE_HSIZE_INNER_L2 + \ + sizeof(struct rte_ipv6_hdr)) +#define MLX5_INLINE_HSIZE_INNER_L4 (MLX5_INLINE_HSIZE_INNER_L3 + \ + sizeof(struct rte_tcp_hdr)) + +/* Threshold of buffer replenishment for vectorized Rx. */ +#define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \ + (RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2)) + +/* Maximum size of burst for vectorized Rx. */ +#define MLX5_VPMD_RX_MAX_BURST 64U + +/* Recommended optimal burst size. */ +#define MLX5_RX_DEFAULT_BURST 64U +#define MLX5_TX_DEFAULT_BURST 64U + +/* Number of packets vectorized Rx can simultaneously process in a loop. */ +#define MLX5_VPMD_DESCS_PER_LOOP 4 + +/* Mask of RSS on source only or destination only. */ +#define MLX5_RSS_SRC_DST_ONLY (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY | \ + ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY) + +/* Supported RSS */ +#define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \ + MLX5_RSS_SRC_DST_ONLY)) + +/* Timeout in seconds to get a valid link status. */ +#define MLX5_LINK_STATUS_TIMEOUT 10 + +/* Number of times to retry retrieving the physical link information. */ +#define MLX5_GET_LINK_STATUS_RETRY_COUNT 3 + +/* Maximum number of UAR pages used by a port, + * These are the size and mask for an array of mutexes used to synchronize + * the access to port's UARs on platforms that do not support 64 bit writes. + * In such systems it is possible to issue the 64 bits DoorBells through two + * consecutive writes, each write 32 bits. The access to a UAR page (which can + * be accessible by all threads in the process) must be synchronized + * (for example, using a semaphore). Such a synchronization is not required + * when ringing DoorBells on different UAR pages. + * A port with 512 Tx queues uses 8, 4kBytes, UAR pages which are shared + * among the ports. + */ +#define MLX5_UAR_PAGE_NUM_MAX 64 +#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1) + +/* Fields of memory mapping type in offset parameter of mmap() */ +#define MLX5_UAR_MMAP_CMD_SHIFT 8 +#define MLX5_UAR_MMAP_CMD_MASK 0xff + +/* Environment variable to control the doorbell register mapping. 
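Substituting the usual DPDK header sizes (Ethernet 14 B, VLAN 4 B, IPv6 40 B, TCP 20 B, UDP 8 B, VXLAN 8 B), the MLX5_INLINE_HSIZE_* macros above work out as follows; shown only as a worked reference:

/*
 * MLX5_INLINE_HSIZE_L2       = 14 + 4              =  18 B
 * MLX5_INLINE_HSIZE_L3       = 18 + 40             =  58 B
 * MLX5_INLINE_HSIZE_L4       = 58 + 20             =  78 B
 * MLX5_INLINE_HSIZE_INNER_L2 = 58 + 8 + 8 + 14 + 4 =  92 B
 * MLX5_INLINE_HSIZE_INNER_L3 = 92 + 40             = 132 B
 * MLX5_INLINE_HSIZE_INNER_L4 = 132 + 20            = 152 B
 */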
*/ +#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF" +#if defined(RTE_ARCH_ARM64) +#define MLX5_SHUT_UP_BF_DEFAULT "0" +#else +#define MLX5_SHUT_UP_BF_DEFAULT "1" +#endif + +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD +#define MLX5_MMAP_GET_NC_PAGES_CMD 3 +#endif + +/* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_NUM_N 6U + +/* Log 2 of the default size of a stride per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_SIZE_N 11U + +/* Two-byte shift is disabled for Multi-Packet RQ. */ +#define MLX5_MPRQ_TWO_BYTE_SHIFT 0 + +/* + * Minimum size of packet to be memcpy'd instead of being attached as an + * external buffer. + */ +#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128 + +/* Minimum number Rx queues to enable Multi-Packet RQ. */ +#define MLX5_MPRQ_MIN_RXQS 12 + +/* Cache size of mempool for Multi-Packet RQ. */ +#define MLX5_MPRQ_MP_CACHE_SZ 32U + +/* MLX5_DV_XMETA_EN supported values. */ +#define MLX5_XMETA_MODE_LEGACY 0 +#define MLX5_XMETA_MODE_META16 1 +#define MLX5_XMETA_MODE_META32 2 + +/* MLX5_TX_DB_NC supported values. */ +#define MLX5_TXDB_CACHED 0 +#define MLX5_TXDB_NCACHED 1 +#define MLX5_TXDB_HEURISTIC 2 + +/* Size of the simple hash table for metadata register table. */ +#define MLX5_FLOW_MREG_HTABLE_SZ 4096 +#define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE" +#define MLX5_DEFAULT_COPY_ID UINT32_MAX + +/* Hairpin TX/RX queue configuration parameters. */ +#define MLX5_HAIRPIN_QUEUE_STRIDE 6 +#define MLX5_HAIRPIN_JUMBO_LOG_SIZE (14 + 2) + +/* Definition of static_assert found in /usr/include/assert.h */ +#ifndef HAVE_STATIC_ASSERT +#define static_assert _Static_assert +#endif + +#endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c new file mode 100644 index 000000000..47f11b963 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_ethdev.c @@ -0,0 +1,2071 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. 
+ * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <inttypes.h> +#include <unistd.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <dirent.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <fcntl.h> +#include <stdalign.h> +#include <sys/un.h> +#include <time.h> + +#include <rte_atomic.h> +#include <rte_ethdev_driver.h> +#include <rte_bus_pci.h> +#include <rte_mbuf.h> +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_malloc.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> +#include <rte_cycles.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/* Supported speed values found in /usr/include/linux/ethtool.h */ +#ifndef HAVE_SUPPORTED_40000baseKR4_Full +#define SUPPORTED_40000baseKR4_Full (1 << 23) +#endif +#ifndef HAVE_SUPPORTED_40000baseCR4_Full +#define SUPPORTED_40000baseCR4_Full (1 << 24) +#endif +#ifndef HAVE_SUPPORTED_40000baseSR4_Full +#define SUPPORTED_40000baseSR4_Full (1 << 25) +#endif +#ifndef HAVE_SUPPORTED_40000baseLR4_Full +#define SUPPORTED_40000baseLR4_Full (1 << 26) +#endif +#ifndef HAVE_SUPPORTED_56000baseKR4_Full +#define SUPPORTED_56000baseKR4_Full (1 << 27) +#endif +#ifndef HAVE_SUPPORTED_56000baseCR4_Full +#define SUPPORTED_56000baseCR4_Full (1 << 28) +#endif +#ifndef HAVE_SUPPORTED_56000baseSR4_Full +#define SUPPORTED_56000baseSR4_Full (1 << 29) +#endif +#ifndef HAVE_SUPPORTED_56000baseLR4_Full +#define SUPPORTED_56000baseLR4_Full (1 << 30) +#endif + +/* Add defines in case the running kernel is not the same as user headers. 
*/ +#ifndef ETHTOOL_GLINKSETTINGS +struct ethtool_link_settings { + uint32_t cmd; + uint32_t speed; + uint8_t duplex; + uint8_t port; + uint8_t phy_address; + uint8_t autoneg; + uint8_t mdio_support; + uint8_t eth_to_mdix; + uint8_t eth_tp_mdix_ctrl; + int8_t link_mode_masks_nwords; + uint32_t reserved[8]; + uint32_t link_mode_masks[]; +}; + +/* The kernel values can be found in /include/uapi/linux/ethtool.h */ +#define ETHTOOL_GLINKSETTINGS 0x0000004c +#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5 +#define ETHTOOL_LINK_MODE_Autoneg_BIT 6 +#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17 +#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18 +#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19 +#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20 +#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21 +#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22 +#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23 +#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24 +#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25 +#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26 +#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27 +#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28 +#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29 +#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_25G +#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31 +#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32 +#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_50G +#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34 +#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_100G +#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36 +#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37 +#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 +#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 +#endif +#ifndef HAVE_ETHTOOL_LINK_MODE_200G +#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62 +#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63 +#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */ +#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */ +#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */ +#endif + +/** + * Get master interface name from private structure. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE]) +{ + DIR *dir; + struct dirent *dent; + unsigned int dev_type = 0; + unsigned int dev_port_prev = ~0u; + char match[IF_NAMESIZE] = ""; + + MLX5_ASSERT(ibdev_path); + { + MKSTR(path, "%s/device/net", ibdev_path); + + dir = opendir(path); + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } + } + while ((dent = readdir(dir)) != NULL) { + char *name = dent->d_name; + FILE *file; + unsigned int dev_port; + int r; + + if ((name[0] == '.') && + ((name[1] == '\0') || + ((name[1] == '.') && (name[2] == '\0')))) + continue; + + MKSTR(path, "%s/device/net/%s/%s", + ibdev_path, name, + (dev_type ? "dev_id" : "dev_port")); + + file = fopen(path, "rb"); + if (file == NULL) { + if (errno != ENOENT) + continue; + /* + * Switch to dev_id when dev_port does not exist as + * is the case with Linux kernel versions < 3.15. 
+ */ +try_dev_id: + match[0] = '\0'; + if (dev_type) + break; + dev_type = 1; + dev_port_prev = ~0u; + rewinddir(dir); + continue; + } + r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); + fclose(file); + if (r != 1) + continue; + /* + * Switch to dev_id when dev_port returns the same value for + * all ports. May happen when using a MOFED release older than + * 3.0 with a Linux kernel >= 3.15. + */ + if (dev_port == dev_port_prev) + goto try_dev_id; + dev_port_prev = dev_port; + if (dev_port == 0) + strlcpy(match, name, sizeof(match)); + } + closedir(dir); + if (match[0] == '\0') { + rte_errno = ENOENT; + return -rte_errno; + } + strncpy(*ifname, match, sizeof(*ifname)); + return 0; +} + +/** + * Get interface name from private structure. + * + * This is a port representor-aware version of mlx5_get_master_ifname(). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int ifindex; + + MLX5_ASSERT(priv); + MLX5_ASSERT(priv->sh); + ifindex = mlx5_ifindex(dev); + if (!ifindex) { + if (!priv->representor) + return mlx5_get_master_ifname(priv->sh->ibdev_path, + ifname); + rte_errno = ENXIO; + return -rte_errno; + } + if (if_indextoname(ifindex, &(*ifname)[0])) + return 0; + rte_errno = errno; + return -rte_errno; +} + +/** + * Get the interface index from device name. + * + * @param[in] dev + * Pointer to Ethernet device. + * + * @return + * Nonzero interface index on success, zero otherwise and rte_errno is set. + */ +unsigned int +mlx5_ifindex(const struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int ifindex; + + MLX5_ASSERT(priv); + MLX5_ASSERT(priv->if_index); + ifindex = priv->if_index; + if (!ifindex) + rte_errno = ENXIO; + return ifindex; +} + +/** + * Perform ifreq ioctl() on associated Ethernet device. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param req + * Request number to pass to ioctl(). + * @param[out] ifr + * Interface request structure output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) +{ + int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + int ret = 0; + + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = mlx5_get_ifname(dev, &ifr->ifr_name); + if (ret) + goto error; + ret = ioctl(sock, req, ifr); + if (ret == -1) { + rte_errno = errno; + goto error; + } + close(sock); + return 0; +error: + close(sock); + return -rte_errno; +} + +/** + * Get device MTU. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] mtu + * MTU value output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) +{ + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); + + if (ret) + return ret; + *mtu = request.ifr_mtu; + return 0; +} + +/** + * Set device MTU. + * + * @param dev + * Pointer to Ethernet device. + * @param mtu + * MTU value to set. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct ifreq request = { .ifr_mtu = mtu, }; + + return mlx5_ifreq(dev, SIOCSIFMTU, &request); +} + +/** + * Set device flags. + * + * @param dev + * Pointer to Ethernet device. + * @param keep + * Bitmask for flags that must remain untouched. + * @param flags + * Bitmask for flags to modify. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) +{ + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); + + if (ret) + return ret; + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); +} + +/** + * DPDK callback for Ethernet device configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_configure(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int rxqs_n = dev->data->nb_rx_queues; + unsigned int txqs_n = dev->data->nb_tx_queues; + const uint8_t use_app_rss_key = + !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; + int ret = 0; + + if (use_app_rss_key && + (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != + MLX5_RSS_HASH_KEY_LEN)) { + DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", + dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; + } + priv->rss_conf.rss_key = + rte_realloc(priv->rss_conf.rss_key, + MLX5_RSS_HASH_KEY_LEN, 0); + if (!priv->rss_conf.rss_key) { + DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; + } + + if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) + dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH; + + memcpy(priv->rss_conf.rss_key, + use_app_rss_key ? + dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : + rss_hash_default_key, + MLX5_RSS_HASH_KEY_LEN); + priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; + priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; + priv->rxqs = (void *)dev->data->rx_queues; + priv->txqs = (void *)dev->data->tx_queues; + if (txqs_n != priv->txqs_n) { + DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", + dev->data->port_id, priv->txqs_n, txqs_n); + priv->txqs_n = txqs_n; + } + if (rxqs_n > priv->config.ind_table_max_size) { + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rxqs_n); + rte_errno = EINVAL; + return -rte_errno; + } + if (rxqs_n != priv->rxqs_n) { + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); + priv->rxqs_n = rxqs_n; + } + priv->skip_default_rss_reta = 0; + ret = mlx5_proc_priv_init(dev); + if (ret) + return ret; + return 0; +} + +/** + * Configure default RSS reta. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
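 *
 * Worked example (assuming ind_table_max_size is 512): with 6 standard Rx
 * queues, 6 is not a power of two, so the full 512-entry table is used and
 * filled round-robin as q0 q1 q2 q3 q4 q5 q0 q1 ...; with 8 queues the
 * table holds exactly 8 entries, one per queue.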
+ */ +int +mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int rxqs_n = dev->data->nb_rx_queues; + unsigned int i; + unsigned int j; + unsigned int reta_idx_n; + int ret = 0; + unsigned int *rss_queue_arr = NULL; + unsigned int rss_queue_n = 0; + + if (priv->skip_default_rss_reta) + return ret; + rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0); + if (!rss_queue_arr) { + DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; + } + for (i = 0, j = 0; i < rxqs_n; i++) { + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + + rxq_data = (*priv->rxqs)[i]; + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) + rss_queue_arr[j++] = i; + } + rss_queue_n = j; + if (rss_queue_n > priv->config.ind_table_max_size) { + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rss_queue_n); + rte_errno = EINVAL; + rte_free(rss_queue_arr); + return -rte_errno; + } + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); + priv->rxqs_n = rxqs_n; + /* + * If the requested number of RX queues is not a power of two, + * use the maximum indirection table size for better balancing. + * The result is always rounded to the next power of two. + */ + reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ? + priv->config.ind_table_max_size : + rss_queue_n)); + ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); + if (ret) { + rte_free(rss_queue_arr); + return ret; + } + /* + * When the number of RX queues is not a power of two, + * the remaining table entries are padded with reused WQs + * and hashes are not spread uniformly. + */ + for (i = 0, j = 0; (i != reta_idx_n); ++i) { + (*priv->reta_idx)[i] = rss_queue_arr[j]; + if (++j == rss_queue_n) + j = 0; + } + rte_free(rss_queue_arr); + return ret; +} + +/** + * Sets default tuning parameters. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. + */ +static void +mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + /* Minimum CPU utilization. */ + info->default_rxportconf.ring_size = 256; + info->default_txportconf.ring_size = 256; + info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST; + info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST; + if ((priv->link_speed_capa & ETH_LINK_SPEED_200G) | + (priv->link_speed_capa & ETH_LINK_SPEED_100G)) { + info->default_rxportconf.nb_queues = 16; + info->default_txportconf.nb_queues = 16; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 2048; + info->default_txportconf.ring_size = 2048; + } + } else { + info->default_rxportconf.nb_queues = 8; + info->default_txportconf.nb_queues = 8; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 4096; + info->default_txportconf.ring_size = 4096; + } + } +} + +/** + * Sets tx mbuf limiting parameters. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. 
+ */ +static void +mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + unsigned int inlen; + uint16_t nb_max; + + inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ? + MLX5_SEND_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_max; + MLX5_ASSERT(config->txq_inline_min >= 0); + inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min); + inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + nb_max = (MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE - + inlen) / MLX5_WSEG_SIZE; + info->tx_desc_lim.nb_seg_max = nb_max; + info->tx_desc_lim.nb_mtu_seg_max = nb_max; +} + +/** + * DPDK callback to get information about the device. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Info structure output buffer. + */ +int +mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + unsigned int max; + + /* FIXME: we should ask the device for these values. */ + info->min_rx_bufsize = 32; + info->max_rx_pktlen = 65536; + info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE; + /* + * Since we need one CQ per QP, the limit is the minimum number + * between the two values. + */ + max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq, + priv->sh->device_attr.orig_attr.max_qp); + /* max_rx_queues is uint16_t. */ + max = RTE_MIN(max, (unsigned int)UINT16_MAX); + info->max_rx_queues = max; + info->max_tx_queues = max; + info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; + info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); + info->rx_offload_capa = (mlx5_get_rx_port_offloads() | + info->rx_queue_offload_capa); + info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); + info->if_index = mlx5_ifindex(dev); + info->reta_size = priv->reta_idx_n ? + priv->reta_idx_n : config->ind_table_max_size; + info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; + info->speed_capa = priv->link_speed_capa; + info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; + mlx5_set_default_params(dev, info); + mlx5_set_txlimit_params(dev, info); + info->switch_info.name = dev->data->name; + info->switch_info.domain_id = priv->domain_id; + info->switch_info.port_id = priv->representor_id; + if (priv->representor) { + uint16_t port_id; + + if (priv->pf_bond >= 0) { + /* + * Switch port ID is opaque value with driver defined + * format. Push the PF index in bonding configurations + * in upper four bits of port ID. If we get too many + * representors (more than 4K) or PFs (more than 15) + * this approach must be reconsidered. + */ + if ((info->switch_info.port_id >> + MLX5_PORT_ID_BONDING_PF_SHIFT) || + priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) { + DRV_LOG(ERR, "can't update switch port ID" + " for bonding device"); + MLX5_ASSERT(false); + return -ENODEV; + } + info->switch_info.port_id |= + priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT; + } + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (!opriv || + opriv->representor || + opriv->sh != priv->sh || + opriv->domain_id != priv->domain_id) + continue; + /* + * Override switch name with that of the master + * device. 
+ */ + info->switch_info.name = opriv->dev_data->name; + break; + } + } + return 0; +} + +/** + * Get device current raw clock counter + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] time + * Current raw clock counter of the device. + * + * @return + * 0 if the clock has correctly been read + * The value of errno in case of error + */ +int +mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_values_ex values; + int err = 0; + + values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK; + err = mlx5_glue->query_rt_values_ex(ctx, &values); + if (err != 0) { + DRV_LOG(WARNING, "Could not query the clock !"); + return err; + } + *clock = values.raw_clock.tv_nsec; + return 0; +} + +/** + * Get firmware version of a device. + * + * @param dev + * Ethernet device port. + * @param fw_ver + * String output allocated by caller. + * @param fw_size + * Size of the output string, including terminating null byte. + * + * @return + * 0 on success, or the size of the non truncated string if too big. + */ +int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr; + size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1; + + if (fw_size < size) + return size; + if (fw_ver != NULL) + strlcpy(fw_ver, attr->fw_ver, fw_size); + return 0; +} + +/** + * Get supported packet types. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * A pointer to the supported Packet types array. + */ +const uint32_t * +mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) +{ + static const uint32_t ptypes[] = { + /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_NONFRAG, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_INNER_L4_NONFRAG, + RTE_PTYPE_INNER_L4_FRAG, + RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L4_UDP, + RTE_PTYPE_UNKNOWN + }; + + if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_mprq || + dev->rx_pkt_burst == mlx5_rx_burst_vec) + return ptypes; + return NULL; +} + +/** + * Retrieve the master device for representor in the same switch domain. + * + * @param dev + * Pointer to representor Ethernet device structure. + * + * @return + * Master device structure on success, NULL otherwise. + */ + +static struct rte_eth_dev * +mlx5_find_master_dev(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv; + uint16_t port_id; + uint16_t domain_id; + + priv = dev->data->dev_private; + domain_id = priv->domain_id; + MLX5_ASSERT(priv->representor); + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + if (opriv && + opriv->master && + opriv->domain_id == domain_id && + opriv->sh == priv->sh) + return &rte_eth_devices[port_id]; + } + return NULL; +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, + struct rte_eth_link *link) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ethtool_cmd edata = { + .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */ + }; + struct ifreq ifr; + struct rte_eth_link dev_link; + int link_speed = 0; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&edata, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + if (ret == -ENOTSUP && priv->representor) { + struct rte_eth_dev *master; + + /* + * For representors we can try to inherit link + * settings from the master device. Actually + * link settings do not make a lot of sense + * for representors due to missing physical + * link. The old kernel drivers supported + * emulated settings query for representors, + * the new ones do not, so we have to add + * this code for compatibility issues. + */ + master = mlx5_find_master_dev(dev); + if (master) { + ifr = (struct ifreq) { + .ifr_data = (void *)&edata, + }; + ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); + } + } + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL," + " ETHTOOL_GSET) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + } + link_speed = ethtool_cmd_speed(&edata); + if (link_speed == -1) + dev_link.link_speed = ETH_SPEED_NUM_NONE; + else + dev_link.link_speed = link_speed; + priv->link_speed_capa = 0; + if (edata.supported & SUPPORTED_Autoneg) + priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; + if (edata.supported & (SUPPORTED_1000baseT_Full | + SUPPORTED_1000baseKX_Full)) + priv->link_speed_capa |= ETH_LINK_SPEED_1G; + if (edata.supported & SUPPORTED_10000baseKR_Full) + priv->link_speed_capa |= ETH_LINK_SPEED_10G; + if (edata.supported & (SUPPORTED_40000baseKR4_Full | + SUPPORTED_40000baseCR4_Full | + SUPPORTED_40000baseSR4_Full | + SUPPORTED_40000baseLR4_Full)) + priv->link_speed_capa |= ETH_LINK_SPEED_40G; + dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { + rte_errno = EAGAIN; + return -rte_errno; + } + *link = dev_link; + return 0; +} + +/** + * Retrieve physical link information (unlocked version using new ioctl). + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
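+ *
+ * A minimal sketch of the two-step ETHTOOL_GLINKSETTINGS negotiation this
+ * function mirrors, assuming an already opened control socket @p fd and an
+ * @p ifr whose ifr_name has been filled in; the helper name is
+ * illustrative only:
+ *
+ * @code
+ * static int
+ * example_glinksettings(int fd, struct ifreq *ifr, uint32_t *speed)
+ * {
+ *         struct ethtool_link_settings probe = {
+ *                 .cmd = ETHTOOL_GLINKSETTINGS,
+ *         };
+ *
+ *         ifr->ifr_data = (void *)&probe;
+ *         if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ *                 return -errno;
+ *         // The kernel answers the zero-sized request with the required
+ *         // number of 32-bit link mode mask words as a negative value.
+ *         probe.link_mode_masks_nwords = -probe.link_mode_masks_nwords;
+ *         alignas(struct ethtool_link_settings)
+ *         uint8_t buf[sizeof(probe) + 3 * sizeof(uint32_t) *
+ *                     probe.link_mode_masks_nwords];
+ *         struct ethtool_link_settings *req = (void *)buf;
+ *
+ *         *req = probe;
+ *         ifr->ifr_data = (void *)req;
+ *         if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ *                 return -errno;
+ *         // req->link_mode_masks[] now holds the supported, advertising
+ *         // and peer-advertising masks; req->speed the negotiated speed.
+ *         *speed = req->speed;
+ *         return 0;
+ * }
+ * @endcode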
+ */ +static int +mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, + struct rte_eth_link *link) + +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; + struct ifreq ifr; + struct rte_eth_link dev_link; + struct rte_eth_dev *master = NULL; + uint64_t sc; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&gcmd, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + if (ret == -ENOTSUP && priv->representor) { + /* + * For representors we can try to inherit link + * settings from the master device. Actually + * link settings do not make a lot of sense + * for representors due to missing physical + * link. The old kernel drivers supported + * emulated settings query for representors, + * the new ones do not, so we have to add + * this code for compatibility issues. + */ + master = mlx5_find_master_dev(dev); + if (master) { + ifr = (struct ifreq) { + .ifr_data = (void *)&gcmd, + }; + ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr); + } + } + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL," + " ETHTOOL_GLINKSETTINGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + + } + gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; + + alignas(struct ethtool_link_settings) + uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) + + sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3]; + struct ethtool_link_settings *ecmd = (void *)data; + + *ecmd = gcmd; + ifr.ifr_data = (void *)ecmd; + ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL," + "ETHTOOL_GLINKSETTINGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? 
ETH_SPEED_NUM_NONE : + ecmd->speed; + sc = ecmd->link_mode_masks[0] | + ((uint64_t)ecmd->link_mode_masks[1] << 32); + priv->link_speed_capa = 0; + if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT)) + priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_1G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_10G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_20G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_40G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_56G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_25G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_50G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_100G; + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_200G; + + sc = ecmd->link_mode_masks[2] | + ((uint64_t)ecmd->link_mode_masks[3] << 32); + if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) | + MLX5_BITSHIFT( + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) | + MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT))) + priv->link_speed_capa |= ETH_LINK_SPEED_200G; + dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { + rte_errno = EAGAIN; + return -rte_errno; + } + *link = dev_link; + return 0; +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param wait_to_complete + * Wait for request completion. + * + * @return + * 0 if link status was not updated, positive if it was, a negative errno + * value otherwise and rte_errno is set. 
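+ *
+ * A minimal application-side sketch, assuming @p port_id refers to a
+ * started mlx5 port; applications normally reach this callback through the
+ * ethdev API rather than calling it directly:
+ *
+ * @code
+ * struct rte_eth_link link;
+ *
+ * memset(&link, 0, sizeof(link));
+ * // Single query, no retries (wait_to_complete == 0).
+ * rte_eth_link_get_nowait(port_id, &link);
+ * if (link.link_status == ETH_LINK_UP)
+ *         printf("port %u: %u Mbps\n", (unsigned int)port_id,
+ *                link.link_speed);
+ * else
+ *         printf("port %u: link down\n", (unsigned int)port_id);
+ * @endcode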
+ */ +int +mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) +{ + int ret; + struct rte_eth_link dev_link; + time_t start_time = time(NULL); + int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT; + + do { + ret = mlx5_link_update_unlocked_gs(dev, &dev_link); + if (ret == -ENOTSUP) + ret = mlx5_link_update_unlocked_gset(dev, &dev_link); + if (ret == 0) + break; + /* Handle wait to complete situation. */ + if ((wait_to_complete || retry) && ret == -EAGAIN) { + if (abs((int)difftime(time(NULL), start_time)) < + MLX5_LINK_STATUS_TIMEOUT) { + usleep(0); + continue; + } else { + rte_errno = EBUSY; + return -rte_errno; + } + } else if (ret < 0) { + return ret; + } + } while (wait_to_complete || retry-- > 0); + ret = !!memcmp(&dev->data->dev_link, &dev_link, + sizeof(struct rte_eth_link)); + dev->data->dev_link = dev_link; + return ret; +} + +/** + * DPDK callback to change the MTU. + * + * @param dev + * Pointer to Ethernet device structure. + * @param in_mtu + * New MTU. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t kern_mtu = 0; + int ret; + + ret = mlx5_get_mtu(dev, &kern_mtu); + if (ret) + return ret; + /* Set kernel interface MTU first. */ + ret = mlx5_set_mtu(dev, mtu); + if (ret) + return ret; + ret = mlx5_get_mtu(dev, &kern_mtu); + if (ret) + return ret; + if (kern_mtu == mtu) { + priv->mtu = mtu; + DRV_LOG(DEBUG, "port %u adapter MTU set to %u", + dev->data->port_id, mtu); + return 0; + } + rte_errno = EAGAIN; + return -rte_errno; +} + +/** + * DPDK callback to get flow control status. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] fc_conf + * Flow control output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_GPAUSEPARAM + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" + " %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + fc_conf->autoneg = ethpause.autoneg; + if (ethpause.rx_pause && ethpause.tx_pause) + fc_conf->mode = RTE_FC_FULL; + else if (ethpause.rx_pause) + fc_conf->mode = RTE_FC_RX_PAUSE; + else if (ethpause.tx_pause) + fc_conf->mode = RTE_FC_TX_PAUSE; + else + fc_conf->mode = RTE_FC_NONE; + return 0; +} + +/** + * DPDK callback to modify flow control parameters. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] fc_conf + * Flow control parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
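+ *
+ * A minimal application-side sketch, assuming @p port_id is a valid mlx5
+ * port and full (Rx and Tx) pause is desired:
+ *
+ * @code
+ * struct rte_eth_fc_conf fc;
+ *
+ * memset(&fc, 0, sizeof(fc));
+ * fc.mode = RTE_FC_FULL;  // mapped below to rx_pause = tx_pause = 1
+ * fc.autoneg = 1;
+ * if (rte_eth_dev_flow_ctrl_set(port_id, &fc) != 0)
+ *         rte_exit(EXIT_FAILURE, "cannot set flow control on port %u\n",
+ *                  (unsigned int)port_id);
+ * @endcode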
+ */ +int +mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_SPAUSEPARAM + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ethpause.autoneg = fc_conf->autoneg; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_RX_PAUSE)) + ethpause.rx_pause = 1; + else + ethpause.rx_pause = 0; + + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_TX_PAUSE)) + ethpause.tx_pause = 1; + else + ethpause.tx_pause = 0; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + return 0; +} + +/** + * Handle asynchronous removal event for entire multiport device. + * + * @param sh + * Infiniband device shared context. + */ +static void +mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh) +{ + uint32_t i; + + for (i = 0; i < sh->max_port; ++i) { + struct rte_eth_dev *dev; + + if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) { + /* + * Or not existing port either no + * handler installed for this port. + */ + continue; + } + dev = &rte_eth_devices[sh->port[i].ih_port_id]; + MLX5_ASSERT(dev); + if (dev->data->dev_conf.intr_conf.rmv) + _rte_eth_dev_callback_process + (dev, RTE_ETH_EVENT_INTR_RMV, NULL); + } +} + +/** + * Handle shared asynchronous events the NIC (removal event + * and link status change). Supports multiport IB device. + * + * @param cb_arg + * Callback argument. + */ +void +mlx5_dev_interrupt_handler(void *cb_arg) +{ + struct mlx5_ibv_shared *sh = cb_arg; + struct ibv_async_event event; + + /* Read all message from the IB device and acknowledge them. */ + for (;;) { + struct rte_eth_dev *dev; + uint32_t tmp; + + if (mlx5_glue->get_async_event(sh->ctx, &event)) + break; + /* Retrieve and check IB port index. */ + tmp = (uint32_t)event.element.port_num; + if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) { + /* + * The DEVICE_FATAL event is called once for + * entire device without port specifying. + * We should notify all existing ports. + */ + mlx5_glue->ack_async_event(&event); + mlx5_dev_interrupt_device_fatal(sh); + continue; + } + MLX5_ASSERT(tmp && (tmp <= sh->max_port)); + if (!tmp) { + /* Unsupported devive level event. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "unsupported common event (type %d)", + event.event_type); + continue; + } + if (tmp > sh->max_port) { + /* Invalid IB port index. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "cannot handle an event (type %d)" + "due to invalid IB port index (%u)", + event.event_type, tmp); + continue; + } + if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) { + /* No handler installed. */ + mlx5_glue->ack_async_event(&event); + DRV_LOG(DEBUG, + "cannot handle an event (type %d)" + "due to no handler installed for port %u", + event.event_type, tmp); + continue; + } + /* Retrieve ethernet device descriptor. 
*/ + tmp = sh->port[tmp - 1].ih_port_id; + dev = &rte_eth_devices[tmp]; + MLX5_ASSERT(dev); + if ((event.event_type == IBV_EVENT_PORT_ACTIVE || + event.event_type == IBV_EVENT_PORT_ERR) && + dev->data->dev_conf.intr_conf.lsc) { + mlx5_glue->ack_async_event(&event); + if (mlx5_link_update(dev, 0) == -EAGAIN) { + usleep(0); + continue; + } + _rte_eth_dev_callback_process + (dev, RTE_ETH_EVENT_INTR_LSC, NULL); + continue; + } + DRV_LOG(DEBUG, + "port %u cannot handle an unknown event (type %d)", + dev->data->port_id, event.event_type); + mlx5_glue->ack_async_event(&event); + } +} + +/* + * Unregister callback handler safely. The handler may be active + * while we are trying to unregister it, in this case code -EAGAIN + * is returned by rte_intr_callback_unregister(). This routine checks + * the return code and tries to unregister handler again. + * + * @param handle + * interrupt handle + * @param cb_fn + * pointer to callback routine + * @cb_arg + * opaque callback parameter + */ +void +mlx5_intr_callback_unregister(const struct rte_intr_handle *handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + /* + * Try to reduce timeout management overhead by not calling + * the timer related routines on the first iteration. If the + * unregistering succeeds on first call there will be no + * timer calls at all. + */ + uint64_t twait = 0; + uint64_t start = 0; + + do { + int ret; + + ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg); + if (ret >= 0) + return; + if (ret != -EAGAIN) { + DRV_LOG(INFO, "failed to unregister interrupt" + " handler (error: %d)", ret); + MLX5_ASSERT(false); + return; + } + if (twait) { + struct timespec onems; + + /* Wait one millisecond and try again. */ + onems.tv_sec = 0; + onems.tv_nsec = NS_PER_S / MS_PER_S; + nanosleep(&onems, 0); + /* Check whether one second elapsed. */ + if ((rte_get_timer_cycles() - start) <= twait) + continue; + } else { + /* + * We get the amount of timer ticks for one second. + * If this amount elapsed it means we spent one + * second in waiting. This branch is executed once + * on first iteration. + */ + twait = rte_get_timer_hz(); + MLX5_ASSERT(twait); + } + /* + * Timeout elapsed, show message (once a second) and retry. + * We have no other acceptable option here, if we ignore + * the unregistering return code the handler will not + * be unregistered, fd will be closed and we may get the + * crush. Hanging and messaging in the loop seems not to be + * the worst choice. + */ + DRV_LOG(INFO, "Retrying to unregister interrupt handler"); + start = rte_get_timer_cycles(); + } while (true); +} + +/** + * Handle DEVX interrupts from the NIC. + * This function is probably called from the DPDK host thread. + * + * @param cb_arg + * Callback argument. + */ +void +mlx5_dev_interrupt_handler_devx(void *cb_arg) +{ +#ifndef HAVE_IBV_DEVX_ASYNC + (void)cb_arg; + return; +#else + struct mlx5_ibv_shared *sh = cb_arg; + union { + struct mlx5dv_devx_async_cmd_hdr cmd_resp; + uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter) + + sizeof(struct mlx5dv_devx_async_cmd_hdr)]; + } out; + uint8_t *buf = out.buf + sizeof(out.cmd_resp); + + while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp, + &out.cmd_resp, + sizeof(out.buf))) + mlx5_flow_async_pool_query_handle + (sh, (uint64_t)out.cmd_resp.wr_id, + mlx5_devx_get_out_command_status(buf)); +#endif /* HAVE_IBV_DEVX_ASYNC */ +} + +/** + * Uninstall shared asynchronous device events handler. 
+ * This function is implemented to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS) + goto exit; + MLX5_ASSERT(sh->port[priv->ibv_port - 1].ih_port_id == + (uint32_t)dev->data->port_id); + MLX5_ASSERT(sh->intr_cnt); + sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS; + if (!sh->intr_cnt || --sh->intr_cnt) + goto exit; + mlx5_intr_callback_unregister(&sh->intr_handle, + mlx5_dev_interrupt_handler, sh); + sh->intr_handle.fd = 0; + sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Uninstall devx shared asynchronous device events handler. + * This function is implemeted to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_devx_uninstall(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].devx_ih_port_id >= RTE_MAX_ETHPORTS) + goto exit; + MLX5_ASSERT(sh->port[priv->ibv_port - 1].devx_ih_port_id == + (uint32_t)dev->data->port_id); + sh->port[priv->ibv_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS; + if (!sh->devx_intr_cnt || --sh->devx_intr_cnt) + goto exit; + if (sh->intr_handle_devx.fd) { + rte_intr_callback_unregister(&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, + sh); + sh->intr_handle_devx.fd = 0; + sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN; + } + if (sh->devx_comp) { + mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp); + sh->devx_comp = NULL; + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Install shared asynchronous device events handler. + * This function is implemented to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_install(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + int ret; + int flags; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) { + /* The handler is already installed for this port. */ + MLX5_ASSERT(sh->intr_cnt); + goto exit; + } + if (sh->intr_cnt) { + sh->port[priv->ibv_port - 1].ih_port_id = + (uint32_t)dev->data->port_id; + sh->intr_cnt++; + goto exit; + } + /* No shared handler installed. 
*/ + MLX5_ASSERT(sh->ctx->async_fd > 0); + flags = fcntl(sh->ctx->async_fd, F_GETFL); + ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + DRV_LOG(INFO, "failed to change file descriptor async event" + " queue"); + /* Indicate there will be no interrupts. */ + dev->data->dev_conf.intr_conf.lsc = 0; + dev->data->dev_conf.intr_conf.rmv = 0; + } else { + sh->intr_handle.fd = sh->ctx->async_fd; + sh->intr_handle.type = RTE_INTR_HANDLE_EXT; + rte_intr_callback_register(&sh->intr_handle, + mlx5_dev_interrupt_handler, sh); + sh->intr_cnt++; + sh->port[priv->ibv_port - 1].ih_port_id = + (uint32_t)dev->data->port_id; + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Install devx shared asyncronous device events handler. + * This function is implemeted to support event sharing + * between multiple ports of single IB device. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_dev_shared_handler_devx_install(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + pthread_mutex_lock(&sh->intr_mutex); + MLX5_ASSERT(priv->ibv_port); + MLX5_ASSERT(priv->ibv_port <= sh->max_port); + MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS); + if (sh->port[priv->ibv_port - 1].devx_ih_port_id < RTE_MAX_ETHPORTS) { + /* The handler is already installed for this port. */ + MLX5_ASSERT(sh->devx_intr_cnt); + goto exit; + } + if (sh->devx_intr_cnt) { + sh->devx_intr_cnt++; + sh->port[priv->ibv_port - 1].devx_ih_port_id = + (uint32_t)dev->data->port_id; + goto exit; + } + if (priv->config.devx) { +#ifndef HAVE_IBV_DEVX_ASYNC + goto exit; +#else + sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx); + if (sh->devx_comp) { + int flags = fcntl(sh->devx_comp->fd, F_GETFL); + int ret = fcntl(sh->devx_comp->fd, F_SETFL, + flags | O_NONBLOCK); + + if (ret) { + DRV_LOG(INFO, "failed to change file descriptor" + " devx async event queue"); + } else { + sh->intr_handle_devx.fd = sh->devx_comp->fd; + sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT; + rte_intr_callback_register + (&sh->intr_handle_devx, + mlx5_dev_interrupt_handler_devx, sh); + sh->devx_intr_cnt++; + sh->port[priv->ibv_port - 1].devx_ih_port_id = + (uint32_t)dev->data->port_id; + } + } +#endif /* HAVE_IBV_DEVX_ASYNC */ + } +exit: + pthread_mutex_unlock(&sh->intr_mutex); +} + +/** + * Uninstall interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_uninstall(dev); +} + +/** + * Install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_install(dev); +} + +/** + * Devx uninstall interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_devx_uninstall(dev); +} + +/** + * Devx install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev) +{ + mlx5_dev_shared_handler_devx_install(dev); +} + +/** + * DPDK callback to bring the link DOWN. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
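+ *
+ * A worked example of the keep/flags arithmetic, assuming mlx5_set_flags()
+ * (defined elsewhere in the driver) preserves the interface flag bits
+ * selected by its second argument and overwrites the remaining bits with
+ * its third argument; @p cur stands for the current ifr_flags value:
+ *
+ * @code
+ * // Link down: mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP)
+ * unsigned int down = (cur & ~IFF_UP) | (~IFF_UP & IFF_UP); // cur & ~IFF_UP
+ * // Link up: mlx5_set_flags(dev, ~IFF_UP, IFF_UP)
+ * unsigned int up = (cur & ~IFF_UP) | (IFF_UP & IFF_UP);    // cur | IFF_UP
+ * @endcode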
+ */ +int +mlx5_set_link_down(struct rte_eth_dev *dev) +{ + return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); +} + +/** + * DPDK callback to bring the link UP. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_set_link_up(struct rte_eth_dev *dev) +{ + return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); +} + +/** + * Configure the RX function to use. + * + * @param dev + * Pointer to private data structure. + * + * @return + * Pointer to selected Rx burst function. + */ +eth_rx_burst_t +mlx5_select_rx_function(struct rte_eth_dev *dev) +{ + eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; + + MLX5_ASSERT(dev != NULL); + if (mlx5_check_vec_rx_support(dev) > 0) { + rx_pkt_burst = mlx5_rx_burst_vec; + DRV_LOG(DEBUG, "port %u selected Rx vectorized function", + dev->data->port_id); + } else if (mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq; + } + return rx_pkt_burst; +} + +/** + * Check if mlx5 device was removed. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 1 when device is removed, otherwise 0. + */ +int +mlx5_is_removed(struct rte_eth_dev *dev) +{ + struct ibv_device_attr device_attr; + struct mlx5_priv *priv = dev->data->dev_private; + + if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO) + return 1; + return 0; +} + +/** + * Get the E-Switch parameters by port id. + * + * @param[in] port + * Device port id. + * @param[in] valid + * Device port id is valid, skip check. This flag is useful + * when trials are performed from probing and device is not + * flagged as valid yet (in attaching process). + * @param[out] es_domain_id + * E-Switch domain id. + * @param[out] es_port_id + * The port id of the port in the E-Switch. + * + * @return + * pointer to device private data structure containing data needed + * on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_priv * +mlx5_port_to_eswitch_info(uint16_t port, bool valid) +{ + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + + if (port >= RTE_MAX_ETHPORTS) { + rte_errno = EINVAL; + return NULL; + } + if (!valid && !rte_eth_dev_is_valid_port(port)) { + rte_errno = ENODEV; + return NULL; + } + dev = &rte_eth_devices[port]; + priv = dev->data->dev_private; + if (!(priv->representor || priv->master)) { + rte_errno = EINVAL; + return NULL; + } + return priv; +} + +/** + * Get the E-Switch parameters by device instance. + * + * @param[in] port + * Device port id. + * @param[out] es_domain_id + * E-Switch domain id. + * @param[out] es_port_id + * The port id of the port in the E-Switch. + * + * @return + * pointer to device private data structure containing data needed + * on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_priv * +mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv; + + priv = dev->data->dev_private; + if (!(priv->representor || priv->master)) { + rte_errno = EINVAL; + return NULL; + } + return priv; +} + +/** + * Get switch information associated with network interface. + * + * @param ifindex + * Network interface index. + * @param[out] info + * Switch information object, populated in case of success. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
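+ *
+ * A minimal sketch of the sysfs lookups performed below, assuming
+ * @p ifname has already been resolved with if_indextoname(); paths and
+ * buffer sizes are illustrative:
+ *
+ * @code
+ * char path[256];
+ * char port_name[IF_NAMESIZE];
+ * uint64_t switch_id = 0;
+ * FILE *f;
+ *
+ * snprintf(path, sizeof(path), "/sys/class/net/%s/phys_port_name", ifname);
+ * f = fopen(path, "rb");
+ * if (f != NULL) {
+ *         // e.g. "p0" (uplink) or "pf0vf1" (representor).
+ *         if (fscanf(f, "%15s", port_name) != 1)
+ *                 port_name[0] = '\0';
+ *         fclose(f);
+ * }
+ * snprintf(path, sizeof(path), "/sys/class/net/%s/phys_switch_id", ifname);
+ * f = fopen(path, "rb");
+ * if (f != NULL) {
+ *         // A readable switch id indicates an E-Switch (switchdev) setup.
+ *         if (fscanf(f, "%" SCNx64, &switch_id) != 1)
+ *                 switch_id = 0;
+ *         fclose(f);
+ * }
+ * @endcode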
+ */ +int +mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) +{ + char ifname[IF_NAMESIZE]; + char port_name[IF_NAMESIZE]; + FILE *file; + struct mlx5_switch_info data = { + .master = 0, + .representor = 0, + .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET, + .port_name = 0, + .switch_id = 0, + }; + DIR *dir; + bool port_switch_id_set = false; + bool device_dir = false; + char c; + int ret; + + if (!if_indextoname(ifindex, ifname)) { + rte_errno = errno; + return -rte_errno; + } + + MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", + ifname); + MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", + ifname); + MKSTR(pci_device, "/sys/class/net/%s/device", + ifname); + + file = fopen(phys_port_name, "rb"); + if (file != NULL) { + ret = fscanf(file, "%s", port_name); + fclose(file); + if (ret == 1) + mlx5_translate_port_name(port_name, &data); + } + file = fopen(phys_switch_id, "rb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + port_switch_id_set = + fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && + c == '\n'; + fclose(file); + dir = opendir(pci_device); + if (dir != NULL) { + closedir(dir); + device_dir = true; + } + if (port_switch_id_set) { + /* We have some E-Switch configuration. */ + mlx5_sysfs_check_switch_info(device_dir, &data); + } + *info = data; + MLX5_ASSERT(!(data.master && data.representor)); + if (data.master && data.representor) { + DRV_LOG(ERR, "ifindex %u device is recognized as master" + " and as representor", ifindex); + rte_errno = ENODEV; + return -rte_errno; + } + return 0; +} + +/** + * Analyze gathered port parameters via sysfs to recognize master + * and representor devices for E-Switch configuration. + * + * @param[in] device_dir + * flag of presence of "device" directory under port device key. + * @param[inout] switch_info + * Port information, including port name as a number and port name + * type if recognized + * + * @return + * master and representor flags are set in switch_info according to + * recognized parameters (if any). + */ +void +mlx5_sysfs_check_switch_info(bool device_dir, + struct mlx5_switch_info *switch_info) +{ + switch (switch_info->name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN: + /* + * Name is not recognized, assume the master, + * check the device directory presence. + */ + switch_info->master = device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_NOTSET: + /* + * Name is not set, this assumes the legacy naming + * schema for master, just check if there is + * a device directory. + */ + switch_info->master = device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + /* New uplink naming schema recognized. */ + switch_info->master = 1; + break; + case MLX5_PHYS_PORT_NAME_TYPE_LEGACY: + /* Legacy representors naming schema. */ + switch_info->representor = !device_dir; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + /* New representors naming schema. */ + switch_info->representor = 1; + break; + } +} + +/** + * DPDK callback to retrieve plug-in module EEPROM information (type and size). + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] modinfo + * Storage for plug-in module EEPROM information. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
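+ *
+ * A minimal application-side sketch, assuming @p port_id is a valid mlx5
+ * port with a plug-in module present and @p buf is large enough for the
+ * reported EEPROM length:
+ *
+ * @code
+ * struct rte_eth_dev_module_info modinfo;
+ * struct rte_dev_eeprom_info eeprom;
+ * uint8_t buf[1024];
+ *
+ * if (rte_eth_dev_get_module_info(port_id, &modinfo) == 0) {
+ *         memset(&eeprom, 0, sizeof(eeprom));
+ *         eeprom.offset = 0;
+ *         eeprom.length = RTE_MIN(modinfo.eeprom_len,
+ *                                 (uint32_t)sizeof(buf));
+ *         eeprom.data = buf;
+ *         if (rte_eth_dev_get_module_eeprom(port_id, &eeprom) == 0) {
+ *                 // buf[0..eeprom.length - 1] holds the raw module EEPROM.
+ *         }
+ * }
+ * @endcode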
+ */ +int +mlx5_get_module_info(struct rte_eth_dev *dev, + struct rte_eth_dev_module_info *modinfo) +{ + struct ethtool_modinfo info = { + .cmd = ETHTOOL_GMODULEINFO, + }; + struct ifreq ifr = (struct ifreq) { + .ifr_data = (void *)&info, + }; + int ret = 0; + + if (!dev || !modinfo) { + DRV_LOG(WARNING, "missing argument, cannot get module info"); + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + modinfo->type = info.type; + modinfo->eeprom_len = info.eeprom_len; + return ret; +} + +/** + * DPDK callback to retrieve plug-in module EEPROM data. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Storage for plug-in module EEPROM data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_get_module_eeprom(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info) +{ + struct ethtool_eeprom *eeprom; + struct ifreq ifr; + int ret = 0; + + if (!dev || !info) { + DRV_LOG(WARNING, "missing argument, cannot get module eeprom"); + rte_errno = EINVAL; + return -rte_errno; + } + eeprom = rte_calloc(__func__, 1, + (sizeof(struct ethtool_eeprom) + info->length), 0); + if (!eeprom) { + DRV_LOG(WARNING, "port %u cannot allocate memory for " + "eeprom data", dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; + } + eeprom->cmd = ETHTOOL_GMODULEEEPROM; + eeprom->offset = info->offset; + eeprom->len = info->length; + ifr = (struct ifreq) { + .ifr_data = (void *)eeprom, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) + DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s", + dev->data->port_id, strerror(rte_errno)); + else + rte_memcpy(info->data, eeprom->data, info->length); + rte_free(eeprom); + return ret; +} + +/** + * DPDK callback to retrieve hairpin capabilities. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] cap + * Storage for hairpin capability data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int mlx5_hairpin_cap_get(struct rte_eth_dev *dev, + struct rte_eth_hairpin_cap *cap) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->sh->devx == 0) { + rte_errno = ENOTSUP; + return -rte_errno; + } + cap->max_nb_queues = UINT16_MAX; + cap->max_rx_2_tx = 1; + cap->max_tx_2_rx = 1; + cap->max_nb_desc = 8192; + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c new file mode 100644 index 000000000..ae478a510 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.c @@ -0,0 +1,6204 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> +#include <stdbool.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_cycles.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +/* Dev ops structure defined in mlx5.c */ +extern const struct eth_dev_ops mlx5_dev_ops; +extern const struct eth_dev_ops mlx5_dev_ops_isolate; + +/** Device flow drivers. */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops; +#endif +extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops; + +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops; + +const struct mlx5_flow_driver_ops *flow_drv_ops[] = { + [MLX5_FLOW_TYPE_MIN] = &mlx5_flow_null_drv_ops, +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + [MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops, +#endif + [MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops, + [MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops +}; + +enum mlx5_expansion { + MLX5_EXPANSION_ROOT, + MLX5_EXPANSION_ROOT_OUTER, + MLX5_EXPANSION_ROOT_ETH_VLAN, + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_VLAN, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP, + MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE, + MLX5_EXPANSION_GRE, + MLX5_EXPANSION_MPLS, + MLX5_EXPANSION_ETH, + MLX5_EXPANSION_ETH_VLAN, + MLX5_EXPANSION_VLAN, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP, + MLX5_EXPANSION_IPV6, + MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP, +}; + +/** Supported expansion of items. 
*/ +static const struct rte_flow_expand_node mlx5_support_expansion[] = { + [MLX5_EXPANSION_ROOT] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_OUTER] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, + }, + [MLX5_EXPANSION_OUTER_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_MPLS), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, + }, + [MLX5_EXPANSION_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, + }, + [MLX5_EXPANSION_OUTER_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VLAN, + }, + [MLX5_EXPANSION_OUTER_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_GRE, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, + }, + [MLX5_EXPANSION_OUTER_IPV4_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, + }, + [MLX5_EXPANSION_OUTER_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, + }, + [MLX5_EXPANSION_OUTER_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, + }, + [MLX5_EXPANSION_OUTER_IPV6_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, + }, + [MLX5_EXPANSION_OUTER_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, + }, + [MLX5_EXPANSION_VXLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN, + }, + [MLX5_EXPANSION_VXLAN_GPE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + }, + [MLX5_EXPANSION_GRE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4), + .type = RTE_FLOW_ITEM_TYPE_GRE, + }, + [MLX5_EXPANSION_MPLS] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_MPLS, + }, + [MLX5_EXPANSION_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = 
RTE_FLOW_ITEM_TYPE_VLAN, + }, + [MLX5_EXPANSION_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, + }, + [MLX5_EXPANSION_IPV4_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, + }, + [MLX5_EXPANSION_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, + }, + [MLX5_EXPANSION_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, + }, + [MLX5_EXPANSION_IPV6_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, + }, + [MLX5_EXPANSION_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, + }, +}; + +static const struct rte_flow_ops mlx5_flow_ops = { + .validate = mlx5_flow_validate, + .create = mlx5_flow_create, + .destroy = mlx5_flow_destroy, + .flush = mlx5_flow_flush, + .isolate = mlx5_flow_isolate, + .query = mlx5_flow_query, + .dev_dump = mlx5_flow_dev_dump, + .get_aged_flows = mlx5_flow_get_aged_flows, +}; + +/* Convert FDIR request to Generic flow. */ +struct mlx5_fdir { + struct rte_flow_attr attr; + struct rte_flow_item items[4]; + struct rte_flow_item_eth l2; + struct rte_flow_item_eth l2_mask; + union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3; + union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3_mask; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4_mask; + struct rte_flow_action actions[2]; + struct rte_flow_action_queue queue; +}; + +/* Map of Verbs to Flow priority with 8 Verbs priorities. */ +static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 }, +}; + +/* Map of Verbs to Flow priority with 16 Verbs priorities. */ +static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 }, + { 9, 10, 11 }, { 12, 13, 14 }, +}; + +/* Tunnel information. */ +struct mlx5_flow_tunnel_info { + uint64_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */ + uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */ +}; + +static struct mlx5_flow_tunnel_info tunnels_info[] = { + { + .tunnel = MLX5_FLOW_LAYER_VXLAN, + .ptype = RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GENEVE, + .ptype = RTE_PTYPE_TUNNEL_GENEVE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_VXLAN_GPE, + .ptype = RTE_PTYPE_TUNNEL_VXLAN_GPE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GRE, + .ptype = RTE_PTYPE_TUNNEL_GRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS | MLX5_FLOW_LAYER_OUTER_L4_UDP, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_UDP | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_NVGRE, + .ptype = RTE_PTYPE_TUNNEL_NVGRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_IPIP, + .ptype = RTE_PTYPE_TUNNEL_IP, + }, + { + .tunnel = MLX5_FLOW_LAYER_IPV6_ENCAP, + .ptype = RTE_PTYPE_TUNNEL_IP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GTP, + .ptype = RTE_PTYPE_TUNNEL_GTPU, + }, +}; + +/** + * Translate tag ID to register. + * + * @param[in] dev + * Pointer to the Ethernet device structure. 
+ * @param[in] feature + * The feature that request the register. + * @param[in] id + * The request register ID. + * @param[out] error + * Error description in case of any. + * + * @return + * The request register on success, a negative errno + * value otherwise and rte_errno is set. + */ +int +mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + enum modify_reg start_reg; + bool skip_mtr_reg = false; + + switch (feature) { + case MLX5_HAIRPIN_RX: + return REG_B; + case MLX5_HAIRPIN_TX: + return REG_A; + case MLX5_METADATA_RX: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_B; + case MLX5_XMETA_MODE_META16: + return REG_C_0; + case MLX5_XMETA_MODE_META32: + return REG_C_1; + } + break; + case MLX5_METADATA_TX: + return REG_A; + case MLX5_METADATA_FDB: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_NONE; + case MLX5_XMETA_MODE_META16: + return REG_C_0; + case MLX5_XMETA_MODE_META32: + return REG_C_1; + } + break; + case MLX5_FLOW_MARK: + switch (config->dv_xmeta_en) { + case MLX5_XMETA_MODE_LEGACY: + return REG_NONE; + case MLX5_XMETA_MODE_META16: + return REG_C_1; + case MLX5_XMETA_MODE_META32: + return REG_C_0; + } + break; + case MLX5_MTR_SFX: + /* + * If meter color and flow match share one register, flow match + * should use the meter color register for match. + */ + if (priv->mtr_reg_share) + return priv->mtr_color_reg; + else + return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : + REG_C_3; + case MLX5_MTR_COLOR: + MLX5_ASSERT(priv->mtr_color_reg != REG_NONE); + return priv->mtr_color_reg; + case MLX5_COPY_MARK: + /* + * Metadata COPY_MARK register using is in meter suffix sub + * flow while with meter. It's safe to share the same register. + */ + return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : REG_C_3; + case MLX5_APP_TAG: + /* + * If meter is enable, it will engage the register for color + * match and flow match. If meter color match is not using the + * REG_C_2, need to skip the REG_C_x be used by meter color + * match. + * If meter is disable, free to use all available registers. + */ + start_reg = priv->mtr_color_reg != REG_C_2 ? REG_C_2 : + (priv->mtr_reg_share ? REG_C_3 : REG_C_4); + skip_mtr_reg = !!(priv->mtr_en && start_reg == REG_C_2); + if (id > (REG_C_7 - start_reg)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "invalid tag id"); + if (config->flow_mreg_c[id + start_reg - REG_C_0] == REG_NONE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unsupported tag id"); + /* + * This case means meter is using the REG_C_x great than 2. + * Take care not to conflict with meter color REG_C_x. + * If the available index REG_C_y >= REG_C_x, skip the + * color register. 
+ */ + if (skip_mtr_reg && config->flow_mreg_c + [id + start_reg - REG_C_0] >= priv->mtr_color_reg) { + if (id >= (REG_C_7 - start_reg)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "invalid tag id"); + if (config->flow_mreg_c + [id + 1 + start_reg - REG_C_0] != REG_NONE) + return config->flow_mreg_c + [id + 1 + start_reg - REG_C_0]; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unsupported tag id"); + } + return config->flow_mreg_c[id + start_reg - REG_C_0]; + } + MLX5_ASSERT(false); + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "invalid feature name"); +} + +/** + * Check extensive flow metadata register support. + * + * @param dev + * Pointer to rte_eth_dev structure. + * + * @return + * True if device supports extensive flow metadata register, otherwise false. + */ +bool +mlx5_flow_ext_mreg_supported(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + + /* + * Having available reg_c can be regarded inclusively as supporting + * extensive flow metadata register, which could mean, + * - metadata register copy action by modify header. + * - 16 modify header actions is supported. + * - reg_c's are preserved across different domain (FDB and NIC) on + * packet loopback by flow lookup miss. + */ + return config->flow_mreg_c[2] != REG_NONE; +} + +/** + * Discover the maximum number of priority available. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * + * @return + * number of supported flow priority on success, a negative errno + * value otherwise and rte_errno is set. + */ +int +mlx5_flow_discover_priorities(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct { + struct ibv_flow_attr attr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_action_drop drop; + } flow_attr = { + .attr = { + .num_of_specs = 2, + .port = (uint8_t)priv->ibv_port, + }, + .eth = { + .type = IBV_FLOW_SPEC_ETH, + .size = sizeof(struct ibv_flow_spec_eth), + }, + .drop = { + .size = sizeof(struct ibv_flow_spec_action_drop), + .type = IBV_FLOW_SPEC_ACTION_DROP, + }, + }; + struct ibv_flow *flow; + struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev); + uint16_t vprio[] = { 8, 16 }; + int i; + int priority = 0; + + if (!drop) { + rte_errno = ENOTSUP; + return -rte_errno; + } + for (i = 0; i != RTE_DIM(vprio); i++) { + flow_attr.attr.priority = vprio[i] - 1; + flow = mlx5_glue->create_flow(drop->qp, &flow_attr.attr); + if (!flow) + break; + claim_zero(mlx5_glue->destroy_flow(flow)); + priority = vprio[i]; + } + mlx5_hrxq_drop_release(dev); + switch (priority) { + case 8: + priority = RTE_DIM(priority_map_3); + break; + case 16: + priority = RTE_DIM(priority_map_5); + break; + default: + rte_errno = ENOTSUP; + DRV_LOG(ERR, + "port %u verbs maximum priority: %d expected 8/16", + dev->data->port_id, priority); + return -rte_errno; + } + DRV_LOG(INFO, "port %u flow maximum priority: %d", + dev->data->port_id, priority); + return priority; +} + +/** + * Adjust flow priority based on the highest layer and the request priority. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] priority + * The rule base priority. + * @param[in] subpriority + * The priority based on the items. + * + * @return + * The new priority. 
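+ *
+ * A worked example, assuming the device exposes 8 Verbs priorities so that
+ * priv->config.flow_prio == RTE_DIM(priority_map_3):
+ *
+ * @code
+ * // priority_map_3 = { { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 } }
+ * uint32_t res = mlx5_flow_adjust_priority(dev, 1, 2);
+ * // res == priority_map_3[1][2] == 4
+ * @endcode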
+ */ +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority) +{ + uint32_t res = 0; + struct mlx5_priv *priv = dev->data->dev_private; + + switch (priv->config.flow_prio) { + case RTE_DIM(priority_map_3): + res = priority_map_3[priority][subpriority]; + break; + case RTE_DIM(priority_map_5): + res = priority_map_5[priority][subpriority]; + break; + } + return res; +} + +/** + * Verify the @p item specifications (spec, last, mask) are compatible with the + * NIC capabilities. + * + * @param[in] item + * Item specification. + * @param[in] mask + * @p item->mask or flow default bit-masks. + * @param[in] nic_mask + * Bit-masks covering supported fields by the NIC to compare with user mask. + * @param[in] size + * Bit-masks size in bytes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error) +{ + unsigned int i; + + MLX5_ASSERT(nic_mask); + for (i = 0; i < size; ++i) + if ((nic_mask[i] | mask[i]) != nic_mask[i]) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "mask enables non supported" + " bits"); + if (!item->spec && (item->mask || item->last)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "mask/last without a spec is not" + " supported"); + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + unsigned int i; + int ret; + + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & mask[i]; + last[i] = ((const uint8_t *)item->last)[i] & mask[i]; + } + ret = memcmp(spec, last, size); + if (ret != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "range is not valid"); + } + return 0; +} + +/** + * Adjust the hash fields according to the @p flow information. + * + * @param[in] dev_flow. + * Pointer to the mlx5_flow. + * @param[in] tunnel + * 1 when the hash field is for a tunnel item. + * @param[in] layer_types + * ETH_RSS_* types. + * @param[in] hash_fields + * Item hash fields. + * + * @return + * The hash fields that should be used. + */ +uint64_t +mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc, + int tunnel __rte_unused, uint64_t layer_types, + uint64_t hash_fields) +{ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + int rss_request_inner = rss_desc->level >= 2; + + /* Check RSS hash level for tunnel. */ + if (tunnel && rss_request_inner) + hash_fields |= IBV_RX_HASH_INNER; + else if (tunnel || rss_request_inner) + return 0; +#endif + /* Check if requested layer matches RSS hash fields. */ + if (!(rss_desc->types & layer_types)) + return 0; + return hash_fields; +} + +/** + * Lookup and set the ptype in the data Rx part. A single Ptype can be used, + * if several tunnel rules are used on this queue, the tunnel ptype will be + * cleared. + * + * @param rxq_ctrl + * Rx queue to update. + */ +static void +flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + unsigned int i; + uint32_t tunnel_ptype = 0; + + /* Look up for the ptype to use. 
*/ + for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) { + if (!rxq_ctrl->flow_tunnels_n[i]) + continue; + if (!tunnel_ptype) { + tunnel_ptype = tunnels_info[i].ptype; + } else { + tunnel_ptype = 0; + break; + } + } + rxq_ctrl->rxq.tunnel = tunnel_ptype; +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the devive + * flow. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] dev_handle + * Pointer to device flow handle structure. + */ +static void +flow_drv_rxq_flags_set(struct rte_eth_dev *dev, + struct mlx5_flow_handle *dev_handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int mark = dev_handle->mark; + const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL); + struct mlx5_hrxq *hrxq; + unsigned int i; + + if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE) + return; + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + dev_handle->rix_hrxq); + if (!hrxq) + return; + for (i = 0; i != hrxq->ind_table->queues_n; ++i) { + int idx = hrxq->ind_table->queues[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + /* + * To support metadata register copy on Tx loopback, + * this must be always enabled (metadata may arive + * from other port - not from local flows only. + */ + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n = 1; + } else if (mark) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n++; + } + if (tunnel) { + unsigned int j; + + /* Increase the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_handle->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]++; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) for a flow + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] flow + * Pointer to flow structure. + */ +static void +flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + flow_drv_rxq_flags_set(dev, dev_handle); +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * device flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] dev_handle + * Pointer to the device flow handle structure. 
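+ *
+ * A worked example of the resulting rxq.tunnel value, based on the
+ * tunnels_info[] table and flow_rxq_tunnel_ptype_update() above:
+ *
+ * @code
+ * // 1 VXLAN flow on the queue:
+ * //   rxq.tunnel == RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP
+ * // + 1 GRE flow on the same queue:
+ * //   two different tunnel types -> rxq.tunnel == 0 (ambiguous, cleared)
+ * // GRE flow destroyed again (this trim helper):
+ * //   a single type remains -> the VXLAN ptype is restored
+ * @endcode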
+ */ +static void +flow_drv_rxq_flags_trim(struct rte_eth_dev *dev, + struct mlx5_flow_handle *dev_handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int mark = dev_handle->mark; + const int tunnel = !!(dev_handle->layers & MLX5_FLOW_LAYER_TUNNEL); + struct mlx5_hrxq *hrxq; + unsigned int i; + + if (dev_handle->fate_action != MLX5_FLOW_FATE_QUEUE) + return; + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + dev_handle->rix_hrxq); + if (!hrxq) + return; + MLX5_ASSERT(dev->data->dev_started); + for (i = 0; i != hrxq->ind_table->queues_n; ++i) { + int idx = hrxq->ind_table->queues[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + if (priv->config.dv_flow_en && + priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n = 1; + } else if (mark) { + rxq_ctrl->flow_mark_n--; + rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n; + } + if (tunnel) { + unsigned int j; + + /* Decrease the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_handle->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]--; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * @p flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the flow. + */ +static void +flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + flow_drv_rxq_flags_trim(dev, dev_handle); +} + +/** + * Clear the Mark/Flag and Tunnel ptype information in all Rx queues. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_rxq_flags_clear(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + unsigned int j; + + if (!(*priv->rxqs)[i]) + continue; + rxq_ctrl = container_of((*priv->rxqs)[i], + struct mlx5_rxq_ctrl, rxq); + rxq_ctrl->flow_mark_n = 0; + rxq_ctrl->rxq.mark = 0; + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) + rxq_ctrl->flow_tunnels_n[j] = 0; + rxq_ctrl->rxq.tunnel = 0; + } +} + +/** + * Set the Rx queue dynamic metadata (mask and offset) for a flow + * + * @param[in] dev + * Pointer to the Ethernet device structure. + */ +void +mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *data; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + data = (*priv->rxqs)[i]; + if (!rte_flow_dynf_metadata_avail()) { + data->dynf_meta = 0; + data->flow_meta_mask = 0; + data->flow_meta_offset = -1; + } else { + data->dynf_meta = 1; + data->flow_meta_mask = rte_flow_dynf_metadata_mask; + data->flow_meta_offset = rte_flow_dynf_metadata_offs; + } + } +} + +/* + * return a pointer to the desired action in the list of actions. + * + * @param[in] actions + * The list of actions to search the action in. + * @param[in] action + * The action to find. + * + * @return + * Pointer to the action in the list, if found. NULL otherwise. 
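+ *
+ * A minimal usage sketch, assuming @p actions is an END-terminated action
+ * array and the caller wants the target of a QUEUE action, if any:
+ *
+ * @code
+ * const struct rte_flow_action *act;
+ * const struct rte_flow_action_queue *queue;
+ *
+ * act = mlx5_flow_find_action(actions, RTE_FLOW_ACTION_TYPE_QUEUE);
+ * if (act != NULL) {
+ *         queue = act->conf;
+ *         // queue->index is the Rx queue the flow steers to.
+ * }
+ * @endcode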
+ */ +const struct rte_flow_action * +mlx5_flow_find_action(const struct rte_flow_action *actions, + enum rte_flow_action_type action) +{ + if (actions == NULL) + return NULL; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) + if (actions->type == action) + return actions; + return NULL; +} + +/* + * Validate the flag action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't mark and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 flag" + " actions in same flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "flag action not supported for " + "egress"); + return 0; +} + +/* + * Validate the mark action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_mark *mark = action->conf; + + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "configuration cannot be null"); + if (mark->id >= MLX5_FLOW_MARK_MAX) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id must in 0 <= id < " + RTE_STR(MLX5_FLOW_MARK_MAX)); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't flag and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 mark actions in same" + " flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "mark action not supported for " + "egress"); + return 0; +} + +/* + * Validate the drop action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_drop(uint64_t action_flags __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "drop action not supported for " + "egress"); + return 0; +} + +/* + * Validate the queue action. + * + * @param[in] action + * Pointer to the queue action. 
+ * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_queue *queue = action->conf; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (!priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No Rx queues configured"); + if (queue->index >= priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue index out of range"); + if (!(*priv->rxqs)[queue->index]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue is not configured"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "queue action not supported for " + "egress"); + return 0; +} + +/* + * Validate the rss action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[in] item_flags + * Items that were detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_rss *rss = action->conf; + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions" + " in same flow"); + if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->func, + "RSS hash function not supported"); +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (rss->level > 2) +#else + if (rss->level > 1) +#endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->level, + "tunnel RSS is not supported"); + /* allow RSS key_len 0 in case of NULL (default) RSS key. 
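 * For illustration (editor's sketch, not part of this patch; "queues" is a
 * hypothetical uint16_t[4]), a default-key RSS action accepted below would
 * look like:
 *   struct rte_flow_action_rss rss_conf = {
 *           .key = NULL, .key_len = 0,       /* use the default RSS key */
 *           .queue_num = 4, .queue = queues,
 *   };
 * whereas an explicit key must supply exactly MLX5_RSS_HASH_KEY_LEN bytes.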
*/ + if (rss->key_len == 0 && rss->key != NULL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key length 0"); + if (rss->key_len > 0 && rss->key_len < MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too small"); + if (rss->key_len > MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too large"); + if (rss->queue_num > priv->config.ind_table_max_size) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue_num, + "number of queues too large"); + if (rss->types & MLX5_RSS_HF_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->types, + "some RSS protocols are not" + " supported"); + if ((rss->types & (ETH_RSS_L3_SRC_ONLY | ETH_RSS_L3_DST_ONLY)) && + !(rss->types & ETH_RSS_IP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "L3 partial RSS requested but L3 RSS" + " type not specified"); + if ((rss->types & (ETH_RSS_L4_SRC_ONLY | ETH_RSS_L4_DST_ONLY)) && + !(rss->types & (ETH_RSS_UDP | ETH_RSS_TCP))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "L4 partial RSS requested but L4 RSS" + " type not specified"); + if (!priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No Rx queues configured"); + if (!rss->queue_num) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "No queues configured"); + for (i = 0; i != rss->queue_num; ++i) { + if (rss->queue[i] >= priv->rxqs_n) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], "queue index out of range"); + if (!(*priv->rxqs)[rss->queue[i]]) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], "queue is not configured"); + } + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "rss action not supported for " + "egress"); + if (rss->level > 1 && !tunnel) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL, + "inner RSS is not supported for " + "non-tunnel flows"); + return 0; +} + +/* + * Validate the count action. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_count(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "count action not supported for " + "egress"); + return 0; +} + +/** + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attributes + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
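 *
 * Illustrative call pattern (editor's sketch, not part of this patch; the
 * caller variables are hypothetical):
 *
 *   ret = mlx5_flow_validate_attributes(dev, attr, error);
 *   if (ret < 0)
 *           return ret;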
+ */ +int +mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "groups is not supported"); + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority out of range"); + if (attributes->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "egress is not supported"); + if (attributes->transfer && !priv->config.dv_esw_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, "transfer is not supported"); + if (!attributes->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "ingress attribute is mandatory"); + return 0; +} + +/** + * Validate ICMP6 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_icmp6 *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with ICMP6 layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv6 is mandatory to filter on" + " ICMP6"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_icmp6_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_icmp6_mask, + sizeof(struct rte_flow_item_icmp6), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate ICMP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_icmp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_icmp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xFF && target_protocol != IPPROTO_ICMP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with ICMP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv4 is mandatory to filter" + " on ICMP"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_icmp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_icmp_mask, + sizeof(struct rte_flow_item_icmp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate Ethernet item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_eth *mask = item->mask; + const struct rte_flow_item_eth nic_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = RTE_BE16(0xffff), + }; + int ret; + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t ethm = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (item_flags & ethm) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L2 layers not supported"); + if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_L3)) || + (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_L3))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L2 layer should not follow " + "L3 layers"); + if ((!tunnel && (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) || + (tunnel && (item_flags & MLX5_FLOW_LAYER_INNER_VLAN))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L2 layer should not follow VLAN"); + if (!mask) + mask = &rte_flow_item_eth_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_eth), + error); + return ret; +} + +/** + * Validate VLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Ethernet device flow is being created on. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(UINT16_MAX), + .inner_type = RTE_BE16(UINT16_MAX), + }; + uint16_t vlan_tag = 0; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret; + const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + MLX5_FLOW_LAYER_INNER_L4) : + (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4); + const uint64_t vlanm = tunnel ? 
MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + + if (item_flags & vlanm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple VLAN layers not supported"); + else if ((item_flags & l34m) != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VLAN cannot follow L3/L4 layer"); + if (!mask) + mask = &rte_flow_item_vlan_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), + error); + if (ret) + return ret; + if (!tunnel && mask->tci != RTE_BE16(0x0fff)) { + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->vmwa_context) { + /* + * Non-NULL context means we have a virtual machine + * and SR-IOV enabled, we have to create VLAN interface + * to make hypervisor to setup E-Switch vport + * context correctly. We avoid creating the multiple + * VLAN interfaces, so we cannot support VLAN tag mask. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VLAN tag mask is not" + " supported in virtual" + " environment"); + } + } + if (spec) { + vlan_tag = spec->tci; + vlan_tag &= mask->tci; + } + /* + * From verbs perspective an empty VLAN is equivalent + * to a packet without VLAN layer. + */ + if (!vlan_tag) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "VLAN cannot be empty"); + return 0; +} + +/** + * Validate IPV4 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] acc_mask + * Acceptable mask, if NULL default internal default mask + * will be used to check whether item fields are supported. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv4 *acc_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv4 *mask = item->mask; + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + }, + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + uint8_t next_proto = 0xFF; + const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN | + MLX5_FLOW_LAYER_INNER_VLAN); + + if ((last_item & l2_vlan) && ether_type && + ether_type != RTE_ETHER_TYPE_IPV4) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv4 cannot follow L2/VLAN layer " + "which ether type is not IPv4"); + if (item_flags & MLX5_FLOW_LAYER_IPIP) { + if (mask && spec) + next_proto = mask->hdr.next_proto_id & + spec->hdr.next_proto_id; + if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple tunnel " + "not supported"); + } + if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "wrong tunnel type - IPv6 specified " + "but IPv4 item provided"); + if (item_flags & l3m) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L3 layers not supported"); + else if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an L4 layer."); + else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) && + !(item_flags & MLX5_FLOW_LAYER_INNER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an NVGRE layer."); + if (!mask) + mask = &rte_flow_item_ipv4_mask; + else if (mask->hdr.next_proto_id != 0 && + mask->hdr.next_proto_id != 0xff) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "partial mask is not supported" + " for protocol"); + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + acc_mask ? (const uint8_t *)acc_mask + : (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv4), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate IPV6 item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] acc_mask + * Acceptable mask, if NULL default internal default mask + * will be used to check whether item fields are supported. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv6 *acc_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv6 *mask = item->mask; + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + }, + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + uint8_t next_proto = 0xFF; + const uint64_t l2_vlan = (MLX5_FLOW_LAYER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN | + MLX5_FLOW_LAYER_INNER_VLAN); + + if ((last_item & l2_vlan) && ether_type && + ether_type != RTE_ETHER_TYPE_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "IPv6 cannot follow L2/VLAN layer " + "which ether type is not IPv6"); + if (item_flags & MLX5_FLOW_LAYER_IPV6_ENCAP) { + if (mask && spec) + next_proto = mask->hdr.proto & spec->hdr.proto; + if (next_proto == IPPROTO_IPIP || next_proto == IPPROTO_IPV6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple tunnel " + "not supported"); + } + if (item_flags & MLX5_FLOW_LAYER_IPIP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "wrong tunnel type - IPv4 specified " + "but IPv6 item provided"); + if (item_flags & l3m) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L3 layers not supported"); + else if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an L4 layer."); + else if ((item_flags & MLX5_FLOW_LAYER_NVGRE) && + !(item_flags & MLX5_FLOW_LAYER_INNER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 cannot follow an NVGRE layer."); + if (!mask) + mask = &rte_flow_item_ipv6_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + acc_mask ? (const uint8_t *)acc_mask + : (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv6), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate UDP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[in] flow_mask + * mlx5 flow-specific (DV, verbs, etc.) supported header fields mask. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_udp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + if (target_protocol != 0xff && target_protocol != IPPROTO_UDP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with UDP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 is mandatory to filter on L4"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_udp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_udp_mask, + sizeof(struct rte_flow_item_udp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate TCP item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. 
+ * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error) +{ + const struct rte_flow_item_tcp *mask = item->mask; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + int ret; + + MLX5_ASSERT(flow_mask); + if (target_protocol != 0xff && target_protocol != IPPROTO_TCP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with TCP layer"); + if (!(item_flags & l3m)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 is mandatory to filter on L4"); + if (item_flags & l4m) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple L4 layers not supported"); + if (!mask) + mask = &rte_flow_item_tcp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)flow_mask, + sizeof(struct rte_flow_item_tcp), error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate VXLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + int ret; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_mask, + sizeof(struct rte_flow_item_vxlan), + error); + if (ret < 0) + return ret; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + memcpy(&id.vni[1], mask->vni, 3); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VXLAN tunnel must be fully defined"); + return 0; +} + +/** + * Validate VXLAN_GPE item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] priv + * Pointer to the private data structure. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + int ret; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!priv->config.l3_vxlan_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 VXLAN is not enabled by device" + " parameter and/or not configured in" + " firmware"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_gpe_mask, + sizeof(struct rte_flow_item_vxlan_gpe), + error); + if (ret < 0) + return ret; + if (spec) { + if (spec->protocol) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE protocol" + " not supported"); + memcpy(&id.vni[1], spec->vni, 3); + memcpy(&id.vni[1], mask->vni, 3); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VXLAN-GPE tunnel must be fully" + " defined"); + return 0; +} +/** + * Validate GRE Key item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] gre_item + * Pointer to gre_item + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item, + uint64_t item_flags, + const struct rte_flow_item *gre_item, + struct rte_flow_error *error) +{ + const rte_be32_t *mask = item->mask; + int ret = 0; + rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX); + const struct rte_flow_item_gre *gre_spec; + const struct rte_flow_item_gre *gre_mask; + + if (item_flags & MLX5_FLOW_LAYER_GRE_KEY) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Multiple GRE key not support"); + if (!(item_flags & MLX5_FLOW_LAYER_GRE)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "No preceding GRE header"); + if (item_flags & MLX5_FLOW_LAYER_INNER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "GRE key following a wrong item"); + gre_mask = gre_item->mask; + if (!gre_mask) + gre_mask = &rte_flow_item_gre_mask; + gre_spec = gre_item->spec; + if (gre_spec && (gre_mask->c_rsvd0_ver & RTE_BE16(0x2000)) && + !(gre_spec->c_rsvd0_ver & RTE_BE16(0x2000))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Key bit must be on"); + + if (!mask) + mask = &gre_key_default_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&gre_key_default_mask, + sizeof(rte_be32_t), error); + return ret; +} + +/** + * Validate GRE item. 
+ * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_gre *spec __rte_unused = item->spec; + const struct rte_flow_item_gre *mask = item->mask; + int ret; + const struct rte_flow_item_gre nic_mask = { + .c_rsvd0_ver = RTE_BE16(0xB000), + .protocol = RTE_BE16(UINT16_MAX), + }; + + if (target_protocol != 0xff && target_protocol != IPPROTO_GRE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with this GRE layer"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Layer is missing"); + if (!mask) + mask = &rte_flow_item_gre_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_gre), error); + if (ret < 0) + return ret; +#ifndef HAVE_MLX5DV_DR +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + if (spec && (spec->protocol & mask->protocol)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "without MPLS support the" + " specification cannot be used for" + " filtering"); +#endif +#endif + return 0; +} + +/** + * Validate Geneve item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + +int +mlx5_flow_validate_item_geneve(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_geneve *spec = item->spec; + const struct rte_flow_item_geneve *mask = item->mask; + int ret; + uint16_t gbhdr; + uint8_t opt_len = priv->config.hca_attr.geneve_max_opt_len ?
+ MLX5_GENEVE_OPT_LEN_1 : MLX5_GENEVE_OPT_LEN_0; + const struct rte_flow_item_geneve nic_mask = { + .ver_opt_len_o_c_rsvd0 = RTE_BE16(0x3f80), + .vni = "\xff\xff\xff", + .protocol = RTE_BE16(UINT16_MAX), + }; + + if (!priv->config.hca_attr.tunnel_stateless_geneve_rx) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Geneve is not enabled by device" + " parameter and/or not configured in" + " firmware"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_geneve_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_geneve), error); + if (ret) + return ret; + if (spec) { + gbhdr = rte_be_to_cpu_16(spec->ver_opt_len_o_c_rsvd0); + if (MLX5_GENEVE_VER_VAL(gbhdr) || + MLX5_GENEVE_CRITO_VAL(gbhdr) || + MLX5_GENEVE_RSVD_VAL(gbhdr) || spec->rsvd1) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Geneve protocol unsupported" + " fields are being used"); + if (MLX5_GENEVE_OPTLEN_VAL(gbhdr) > opt_len) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Unsupported Geneve options length"); + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Geneve tunnel must be fully defined"); + return 0; +} + +/** + * Validate MPLS item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] prev_layer + * The protocol layer indicated in previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused, + uint64_t prev_layer __rte_unused, + struct rte_flow_error *error) +{ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + const struct rte_flow_item_mpls *mask = item->mask; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (!priv->config.mpls_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "MPLS not supported or" + " disabled in firmware" + " configuration."); + /* MPLS over IP, UDP, GRE is allowed */ + if (!(prev_layer & (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4_UDP | + MLX5_FLOW_LAYER_GRE))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with MPLS layer"); + /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. 
*/ + if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) && + !(item_flags & MLX5_FLOW_LAYER_GRE)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!mask) + mask = &rte_flow_item_mpls_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_mpls_mask, + sizeof(struct rte_flow_item_mpls), error); + if (ret < 0) + return ret; + return 0; +#endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "MPLS is not supported by Verbs, please" + " update."); +} + +/** + * Validate NVGRE item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) +{ + const struct rte_flow_item_nvgre *mask = item->mask; + int ret; + + if (target_protocol != 0xff && target_protocol != IPPROTO_GRE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "protocol filtering not compatible" + " with this GRE layer"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Layer is missing"); + if (!mask) + mask = &rte_flow_item_nvgre_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_nvgre_mask, + sizeof(struct rte_flow_item_nvgre), error); + if (ret < 0) + return ret; + return 0; +} + +/* Allocate unique ID for the split Q/RSS subflows. */ +static uint32_t +flow_qrss_get_id(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t qrss_id, ret; + + ret = mlx5_flow_id_get(priv->qrss_id_pool, &qrss_id); + if (ret) + return 0; + MLX5_ASSERT(qrss_id); + return qrss_id; +} + +/* Free unique ID for the split Q/RSS subflows. */ +static void +flow_qrss_free_id(struct rte_eth_dev *dev, uint32_t qrss_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (qrss_id) + mlx5_flow_id_release(priv->qrss_id_pool, qrss_id); +} + +/** + * Release resource related QUEUE/RSS action split. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Flow to release id's from. 
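 *
 * Editor's note (illustrative, not part of this patch): the ids released
 * here were obtained from flow_qrss_get_id() above, e.g.:
 *
 *   uint32_t id = flow_qrss_get_id(dev);   /* returns 0 on allocation failure */
 *   ...
 *   flow_qrss_free_id(dev, id);            /* no-op when id == 0 */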
+ */ +static void +flow_mreg_split_qrss_release(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + struct mlx5_flow_handle *dev_handle; + + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dev_handle, next) + if (dev_handle->split_flow_id) + flow_qrss_free_id(dev, dev_handle->split_flow_id); +} + +static int +flow_null_validate(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + bool external __rte_unused, + int hairpin __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static struct mlx5_flow * +flow_null_prepare(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); + return NULL; +} + +static int +flow_null_translate(struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static int +flow_null_apply(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +static void +flow_null_remove(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) +{ +} + +static void +flow_null_destroy(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) +{ +} + +static int +flow_null_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error) +{ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, NULL); +} + +/* Void driver to protect from null pointer reference. */ +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops = { + .validate = flow_null_validate, + .prepare = flow_null_prepare, + .translate = flow_null_translate, + .apply = flow_null_apply, + .remove = flow_null_remove, + .destroy = flow_null_destroy, + .query = flow_null_query, +}; + +/** + * Select flow driver type according to flow attributes and device + * configuration. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * + * @return + * flow driver type, MLX5_FLOW_TYPE_MAX otherwise. + */ +static enum mlx5_flow_drv_type +flow_get_drv_type(struct rte_eth_dev *dev, const struct rte_flow_attr *attr) +{ + struct mlx5_priv *priv = dev->data->dev_private; + enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX; + + if (attr->transfer && priv->config.dv_esw_en) + type = MLX5_FLOW_TYPE_DV; + if (!attr->transfer) + type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV : + MLX5_FLOW_TYPE_VERBS; + return type; +} + +#define flow_get_drv_ops(type) flow_drv_ops[type] + +/** + * Flow driver validation API. 
This abstracts calling driver specific functions. + * The type of flow driver is determined according to flow attributes. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, int hairpin, struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow_get_drv_type(dev, attr); + + fops = flow_get_drv_ops(type); + return fops->validate(dev, attr, items, actions, external, + hairpin, error); +} + +/** + * Flow driver preparation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * calculates the size of memory required for device flow, allocates the memory, + * initializes the device flow and returns the pointer. + * + * @note + * This function initializes device flow structure such as dv or verbs in + * struct mlx5_flow. However, it is caller's responsibility to initialize the + * rest. For example, adding returning device flow to flow->dev_flow list and + * setting backward reference to the flow should be done out of this function. + * layers field is not filled either. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to device flow on success, otherwise NULL and rte_errno is set. + */ +static inline struct mlx5_flow * +flow_drv_prepare(struct rte_eth_dev *dev, + const struct rte_flow *flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + uint32_t flow_idx, + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + struct mlx5_flow *mlx5_flow = NULL; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + mlx5_flow = fops->prepare(dev, attr, items, actions, error); + if (mlx5_flow) + mlx5_flow->flow_idx = flow_idx; + return mlx5_flow; +} + +/** + * Flow driver translation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * translates a generic flow into a driver flow. flow_drv_prepare() must + * precede. + * + * @note + * dev_flow->layers could be filled as a result of parsing during translation + * if needed by flow_drv_apply(). dev_flow->flow->actions can also be filled + * if necessary. As a flow can have multiple dev_flows by RSS flow expansion, + * flow->actions could be overwritten even though all the expanded dev_flows + * have the same actions. + * + * @param[in] dev + * Pointer to the rte dev structure. 
+ * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = dev_flow->flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->translate(dev, dev_flow, attr, items, actions, error); +} + +/** + * Flow driver apply API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It applies + * translated driver flows on to device. flow_drv_translate() must precede. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static inline int +flow_drv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->apply(dev, flow, error); +} + +/** + * Flow driver remove API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device. All the resources of the flow should be freed by calling + * flow_drv_destroy(). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static inline void +flow_drv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->remove(dev, flow); +} + +/** + * Flow driver destroy API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device and releases resources of the flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static inline void +flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; + + flow_mreg_split_qrss_release(dev, flow); + MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->destroy(dev, flow); +} + +/** + * Get RSS action from the action list. + * + * @param[in] actions + * Pointer to the list of actions. + * + * @return + * Pointer to the RSS action if exist, else return NULL. 
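 *
 * Illustrative pairing with find_graph_root() below (editor's sketch, not
 * part of this patch):
 *
 *   const struct rte_flow_action_rss *rss = flow_get_rss_action(actions);
 *   unsigned int graph_root = find_graph_root(items, rss ? rss->level : 0);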
*/ +static const struct rte_flow_action_rss* +flow_get_rss_action(const struct rte_flow_action actions[]) +{ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_RSS: + return (const struct rte_flow_action_rss *) + actions->conf; + default: + break; + } + } + return NULL; +} + +static unsigned int +find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) +{ + const struct rte_flow_item *item; + unsigned int has_vlan = 0; + + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) { + has_vlan = 1; + break; + } + } + if (has_vlan) + return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN : + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN; + return rss_level < 2 ? MLX5_EXPANSION_ROOT : + MLX5_EXPANSION_ROOT_OUTER; +} + +/** + * Get layer flags from the prefix flow. + * + * Some flows may be split into several subflows; the prefix subflow gets the + * match items and the suffix subflow gets the actions. + * Some actions need the user defined match item flags to get the detail for + * the action. + * This function helps the suffix flow to get the item layer flags from the + * prefix subflow. + * + * @param[in] dev_flow + * Pointer to the created prefix subflow. + * + * @return + * The layers obtained from the prefix subflow. + */ +static inline uint64_t +flow_get_prefix_layer_flags(struct mlx5_flow *dev_flow) +{ + uint64_t layers = 0; + + /* + * The layers bits could be cached in a local variable, but usually + * the compiler will do that optimization for us. + * If there is no decap action, use the layers directly. + */ + if (!(dev_flow->act_flags & MLX5_FLOW_ACTION_DECAP)) + return dev_flow->handle->layers; + /* Convert L3 layers with decap action. */ + if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV4) + layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L3_IPV6) + layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + /* Convert L4 layers with decap action. */ + if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_TCP) + layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + else if (dev_flow->handle->layers & MLX5_FLOW_LAYER_INNER_L4_UDP) + layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + return layers; +} + +/** + * Get metadata split action information. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] qrss + * Pointer to the return pointer. + * @param[out] qrss_type + * Pointer to the action type to return. RTE_FLOW_ACTION_TYPE_END is returned + * if no QUEUE/RSS is found. + * @param[out] encap_idx + * Pointer to the index of the encap action if exists, otherwise the last + * action index. + * + * @return + * Total number of actions. + */ +static int +flow_parse_metadata_split_actions_info(const struct rte_flow_action actions[], + const struct rte_flow_action **qrss, + int *encap_idx) +{ + const struct rte_flow_action_raw_encap *raw_encap; + int actions_n = 0; + int raw_decap_idx = -1; + + *encap_idx = -1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + *encap_idx = actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap_idx = actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + *encap_idx = raw_decap_idx != -1 ?
+ raw_decap_idx : actions_n; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + case RTE_FLOW_ACTION_TYPE_RSS: + *qrss = actions; + break; + default: + break; + } + actions_n++; + } + if (*encap_idx == -1) + *encap_idx = actions_n; + /* Count RTE_FLOW_ACTION_TYPE_END. */ + return actions_n + 1; +} + +/** + * Check meter action from the action list. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] mtr + * Pointer to the meter exist flag. + * + * @return + * Total number of actions. + */ +static int +flow_check_meter_action(const struct rte_flow_action actions[], uint32_t *mtr) +{ + int actions_n = 0; + + MLX5_ASSERT(mtr); + *mtr = 0; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_METER: + *mtr = 1; + break; + default: + break; + } + actions_n++; + } + /* Count RTE_FLOW_ACTION_TYPE_END. */ + return actions_n + 1; +} + +/** + * Check if the flow should be splited due to hairpin. + * The reason for the split is that in current HW we can't + * support encap on Rx, so if a flow have encap we move it + * to Tx. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] attr + * Flow rule attributes. + * @param[in] actions + * Associated actions (list terminated by the END action). + * + * @return + * > 0 the number of actions and the flow should be split, + * 0 when no split required. + */ +static int +flow_check_hairpin_split(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_action actions[]) +{ + int queue_action = 0; + int action_n = 0; + int encap = 0; + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_flow_action_raw_encap *raw_encap; + + if (!attr->ingress) + return 0; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = actions->conf; + if (queue == NULL) + return 0; + if (mlx5_rxq_get_type(dev, queue->index) != + MLX5_RXQ_TYPE_HAIRPIN) + return 0; + queue_action = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + if (rss == NULL || rss->queue_num == 0) + return 0; + if (mlx5_rxq_get_type(dev, rss->queue[0]) != + MLX5_RXQ_TYPE_HAIRPIN) + return 0; + queue_action = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + encap = 1; + action_n++; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) + encap = 1; + action_n++; + break; + default: + action_n++; + break; + } + } + if (encap == 1 && queue_action) + return action_n; + return 0; +} + +/* Declare flow create/destroy prototype in advance. */ +static uint32_t +flow_list_create(struct rte_eth_dev *dev, uint32_t *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, struct rte_flow_error *error); + +static void +flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list, + uint32_t flow_idx); + +/** + * Add a flow of copying flow metadata registers in RX_CP_TBL. + * + * As mark_id is unique, if there's already a registered flow for the mark_id, + * return by increasing the reference counter of the resource. Otherwise, create + * the resource (mcp_res) and flow. 
+ * + * Flow looks like, + * - If ingress port is ANY and reg_c[1] is mark_id, + * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * For default flow (zero mark_id), flow is like, + * - If ingress port is ANY, + * reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param mark_id + * ID of MARK action, zero means default flow for META. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Associated resource on success, NULL otherwise and rte_errno is set. + */ +static struct mlx5_flow_mreg_copy_resource * +flow_mreg_add_copy_action(struct rte_eth_dev *dev, uint32_t mark_id, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_attr attr = { + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + .ingress = 1, + }; + struct mlx5_rte_flow_item_tag tag_spec = { + .data = mark_id, + }; + struct rte_flow_item items[] = { + [1] = { .type = RTE_FLOW_ITEM_TYPE_END, }, + }; + struct rte_flow_action_mark ftag = { + .id = mark_id, + }; + struct mlx5_flow_action_copy_mreg cp_mreg = { + .dst = REG_B, + .src = 0, + }; + struct rte_flow_action_jump jump = { + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + }; + struct rte_flow_action actions[] = { + [3] = { .type = RTE_FLOW_ACTION_TYPE_END, }, + }; + struct mlx5_flow_mreg_copy_resource *mcp_res; + uint32_t idx = 0; + int ret; + + /* Fill the register fileds in the flow. */ + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return NULL; + tag_spec.id = ret; + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error); + if (ret < 0) + return NULL; + cp_mreg.src = ret; + /* Check if already registered. */ + MLX5_ASSERT(priv->mreg_cp_tbl); + mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl, mark_id); + if (mcp_res) { + /* For non-default rule. */ + if (mark_id != MLX5_DEFAULT_COPY_ID) + mcp_res->refcnt++; + MLX5_ASSERT(mark_id != MLX5_DEFAULT_COPY_ID || + mcp_res->refcnt == 1); + return mcp_res; + } + /* Provide the full width of FLAG specific value. */ + if (mark_id == (priv->sh->dv_regc0_mask & MLX5_FLOW_MARK_DEFAULT)) + tag_spec.data = MLX5_FLOW_MARK_DEFAULT; + /* Build a new flow. */ + if (mark_id != MLX5_DEFAULT_COPY_ID) { + items[0] = (struct rte_flow_item){ + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &tag_spec, + }; + items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_MARK, + .conf = &ftag, + }; + actions[1] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &jump, + }; + actions[3] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } else { + /* Default rule, wildcard match. */ + attr.priority = MLX5_FLOW_PRIO_RSVD; + items[0] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[1] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &jump, + }; + actions[2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } + /* Build a new entry. 
*/ + mcp_res = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MCP], &idx); + if (!mcp_res) { + rte_errno = ENOMEM; + return NULL; + } + mcp_res->idx = idx; + /* + * The copy Flows are not included in any list. There + * ones are referenced from other Flows and can not + * be applied, removed, deleted in ardbitrary order + * by list traversing. + */ + mcp_res->rix_flow = flow_list_create(dev, NULL, &attr, items, + actions, false, error); + if (!mcp_res->rix_flow) + goto error; + mcp_res->refcnt++; + mcp_res->hlist_ent.key = mark_id; + ret = mlx5_hlist_insert(priv->mreg_cp_tbl, + &mcp_res->hlist_ent); + MLX5_ASSERT(!ret); + if (ret) + goto error; + return mcp_res; +error: + if (mcp_res->rix_flow) + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); + return NULL; +} + +/** + * Release flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. + */ +static void +flow_mreg_del_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow->rix_mreg_copy) + return; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res || !priv->mreg_cp_tbl) + return; + if (flow->copy_applied) { + MLX5_ASSERT(mcp_res->appcnt); + flow->copy_applied = 0; + --mcp_res->appcnt; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) + flow_drv_remove(dev, mcp_flow); + } + } + /* + * We do not check availability of metadata registers here, + * because copy resources are not allocated in this case. + */ + if (--mcp_res->refcnt) + return; + MLX5_ASSERT(mcp_res->rix_flow); + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); + flow->rix_mreg_copy = 0; +} + +/** + * Start flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_mreg_start_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (!flow->rix_mreg_copy || flow->copy_applied) + return 0; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res) + return 0; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) { + ret = flow_drv_apply(dev, mcp_flow, NULL); + if (ret) + return ret; + } + } + ++mcp_res->appcnt; + flow->copy_applied = 1; + return 0; +} + +/** + * Stop flow in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @flow + * Parent flow for wich copying is provided. 
+ */ +static void +flow_mreg_stop_copy_action(struct rte_eth_dev *dev, + struct rte_flow *flow) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow->rix_mreg_copy || !flow->copy_applied) + return; + mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], + flow->rix_mreg_copy); + if (!mcp_res) + return; + MLX5_ASSERT(mcp_res->appcnt); + --mcp_res->appcnt; + flow->copy_applied = 0; + if (!mcp_res->appcnt) { + struct rte_flow *mcp_flow = mlx5_ipool_get + (priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + mcp_res->rix_flow); + + if (mcp_flow) + flow_drv_remove(dev, mcp_flow); + } +} + +/** + * Remove the default copy action from RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_mreg_del_default_copy_action(struct rte_eth_dev *dev) +{ + struct mlx5_flow_mreg_copy_resource *mcp_res; + struct mlx5_priv *priv = dev->data->dev_private; + + /* Check if default flow is registered. */ + if (!priv->mreg_cp_tbl) + return; + mcp_res = (void *)mlx5_hlist_lookup(priv->mreg_cp_tbl, + MLX5_DEFAULT_COPY_ID); + if (!mcp_res) + return; + MLX5_ASSERT(mcp_res->rix_flow); + flow_list_destroy(dev, NULL, mcp_res->rix_flow); + mlx5_hlist_remove(priv->mreg_cp_tbl, &mcp_res->hlist_ent); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx); +} + +/** + * Add the default copy action in in RX_CP_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 for success, negative value otherwise and rte_errno is set. + */ +static int +flow_mreg_add_default_copy_action(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_mreg_copy_resource *mcp_res; + + /* Check whether extensive metadata feature is engaged. */ + if (!priv->config.dv_flow_en || + priv->config.dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev) || + !priv->sh->dv_regc0_mask) + return 0; + mcp_res = flow_mreg_add_copy_action(dev, MLX5_DEFAULT_COPY_ID, error); + if (!mcp_res) + return -rte_errno; + return 0; +} + +/** + * Add a flow of copying flow metadata registers in RX_CP_TBL. + * + * All the flow having Q/RSS action should be split by + * flow_mreg_split_qrss_prep() to pass by RX_CP_TBL. A flow in the RX_CP_TBL + * performs the following, + * - CQE->flow_tag := reg_c[1] (MARK) + * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META) + * As CQE's flow_tag is not a register, it can't be simply copied from reg_c[1] + * but there should be a flow per each MARK ID set by MARK action. + * + * For the aforementioned reason, if there's a MARK action in flow's action + * list, a corresponding flow should be added to the RX_CP_TBL in order to copy + * the MARK ID to CQE's flow_tag like, + * - If reg_c[1] is mark_id, + * flow_tag := mark_id, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * For SET_META action which stores value in reg_c[0], as the destination is + * also a flow metadata register (reg_b), adding a default flow is enough. Zero + * MARK ID means the default flow. The default flow looks like, + * - For all flow, reg_b := reg_c[0] and jump to RX_ACT_TBL. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to flow structure. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, negative value otherwise and rte_errno is set. 
+ */ +static int +flow_mreg_update_copy_table(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + struct mlx5_flow_mreg_copy_resource *mcp_res; + const struct rte_flow_action_mark *mark; + + /* Check whether extensive metadata feature is engaged. */ + if (!config->dv_flow_en || + config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev) || + !priv->sh->dv_regc0_mask) + return 0; + /* Find MARK action. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_FLAG: + mcp_res = flow_mreg_add_copy_action + (dev, MLX5_FLOW_MARK_DEFAULT, error); + if (!mcp_res) + return -rte_errno; + flow->rix_mreg_copy = mcp_res->idx; + if (dev->data->dev_started) { + mcp_res->appcnt++; + flow->copy_applied = 1; + } + return 0; + case RTE_FLOW_ACTION_TYPE_MARK: + mark = (const struct rte_flow_action_mark *) + actions->conf; + mcp_res = + flow_mreg_add_copy_action(dev, mark->id, error); + if (!mcp_res) + return -rte_errno; + flow->rix_mreg_copy = mcp_res->idx; + if (dev->data->dev_started) { + mcp_res->appcnt++; + flow->copy_applied = 1; + } + return 0; + default: + break; + } + } + return 0; +} + +#define MLX5_MAX_SPLIT_ACTIONS 24 +#define MLX5_MAX_SPLIT_ITEMS 24 + +/** + * Split the hairpin flow. + * Since HW can't support encap on Rx we move the encap to Tx. + * If the count action is after the encap then we also + * move the count action. in this case the count will also measure + * the outer bytes. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] actions_rx + * Rx flow actions. + * @param[out] actions_tx + * Tx flow actions.. + * @param[out] pattern_tx + * The pattern items for the Tx flow. + * @param[out] flow_id + * The flow ID connected to this flow. + * + * @return + * 0 on success. 
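+ *
+ * Rough sketch (example action list, assuming the queue is a hairpin queue):
+ *   ingress actions [raw_encap (size above the L2+IPv4 threshold) /
+ *   queue / end]
+ * are split so that the encap moves to the Tx action list, while the Rx
+ * list keeps the queue action plus an internal SET_TAG carrying the
+ * generated flow_id; the Tx pattern then matches that tag item to tie the
+ * two halves together.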
+ */ +static int +flow_hairpin_split(struct rte_eth_dev *dev, + const struct rte_flow_action actions[], + struct rte_flow_action actions_rx[], + struct rte_flow_action actions_tx[], + struct rte_flow_item pattern_tx[], + uint32_t *flow_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_raw_encap *raw_encap; + const struct rte_flow_action_raw_decap *raw_decap; + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_action *tag_action; + struct mlx5_rte_flow_item_tag *tag_item; + struct rte_flow_item *item; + char *addr; + int encap = 0; + + mlx5_flow_id_get(priv->sh->flow_id_pool, flow_id); + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + rte_memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + if (encap) { + rte_memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size > + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) { + memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + encap = 1; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap = actions->conf; + if (raw_decap->size < + (sizeof(struct rte_flow_item_eth) + + sizeof(struct rte_flow_item_ipv4))) { + memcpy(actions_tx, actions, + sizeof(struct rte_flow_action)); + actions_tx++; + } else { + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + } + break; + default: + rte_memcpy(actions_rx, actions, + sizeof(struct rte_flow_action)); + actions_rx++; + break; + } + } + /* Add set meta action and end action for the Rx flow. */ + tag_action = actions_rx; + tag_action->type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + actions_rx++; + rte_memcpy(actions_rx, actions, sizeof(struct rte_flow_action)); + actions_rx++; + set_tag = (void *)actions_rx; + set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_RX, 0, NULL); + MLX5_ASSERT(set_tag->id > REG_NONE); + set_tag->data = *flow_id; + tag_action->conf = set_tag; + /* Create Tx item list. */ + rte_memcpy(actions_tx, actions, sizeof(struct rte_flow_action)); + addr = (void *)&pattern_tx[2]; + item = pattern_tx; + item->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG; + tag_item = (void *)addr; + tag_item->data = *flow_id; + tag_item->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_TX, 0, NULL); + MLX5_ASSERT(set_tag->id > REG_NONE); + item->spec = tag_item; + addr += sizeof(struct mlx5_rte_flow_item_tag); + tag_item = (void *)addr; + tag_item->data = UINT32_MAX; + tag_item->id = UINT16_MAX; + item->mask = tag_item; + addr += sizeof(struct mlx5_rte_flow_item_tag); + item->last = NULL; + item++; + item->type = RTE_FLOW_ITEM_TYPE_END; + return 0; +} + +/** + * The last stage of splitting chain, just creates the subflow + * without any modification. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Parent flow structure pointer. + * @param[in, out] sub_flow + * Pointer to return the created subflow, may be NULL. + * @param[in] prefix_layers + * Prefix subflow layers, may be 0. 
+ * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * 0 on success, negative value otherwise + */ +static int +flow_create_split_inner(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct mlx5_flow **sub_flow, + uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, uint32_t flow_idx, + struct rte_flow_error *error) +{ + struct mlx5_flow *dev_flow; + + dev_flow = flow_drv_prepare(dev, flow, attr, items, actions, + flow_idx, error); + if (!dev_flow) + return -rte_errno; + dev_flow->flow = flow; + dev_flow->external = external; + /* Subflow object was created, we must include one in the list. */ + SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx, + dev_flow->handle, next); + /* + * If dev_flow is as one of the suffix flow, some actions in suffix + * flow may need some user defined item layer flags. + */ + if (prefix_layers) + dev_flow->handle->layers = prefix_layers; + if (sub_flow) + *sub_flow = dev_flow; + return flow_drv_translate(dev, dev_flow, attr, items, actions, error); +} + +/** + * Split the meter flow. + * + * As meter flow will split to three sub flow, other than meter + * action, the other actions make sense to only meter accepts + * the packet. If it need to be dropped, no other additional + * actions should be take. + * + * One kind of special action which decapsulates the L3 tunnel + * header will be in the prefix sub flow, as not to take the + * L3 tunnel header into account. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[out] sfx_items + * Suffix flow match items (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] actions_sfx + * Suffix flow actions. + * @param[out] actions_pre + * Prefix flow actions. + * @param[out] pattern_sfx + * The pattern items for the suffix flow. + * @param[out] tag_sfx + * Pointer to suffix flow tag. + * + * @return + * 0 on success. + */ +static int +flow_meter_split_prep(struct rte_eth_dev *dev, + const struct rte_flow_item items[], + struct rte_flow_item sfx_items[], + const struct rte_flow_action actions[], + struct rte_flow_action actions_sfx[], + struct rte_flow_action actions_pre[]) +{ + struct rte_flow_action *tag_action = NULL; + struct rte_flow_item *tag_item; + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_error error; + const struct rte_flow_action_raw_encap *raw_encap; + const struct rte_flow_action_raw_decap *raw_decap; + struct mlx5_rte_flow_item_tag *tag_spec; + struct mlx5_rte_flow_item_tag *tag_mask; + uint32_t tag_id; + bool copy_vlan = false; + + /* Prepare the actions for prefix and suffix flow. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + struct rte_flow_action **action_cur = NULL; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_METER: + /* Add the extra tag action first. 
*/ + tag_action = actions_pre; + tag_action->type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + actions_pre++; + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; + if (raw_encap->size < MLX5_ENCAPSULATION_DECISION_SIZE) + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap = actions->conf; + if (raw_decap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + copy_vlan = true; + break; + default: + break; + } + if (!action_cur) + action_cur = &actions_sfx; + memcpy(*action_cur, actions, sizeof(struct rte_flow_action)); + (*action_cur)++; + } + /* Add end action to the actions. */ + actions_sfx->type = RTE_FLOW_ACTION_TYPE_END; + actions_pre->type = RTE_FLOW_ACTION_TYPE_END; + actions_pre++; + /* Set the tag. */ + set_tag = (void *)actions_pre; + set_tag->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error); + /* + * Get the id from the qrss_pool to make qrss share the id with meter. + */ + tag_id = flow_qrss_get_id(dev); + set_tag->data = tag_id << MLX5_MTR_COLOR_BITS; + assert(tag_action); + tag_action->conf = set_tag; + /* Prepare the suffix subflow items. */ + tag_item = sfx_items++; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int item_type = items->type; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_PORT_ID: + memcpy(sfx_items, items, sizeof(*sfx_items)); + sfx_items++; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + if (copy_vlan) { + memcpy(sfx_items, items, sizeof(*sfx_items)); + /* + * Convert to internal match item, it is used + * for vlan push and set vid. + */ + sfx_items->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_VLAN; + sfx_items++; + } + break; + default: + break; + } + } + sfx_items->type = RTE_FLOW_ITEM_TYPE_END; + sfx_items++; + tag_spec = (struct mlx5_rte_flow_item_tag *)sfx_items; + tag_spec->data = tag_id << MLX5_MTR_COLOR_BITS; + tag_spec->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error); + tag_mask = tag_spec + 1; + tag_mask->data = 0xffffff00; + tag_item->type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG; + tag_item->spec = tag_spec; + tag_item->last = NULL; + tag_item->mask = tag_mask; + return tag_id; +} + +/** + * Split action list having QUEUE/RSS for metadata register copy. + * + * Once Q/RSS action is detected in user's action list, the flow action + * should be split in order to copy metadata registers, which will happen in + * RX_CP_TBL like, + * - CQE->flow_tag := reg_c[1] (MARK) + * - CQE->flow_table_metadata (reg_b) := reg_c[0] (META) + * The Q/RSS action will be performed on RX_ACT_TBL after passing by RX_CP_TBL. + * This is because the last action of each flow must be a terminal action + * (QUEUE, RSS or DROP). + * + * Flow ID must be allocated to identify actions in the RX_ACT_TBL and it is + * stored and kept in the mlx5_flow structure per each sub_flow. + * + * The Q/RSS action is replaced with, + * - SET_TAG, setting the allocated flow ID to reg_c[2]. + * And the following JUMP action is added at the end, + * - JUMP, to RX_CP_TBL. + * + * A flow to perform remained Q/RSS action will be created in RX_ACT_TBL by + * flow_create_split_metadata() routine. The flow will look like, + * - If flow ID matches (reg_c[2]), perform Q/RSS. 
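+ * Example of the rewrite (schematic only): user actions [rss / end] become
+ *   [set_tag(reg_c[2] := flow_id) / jump group RX_CP_TBL / end]
+ * in the prefix flow, and the original Q/RSS is re-created by
+ * flow_create_split_metadata() in RX_ACT_TBL behind a match on the same
+ * flow_id tag.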
+ * + * @param dev + * Pointer to Ethernet device. + * @param[out] split_actions + * Pointer to store split actions to jump to CP_TBL. + * @param[in] actions + * Pointer to the list of original flow actions. + * @param[in] qrss + * Pointer to the Q/RSS action. + * @param[in] actions_n + * Number of original actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * non-zero unique flow_id on success, otherwise 0 and + * error/rte_error are set. + */ +static uint32_t +flow_mreg_split_qrss_prep(struct rte_eth_dev *dev, + struct rte_flow_action *split_actions, + const struct rte_flow_action *actions, + const struct rte_flow_action *qrss, + int actions_n, struct rte_flow_error *error) +{ + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_action_jump *jump; + const int qrss_idx = qrss - actions; + uint32_t flow_id = 0; + int ret = 0; + + /* + * Given actions will be split + * - Replace QUEUE/RSS action with SET_TAG to set flow ID. + * - Add jump to mreg CP_TBL. + * As a result, there will be one more action. + */ + ++actions_n; + memcpy(split_actions, actions, sizeof(*split_actions) * actions_n); + set_tag = (void *)(split_actions + actions_n); + /* + * If tag action is not set to void(it means we are not the meter + * suffix flow), add the tag action. Since meter suffix flow already + * has the tag added. + */ + if (split_actions[qrss_idx].type != RTE_FLOW_ACTION_TYPE_VOID) { + /* + * Allocate the new subflow ID. This one is unique within + * device and not shared with representors. Otherwise, + * we would have to resolve multi-thread access synch + * issue. Each flow on the shared device is appended + * with source vport identifier, so the resulting + * flows will be unique in the shared (by master and + * representors) domain even if they have coinciding + * IDs. + */ + flow_id = flow_qrss_get_id(dev); + if (!flow_id) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't allocate id " + "for split Q/RSS subflow"); + /* Internal SET_TAG action to set flow ID. */ + *set_tag = (struct mlx5_rte_flow_action_set_tag){ + .data = flow_id, + }; + ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0, error); + if (ret < 0) + return ret; + set_tag->id = ret; + /* Construct new actions array. */ + /* Replace QUEUE/RSS action. */ + split_actions[qrss_idx] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG, + .conf = set_tag, + }; + } + /* JUMP action to jump to mreg copy table (CP_TBL). */ + jump = (void *)(set_tag + 1); + *jump = (struct rte_flow_action_jump){ + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + }; + split_actions[actions_n - 2] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = jump, + }; + split_actions[actions_n - 1] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + return flow_id; +} + +/** + * Extend the given action list for Tx metadata copy. + * + * Copy the given action list to the ext_actions and add flow metadata register + * copy action in order to copy reg_a set by WQE to reg_c[0]. + * + * @param[out] ext_actions + * Pointer to the extended action list. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] actions_n + * Number of actions in the list. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in] encap_idx + * The encap action inndex. 
+ * + * @return + * 0 on success, negative value otherwise + */ +static int +flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + struct rte_flow_action *ext_actions, + const struct rte_flow_action *actions, + int actions_n, struct rte_flow_error *error, + int encap_idx) +{ + struct mlx5_flow_action_copy_mreg *cp_mreg = + (struct mlx5_flow_action_copy_mreg *) + (ext_actions + actions_n + 1); + int ret; + + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_RX, 0, error); + if (ret < 0) + return ret; + cp_mreg->dst = ret; + ret = mlx5_flow_get_reg_id(dev, MLX5_METADATA_TX, 0, error); + if (ret < 0) + return ret; + cp_mreg->src = ret; + if (encap_idx != 0) + memcpy(ext_actions, actions, sizeof(*ext_actions) * encap_idx); + if (encap_idx == actions_n - 1) { + ext_actions[actions_n - 1] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = cp_mreg, + }; + ext_actions[actions_n] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_END, + }; + } else { + ext_actions[encap_idx] = (struct rte_flow_action){ + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = cp_mreg, + }; + memcpy(ext_actions + encap_idx + 1, actions + encap_idx, + sizeof(*ext_actions) * (actions_n - encap_idx)); + } + return 0; +} + +/** + * The splitting for metadata feature. + * + * - Q/RSS action on NIC Rx should be split in order to pass by + * the mreg copy table (RX_CP_TBL) and then it jumps to the + * action table (RX_ACT_TBL) which has the split Q/RSS action. + * + * - All the actions on NIC Tx should have a mreg copy action to + * copy reg_a from WQE to reg_c[0]. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Parent flow structure pointer. + * @param[in] prefix_layers + * Prefix flow layer flags. + * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] flow_idx + * This memory pool index to the flow. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * 0 on success, negative value otherwise + */ +static int +flow_create_split_metadata(struct rte_eth_dev *dev, + struct rte_flow *flow, + uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, uint32_t flow_idx, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_action *qrss = NULL; + struct rte_flow_action *ext_actions = NULL; + struct mlx5_flow *dev_flow = NULL; + uint32_t qrss_id = 0; + int mtr_sfx = 0; + size_t act_size; + int actions_n; + int encap_idx; + int ret; + + /* Check whether extensive metadata feature is engaged. */ + if (!config->dv_flow_en || + config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev)) + return flow_create_split_inner(dev, flow, NULL, prefix_layers, + attr, items, actions, external, + flow_idx, error); + actions_n = flow_parse_metadata_split_actions_info(actions, &qrss, + &encap_idx); + if (qrss) { + /* Exclude hairpin flows from splitting. 
*/ + if (qrss->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + const struct rte_flow_action_queue *queue; + + queue = qrss->conf; + if (mlx5_rxq_get_type(dev, queue->index) == + MLX5_RXQ_TYPE_HAIRPIN) + qrss = NULL; + } else if (qrss->type == RTE_FLOW_ACTION_TYPE_RSS) { + const struct rte_flow_action_rss *rss; + + rss = qrss->conf; + if (mlx5_rxq_get_type(dev, rss->queue[0]) == + MLX5_RXQ_TYPE_HAIRPIN) + qrss = NULL; + } + } + if (qrss) { + /* Check if it is in meter suffix table. */ + mtr_sfx = attr->group == (attr->transfer ? + (MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) : + MLX5_FLOW_TABLE_LEVEL_SUFFIX); + /* + * Q/RSS action on NIC Rx should be split in order to pass by + * the mreg copy table (RX_CP_TBL) and then it jumps to the + * action table (RX_ACT_TBL) which has the split Q/RSS action. + */ + act_size = sizeof(struct rte_flow_action) * (actions_n + 1) + + sizeof(struct rte_flow_action_set_tag) + + sizeof(struct rte_flow_action_jump); + ext_actions = rte_zmalloc(__func__, act_size, 0); + if (!ext_actions) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no memory to split " + "metadata flow"); + /* + * If we are the suffix flow of meter, tag already exist. + * Set the tag action to void. + */ + if (mtr_sfx) + ext_actions[qrss - actions].type = + RTE_FLOW_ACTION_TYPE_VOID; + else + ext_actions[qrss - actions].type = + (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + /* + * Create the new actions list with removed Q/RSS action + * and appended set tag and jump to register copy table + * (RX_CP_TBL). We should preallocate unique tag ID here + * in advance, because it is needed for set tag action. + */ + qrss_id = flow_mreg_split_qrss_prep(dev, ext_actions, actions, + qrss, actions_n, error); + if (!mtr_sfx && !qrss_id) { + ret = -rte_errno; + goto exit; + } + } else if (attr->egress && !attr->transfer) { + /* + * All the actions on NIC Tx should have a metadata register + * copy action to copy reg_a from WQE to reg_c[meta] + */ + act_size = sizeof(struct rte_flow_action) * (actions_n + 1) + + sizeof(struct mlx5_flow_action_copy_mreg); + ext_actions = rte_zmalloc(__func__, act_size, 0); + if (!ext_actions) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no memory to split " + "metadata flow"); + /* Create the action list appended with copy register. */ + ret = flow_mreg_tx_copy_prep(dev, ext_actions, actions, + actions_n, error, encap_idx); + if (ret < 0) + goto exit; + } + /* Add the unmodified original or prefix subflow. */ + ret = flow_create_split_inner(dev, flow, &dev_flow, prefix_layers, attr, + items, ext_actions ? ext_actions : + actions, external, flow_idx, error); + if (ret < 0) + goto exit; + MLX5_ASSERT(dev_flow); + if (qrss) { + const struct rte_flow_attr q_attr = { + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + .ingress = 1, + }; + /* Internal PMD action to set register. */ + struct mlx5_rte_flow_item_tag q_tag_spec = { + .data = qrss_id, + .id = 0, + }; + struct rte_flow_item q_items[] = { + { + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &q_tag_spec, + .last = NULL, + .mask = NULL, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action q_actions[] = { + { + .type = qrss->type, + .conf = qrss->conf, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint64_t layers = flow_get_prefix_layer_flags(dev_flow); + + /* + * Configure the tag item only if there is no meter subflow. 
+	 * Since tag is already marked in the meter suffix subflow,
+	 * we can just use the meter suffix items as is.
+	 */
+	if (qrss_id) {
+		/* Not meter subflow. */
+		MLX5_ASSERT(!mtr_sfx);
+		/*
+		 * Put the unique id into the prefix flow: the prefix is
+		 * destroyed after the suffix flow, so the id is freed only
+		 * when no actual flows with this id remain and identifier
+		 * reallocation becomes possible (for example, for other
+		 * flows in other threads).
+		 */
+		dev_flow->handle->split_flow_id = qrss_id;
+		ret = mlx5_flow_get_reg_id(dev, MLX5_COPY_MARK, 0,
+					   error);
+		if (ret < 0)
+			goto exit;
+		q_tag_spec.id = ret;
+	}
+	dev_flow = NULL;
+	/* Add suffix subflow to execute Q/RSS. */
+	ret = flow_create_split_inner(dev, flow, &dev_flow, layers,
+				      &q_attr, mtr_sfx ? items :
+				      q_items, q_actions,
+				      external, flow_idx, error);
+	if (ret < 0)
+		goto exit;
+	/* qrss ID should be freed if failed. */
+	qrss_id = 0;
+	MLX5_ASSERT(dev_flow);
+	}
+
+exit:
+	/*
+	 * We do not destroy the partially created sub_flows in case of error.
+	 * These ones are included into parent flow list and will be destroyed
+	 * by flow_drv_destroy.
+	 */
+	flow_qrss_free_id(dev, qrss_id);
+	rte_free(ext_actions);
+	return ret;
+}
+
+/**
+ * The splitting for meter feature.
+ *
+ * - The meter flow will be split to two flows as prefix and
+ *   suffix flow. The packets reach the suffix flow only if they
+ *   pass the prefix meter action.
+ *
+ * - Reg_C_5 is used to match the packet between the prefix and
+ *   suffix flows.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] flow
+ *   Parent flow structure pointer.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[in] external
+ *   This flow rule is created by request external to PMD.
+ * @param[in] flow_idx
+ *   This memory pool index to the flow.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @return
+ *   0 on success, negative value otherwise
+ */
+static int
+flow_create_split_meter(struct rte_eth_dev *dev,
+			struct rte_flow *flow,
+			const struct rte_flow_attr *attr,
+			const struct rte_flow_item items[],
+			const struct rte_flow_action actions[],
+			bool external, uint32_t flow_idx,
+			struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_action *sfx_actions = NULL;
+	struct rte_flow_action *pre_actions = NULL;
+	struct rte_flow_item *sfx_items = NULL;
+	struct mlx5_flow *dev_flow = NULL;
+	struct rte_flow_attr sfx_attr = *attr;
+	uint32_t mtr = 0;
+	uint32_t mtr_tag_id = 0;
+	size_t act_size;
+	size_t item_size;
+	int actions_n = 0;
+	int ret;
+
+	if (priv->mtr_en)
+		actions_n = flow_check_meter_action(actions, &mtr);
+	if (mtr) {
+		/* The five prefix actions: meter, decap, encap, tag, end. */
+		act_size = sizeof(struct rte_flow_action) * (actions_n + 5) +
+			   sizeof(struct mlx5_rte_flow_action_set_tag);
+		/* tag, vlan, port id, end. */
+#define METER_SUFFIX_ITEM 4
+		item_size = sizeof(struct rte_flow_item) * METER_SUFFIX_ITEM +
+			    sizeof(struct mlx5_rte_flow_item_tag) * 2;
+		sfx_actions = rte_zmalloc(__func__, (act_size + item_size), 0);
+		if (!sfx_actions)
+			return rte_flow_error_set(error, ENOMEM,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  NULL, "no memory to split "
+						  "meter flow");
+		sfx_items = (struct rte_flow_item *)((char *)sfx_actions +
+			     act_size);
+		pre_actions = sfx_actions + actions_n;
+		mtr_tag_id = flow_meter_split_prep(dev, items, sfx_items,
+						   actions, sfx_actions,
+						   pre_actions);
+		if (!mtr_tag_id) {
+			ret = -rte_errno;
+			goto exit;
+		}
+		/* Add the prefix subflow. */
+		ret = flow_create_split_inner(dev, flow, &dev_flow, 0, attr,
+					      items, pre_actions, external,
+					      flow_idx, error);
+		if (ret) {
+			ret = -rte_errno;
+			goto exit;
+		}
+		dev_flow->handle->split_flow_id = mtr_tag_id;
+		/* Setting the suffix group attr. */
+		sfx_attr.group = sfx_attr.transfer ?
+				(MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) :
+				 MLX5_FLOW_TABLE_LEVEL_SUFFIX;
+	}
+	/* Add the suffix or the original (unsplit) flow. */
+	ret = flow_create_split_metadata(dev, flow, dev_flow ?
+					 flow_get_prefix_layer_flags(dev_flow) :
+					 0, &sfx_attr,
+					 sfx_items ? sfx_items : items,
+					 sfx_actions ? sfx_actions : actions,
+					 external, flow_idx, error);
+exit:
+	if (sfx_actions)
+		rte_free(sfx_actions);
+	return ret;
+}
+
+/**
+ * Split the flow to subflow set. The splitters might be linked
+ * in the chain, like this:
+ * flow_create_split_outer() calls:
+ *   flow_create_split_meter() calls:
+ *     flow_create_split_metadata(meter_subflow_0) calls:
+ *       flow_create_split_inner(metadata_subflow_0)
+ *       flow_create_split_inner(metadata_subflow_1)
+ *       flow_create_split_inner(metadata_subflow_2)
+ *     flow_create_split_metadata(meter_subflow_1) calls:
+ *       flow_create_split_inner(metadata_subflow_0)
+ *       flow_create_split_inner(metadata_subflow_1)
+ *       flow_create_split_inner(metadata_subflow_2)
+ *
+ * This provides a flexible way to add new levels of flow splitting.
+ * All successfully created subflows are included in the
+ * parent flow dev_flow list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] flow
+ *   Parent flow structure pointer.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[in] external
+ *   This flow rule is created by request external to PMD.
+ * @param[in] flow_idx
+ *   This memory pool index to the flow.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @return
+ *   0 on success, negative value otherwise
+ */
+static int
+flow_create_split_outer(struct rte_eth_dev *dev,
+			struct rte_flow *flow,
+			const struct rte_flow_attr *attr,
+			const struct rte_flow_item items[],
+			const struct rte_flow_action actions[],
+			bool external, uint32_t flow_idx,
+			struct rte_flow_error *error)
+{
+	int ret;
+
+	ret = flow_create_split_meter(dev, flow, attr, items,
+				      actions, external, flow_idx, error);
+	MLX5_ASSERT(ret <= 0);
+	return ret;
+}
+
+/**
+ * Create a flow and add it to @p list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param list
+ *   Pointer to a TAILQ flow list. If this parameter is NULL,
+ *   no list insertion occurs; the flow is just created and it is
+ *   the caller's responsibility to track the created flow.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A flow index on success, 0 otherwise and rte_errno is set. + */ +static uint32_t +flow_list_create(struct rte_eth_dev *dev, uint32_t *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + struct mlx5_flow *dev_flow; + const struct rte_flow_action_rss *rss; + union { + struct rte_flow_expand_rss buf; + uint8_t buffer[2048]; + } expand_buffer; + union { + struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS]; + uint8_t buffer[2048]; + } actions_rx; + union { + struct rte_flow_action actions[MLX5_MAX_SPLIT_ACTIONS]; + uint8_t buffer[2048]; + } actions_hairpin_tx; + union { + struct rte_flow_item items[MLX5_MAX_SPLIT_ITEMS]; + uint8_t buffer[2048]; + } items_tx; + struct rte_flow_expand_rss *buf = &expand_buffer.buf; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc)[!!priv->flow_idx]; + const struct rte_flow_action *p_actions_rx = actions; + uint32_t i; + uint32_t idx = 0; + int hairpin_flow; + uint32_t hairpin_id = 0; + struct rte_flow_attr attr_tx = { .priority = 0 }; + int ret; + + hairpin_flow = flow_check_hairpin_split(dev, attr, actions); + ret = flow_drv_validate(dev, attr, items, p_actions_rx, + external, hairpin_flow, error); + if (ret < 0) + return 0; + if (hairpin_flow > 0) { + if (hairpin_flow > MLX5_MAX_SPLIT_ACTIONS) { + rte_errno = EINVAL; + return 0; + } + flow_hairpin_split(dev, actions, actions_rx.actions, + actions_hairpin_tx.actions, items_tx.items, + &hairpin_id); + p_actions_rx = actions_rx.actions; + } + flow = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], &idx); + if (!flow) { + rte_errno = ENOMEM; + goto error_before_flow; + } + flow->drv_type = flow_get_drv_type(dev, attr); + if (hairpin_id != 0) + flow->hairpin_flow_id = hairpin_id; + MLX5_ASSERT(flow->drv_type > MLX5_FLOW_TYPE_MIN && + flow->drv_type < MLX5_FLOW_TYPE_MAX); + memset(rss_desc, 0, sizeof(*rss_desc)); + rss = flow_get_rss_action(p_actions_rx); + if (rss) { + /* + * The following information is required by + * mlx5_flow_hashfields_adjust() in advance. + */ + rss_desc->level = rss->level; + /* RSS type 0 indicates default RSS type (ETH_RSS_IP). */ + rss_desc->types = !rss->types ? ETH_RSS_IP : rss->types; + } + flow->dev_handles = 0; + if (rss && rss->types) { + unsigned int graph_root; + + graph_root = find_graph_root(items, rss->level); + ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer), + items, rss->types, + mlx5_support_expansion, + graph_root); + MLX5_ASSERT(ret > 0 && + (unsigned int)ret < sizeof(expand_buffer.buffer)); + } else { + buf->entries = 1; + buf->entry[0].pattern = (void *)(uintptr_t)items; + } + /* + * Record the start index when there is a nested call. All sub-flows + * need to be translated before another calling. + * No need to use ping-pong buffer to save memory here. + */ + if (priv->flow_idx) { + MLX5_ASSERT(!priv->flow_nested_idx); + priv->flow_nested_idx = priv->flow_idx; + } + for (i = 0; i < buf->entries; ++i) { + /* + * The splitter may create multiple dev_flows, + * depending on configuration. In the simplest + * case it just creates unmodified original flow. 
+ */ + ret = flow_create_split_outer(dev, flow, attr, + buf->entry[i].pattern, + p_actions_rx, external, idx, + error); + if (ret < 0) + goto error; + } + /* Create the tx flow. */ + if (hairpin_flow) { + attr_tx.group = MLX5_HAIRPIN_TX_TABLE; + attr_tx.ingress = 0; + attr_tx.egress = 1; + dev_flow = flow_drv_prepare(dev, flow, &attr_tx, items_tx.items, + actions_hairpin_tx.actions, + idx, error); + if (!dev_flow) + goto error; + dev_flow->flow = flow; + dev_flow->external = 0; + SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx, + dev_flow->handle, next); + ret = flow_drv_translate(dev, dev_flow, &attr_tx, + items_tx.items, + actions_hairpin_tx.actions, error); + if (ret < 0) + goto error; + } + /* + * Update the metadata register copy table. If extensive + * metadata feature is enabled and registers are supported + * we might create the extra rte_flow for each unique + * MARK/FLAG action ID. + * + * The table is updated for ingress Flows only, because + * the egress Flows belong to the different device and + * copy table should be updated in peer NIC Rx domain. + */ + if (attr->ingress && + (external || attr->group != MLX5_FLOW_MREG_CP_TABLE_GROUP)) { + ret = flow_mreg_update_copy_table(dev, flow, actions, error); + if (ret) + goto error; + } + /* + * If the flow is external (from application) OR device is started, then + * the flow will be applied immediately. + */ + if (external || dev->data->dev_started) { + ret = flow_drv_apply(dev, flow, error); + if (ret < 0) + goto error; + } + if (list) + ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, idx, + flow, next); + flow_rxq_flags_set(dev, flow); + /* Nested flow creation index recovery. */ + priv->flow_idx = priv->flow_nested_idx; + if (priv->flow_nested_idx) + priv->flow_nested_idx = 0; + return idx; +error: + MLX5_ASSERT(flow); + ret = rte_errno; /* Save rte_errno before cleanup. */ + flow_mreg_del_copy_action(dev, flow); + flow_drv_destroy(dev, flow); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], idx); + rte_errno = ret; /* Restore rte_errno. */ +error_before_flow: + ret = rte_errno; + if (hairpin_id) + mlx5_flow_id_release(priv->sh->flow_id_pool, + hairpin_id); + rte_errno = ret; + priv->flow_idx = priv->flow_nested_idx; + if (priv->flow_nested_idx) + priv->flow_nested_idx = 0; + return 0; +} + +/** + * Create a dedicated flow rule on e-switch table 0 (root table), to direct all + * incoming packets to table 1. + * + * Other flow rules, requested for group n, will be created in + * e-switch table n+1. + * Jump action to e-switch group n will be created to group n+1. + * + * Used when working in switchdev mode, to utilise advantages of table 1 + * and above. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Pointer to flow on success, NULL otherwise and rte_errno is set. 
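+ *
+ * For reference only: the rule built below corresponds roughly to the
+ * testpmd request
+ *   flow create <port> ingress transfer group 0 pattern end
+ *        actions jump group 1 / end
+ * except that it is kept on the internal control flow list instead of the
+ * application flow list.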
+ */
+struct rte_flow *
+mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
+{
+	const struct rte_flow_attr attr = {
+		.group = 0,
+		.priority = 0,
+		.ingress = 1,
+		.egress = 0,
+		.transfer = 1,
+	};
+	const struct rte_flow_item pattern = {
+		.type = RTE_FLOW_ITEM_TYPE_END,
+	};
+	struct rte_flow_action_jump jump = {
+		.group = 1,
+	};
+	const struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &jump,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+
+	return (void *)(uintptr_t)flow_list_create(dev, &priv->ctrl_flows,
+						   &attr, &pattern,
+						   actions, false, &error);
+}
+
+/**
+ * Validate a flow supported by the NIC.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_validate(struct rte_eth_dev *dev,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item items[],
+		   const struct rte_flow_action actions[],
+		   struct rte_flow_error *error)
+{
+	int hairpin_flow;
+
+	hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
+	return flow_drv_validate(dev, attr, items, actions,
+				 true, hairpin_flow, error);
+}
+
+/**
+ * Create a flow.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ */
+struct rte_flow *
+mlx5_flow_create(struct rte_eth_dev *dev,
+		 const struct rte_flow_attr *attr,
+		 const struct rte_flow_item items[],
+		 const struct rte_flow_action actions[],
+		 struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	/*
+	 * If the device is not started yet, it is not allowed to create a
+	 * flow from the application. PMD default flows and traffic control
+	 * flows are not affected.
+	 */
+	if (unlikely(!dev->data->dev_started)) {
+		DRV_LOG(DEBUG, "port %u is not started when "
+			"inserting a flow", dev->data->port_id);
+		rte_flow_error_set(error, ENODEV,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "port not started");
+		return NULL;
+	}
+	return (void *)(uintptr_t)flow_list_create(dev, &priv->flows,
+				  attr, items, actions, true, error);
+}
+
+/**
+ * Destroy a flow in a list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param list
+ *   Pointer to the indexed flow list. If this parameter is NULL,
+ *   the flow is not removed from any list. Note that, since flows
+ *   are kept in an indexed list, the memory a list entry points to
+ *   may change as flows are destroyed.
+ * @param[in] flow_idx
+ *   Index of flow to destroy.
+ */
+static void
+flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+		  uint32_t flow_idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_fdir_flow *priv_fdir_flow = NULL;
+	struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
+					       [MLX5_IPOOL_RTE_FLOW], flow_idx);
+
+	if (!flow)
+		return;
+	/*
+	 * Update RX queue flags only if port is started, otherwise it is
+	 * already clean.
+ */ + if (dev->data->dev_started) + flow_rxq_flags_trim(dev, flow); + if (flow->hairpin_flow_id) + mlx5_flow_id_release(priv->sh->flow_id_pool, + flow->hairpin_flow_id); + flow_drv_destroy(dev, flow); + if (list) + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, + flow_idx, flow, next); + flow_mreg_del_copy_action(dev, flow); + if (flow->fdir) { + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + if (priv_fdir_flow->rix_flow == flow_idx) + break; + } + if (priv_fdir_flow) { + LIST_REMOVE(priv_fdir_flow, next); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + } + } + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx); +} + +/** + * Destroy all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + * @param active + * If flushing is called avtively. + */ +void +mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active) +{ + uint32_t num_flushed = 0; + + while (*list) { + flow_list_destroy(dev, list, *list); + num_flushed++; + } + if (active) { + DRV_LOG(INFO, "port %u: %u flows flushed before stopping", + dev->data->port_id, num_flushed); + } +} + +/** + * Remove all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + */ +void +mlx5_flow_stop(struct rte_eth_dev *dev, uint32_t *list) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + uint32_t idx; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx, + flow, next) { + flow_drv_remove(dev, flow); + flow_mreg_stop_copy_action(dev, flow); + } + flow_mreg_del_default_copy_action(dev); + flow_rxq_flags_clear(dev); +} + +/** + * Add all flows. + * + * @param dev + * Pointer to Ethernet device. + * @param list + * Pointer to the Indexed flow list. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_start(struct rte_eth_dev *dev, uint32_t *list) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + struct rte_flow_error error; + uint32_t idx; + int ret = 0; + + /* Make sure default copy action (reg_c[0] -> reg_b) is created. */ + ret = flow_mreg_add_default_copy_action(dev, &error); + if (ret < 0) + return -rte_errno; + /* Apply Flows created by application. */ + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], *list, idx, + flow, next) { + ret = flow_mreg_start_copy_action(dev, flow); + if (ret < 0) + goto error; + ret = flow_drv_apply(dev, flow, &error); + if (ret < 0) + goto error; + flow_rxq_flags_set(dev, flow); + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_stop(dev, list); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Stop all default actions for flows. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_stop_default(struct rte_eth_dev *dev) +{ + flow_mreg_del_default_copy_action(dev); + flow_rxq_flags_clear(dev); +} + +/** + * Start all default actions for flows. + * + * @param dev + * Pointer to Ethernet device. + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_start_default(struct rte_eth_dev *dev) +{ + struct rte_flow_error error; + + /* Make sure default copy action (reg_c[0] -> reg_b) is created. */ + return flow_mreg_add_default_copy_action(dev, &error); +} + +/** + * Allocate intermediate resources for flow creation. 
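+ *
+ * A note on the layout, derived from the allocation below: inter_flows
+ * holds MLX5_NUM_MAX_DEV_FLOWS struct mlx5_flow entries followed by two
+ * ping-pong struct mlx5_flow_rss_desc buffers, each sized for a
+ * UINT16_MAX-entry queue list.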
+ * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_alloc_intermediate(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!priv->inter_flows) { + priv->inter_flows = rte_calloc(__func__, 1, + MLX5_NUM_MAX_DEV_FLOWS * + sizeof(struct mlx5_flow) + + (sizeof(struct mlx5_flow_rss_desc) + + sizeof(uint16_t) * UINT16_MAX) * 2, 0); + if (!priv->inter_flows) { + DRV_LOG(ERR, "can't allocate intermediate memory."); + return; + } + } + priv->rss_desc = &((struct mlx5_flow *)priv->inter_flows) + [MLX5_NUM_MAX_DEV_FLOWS]; + /* Reset the index. */ + priv->flow_idx = 0; + priv->flow_nested_idx = 0; +} + +/** + * Free intermediate resources for flows. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_flow_free_intermediate(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + rte_free(priv->inter_flows); + priv->inter_flows = NULL; +} + +/** + * Verify the flow list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return the number of flows not released. + */ +int +mlx5_flow_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow *flow; + uint32_t idx; + int ret = 0; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], priv->flows, idx, + flow, next) { + DRV_LOG(DEBUG, "port %u flow %p still referenced", + dev->data->port_id, (void *)flow); + ++ret; + } + return ret; +} + +/** + * Enable default hairpin egress flow. + * + * @param dev + * Pointer to Ethernet device. + * @param queue + * The queue index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, + uint32_t queue) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_attr attr = { + .egress = 1, + .priority = 0, + }; + struct mlx5_rte_flow_item_tx_queue queue_spec = { + .queue = queue, + }; + struct mlx5_rte_flow_item_tx_queue queue_mask = { + .queue = UINT32_MAX, + }; + struct rte_flow_item items[] = { + { + .type = (enum rte_flow_item_type) + MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, + .spec = &queue_spec, + .last = NULL, + .mask = &queue_mask, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action_jump jump = { + .group = MLX5_HAIRPIN_TX_TABLE, + }; + struct rte_flow_action actions[2]; + uint32_t flow_idx; + struct rte_flow_error error; + + actions[0].type = RTE_FLOW_ACTION_TYPE_JUMP; + actions[0].conf = &jump; + actions[1].type = RTE_FLOW_ACTION_TYPE_END; + flow_idx = flow_list_create(dev, &priv->ctrl_flows, + &attr, items, actions, false, &error); + if (!flow_idx) { + DRV_LOG(DEBUG, + "Failed to create ctrl flow: rte_errno(%d)," + " type(%d), message(%s)", + rte_errno, error.type, + error.message ? error.message : " (no stated reason)"); + return -rte_errno; + } + return 0; +} + +/** + * Enable a control flow configured from the control plane. + * + * @param dev + * Pointer to Ethernet device. + * @param eth_spec + * An Ethernet flow spec to apply. + * @param eth_mask + * An Ethernet flow mask to apply. + * @param vlan_spec + * A VLAN flow spec to apply. + * @param vlan_mask + * A VLAN flow mask to apply. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
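+ *
+ * Minimal usage sketch (the spec/mask values are an example only):
+ *   struct rte_flow_item_eth bcast = {
+ *           .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ *   };
+ *   mlx5_ctrl_flow(dev, &bcast, &bcast);
+ * installs an RSS rule steering broadcast frames to all configured Rx
+ * queues; mlx5_ctrl_flow() below is this function without the VLAN item.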
+ */ +int +mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_attr attr = { + .ingress = 1, + .priority = MLX5_FLOW_PRIO_RSVD, + }; + struct rte_flow_item items[] = { + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = eth_spec, + .last = NULL, + .mask = eth_mask, + }, + { + .type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN : + RTE_FLOW_ITEM_TYPE_END, + .spec = vlan_spec, + .last = NULL, + .mask = vlan_mask, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + uint16_t queue[priv->reta_idx_n]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = priv->rss_conf.rss_hf, + .key_len = priv->rss_conf.rss_key_len, + .queue_num = priv->reta_idx_n, + .key = priv->rss_conf.rss_key, + .queue = queue, + }; + struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = &action_rss, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint32_t flow_idx; + struct rte_flow_error error; + unsigned int i; + + if (!priv->reta_idx_n || !priv->rxqs_n) { + return 0; + } + if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)) + action_rss.types = 0; + for (i = 0; i != priv->reta_idx_n; ++i) + queue[i] = (*priv->reta_idx)[i]; + flow_idx = flow_list_create(dev, &priv->ctrl_flows, + &attr, items, actions, false, &error); + if (!flow_idx) + return -rte_errno; + return 0; +} + +/** + * Enable a flow control configured from the control plane. + * + * @param dev + * Pointer to Ethernet device. + * @param eth_spec + * An Ethernet flow spec to apply. + * @param eth_mask + * An Ethernet flow mask to apply. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask) +{ + return mlx5_ctrl_flow_vlan(dev, eth_spec, eth_mask, NULL, NULL); +} + +/** + * Destroy a flow. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +int +mlx5_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + flow_list_destroy(dev, &priv->flows, (uintptr_t)(void *)flow); + return 0; +} + +/** + * Destroy all flows. + * + * @see rte_flow_flush() + * @see rte_flow_ops + */ +int +mlx5_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->flows, false); + return 0; +} + +/** + * Isolated mode. + * + * @see rte_flow_isolate() + * @see rte_flow_ops + */ +int +mlx5_flow_isolate(struct rte_eth_dev *dev, + int enable, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (dev->data->dev_started) { + rte_flow_error_set(error, EBUSY, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "port must be stopped first"); + return -rte_errno; + } + priv->isolated = !!enable; + if (enable) + dev->dev_ops = &mlx5_dev_ops_isolate; + else + dev->dev_ops = &mlx5_dev_ops; + return 0; +} + +/** + * Query a flow. 
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_drv_query(struct rte_eth_dev *dev, + uint32_t flow_idx, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct mlx5_flow_driver_ops *fops; + struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_RTE_FLOW], + flow_idx); + enum mlx5_flow_drv_type ftype; + + if (!flow) { + return rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "invalid flow handle"); + } + ftype = flow->drv_type; + MLX5_ASSERT(ftype > MLX5_FLOW_TYPE_MIN && ftype < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(ftype); + + return fops->query(dev, flow, actions, data, error); +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +int +mlx5_flow_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret; + + ret = flow_drv_query(dev, (uintptr_t)(void *)flow, actions, data, + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Convert a flow director filter to a generic flow. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Flow director filter to add. + * @param attributes + * Generic flow parameters structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_convert(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter, + struct mlx5_fdir *attributes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_eth_fdir_input *input = &fdir_filter->input; + const struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; + + /* Validate queue number. */ + if (fdir_filter->action.rx_queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u invalid queue number %d", + dev->data->port_id, fdir_filter->action.rx_queue); + rte_errno = EINVAL; + return -rte_errno; + } + attributes->attr.ingress = 1; + attributes->items[0] = (struct rte_flow_item) { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = &attributes->l2, + .mask = &attributes->l2_mask, + }; + switch (fdir_filter->action.behavior) { + case RTE_ETH_FDIR_ACCEPT: + attributes->actions[0] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_QUEUE, + .conf = &attributes->queue, + }; + break; + case RTE_ETH_FDIR_REJECT: + attributes->actions[0] = (struct rte_flow_action){ + .type = RTE_FLOW_ACTION_TYPE_DROP, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid behavior %d", + dev->data->port_id, + fdir_filter->action.behavior); + rte_errno = ENOTSUP; + return -rte_errno; + } + attributes->queue.index = fdir_filter->action.rx_queue; + /* Handle L3. 
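+	 * Only non-fragmented IPv4/IPv6 flow types are converted; any other flow type is rejected with ENOTSUP.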
*/ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: + attributes->l3.ipv4.hdr = (struct rte_ipv4_hdr){ + .src_addr = input->flow.ip4_flow.src_ip, + .dst_addr = input->flow.ip4_flow.dst_ip, + .time_to_live = input->flow.ip4_flow.ttl, + .type_of_service = input->flow.ip4_flow.tos, + }; + attributes->l3_mask.ipv4.hdr = (struct rte_ipv4_hdr){ + .src_addr = mask->ipv4_mask.src_ip, + .dst_addr = mask->ipv4_mask.dst_ip, + .time_to_live = mask->ipv4_mask.ttl, + .type_of_service = mask->ipv4_mask.tos, + .next_proto_id = mask->ipv4_mask.proto, + }; + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + attributes->l3.ipv6.hdr = (struct rte_ipv6_hdr){ + .hop_limits = input->flow.ipv6_flow.hop_limits, + .proto = input->flow.ipv6_flow.proto, + }; + + memcpy(attributes->l3.ipv6.hdr.src_addr, + input->flow.ipv6_flow.src_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3.ipv6.hdr.dst_addr, + input->flow.ipv6_flow.dst_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.src_addr, + mask->ipv6_mask.src_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.dst_addr, + mask->ipv6_mask.dst_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + /* Handle L4. 
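+	 * Copy the UDP/TCP port spec and mask; the OTHER flow types carry no L4 item.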
*/ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + attributes->l4.udp.hdr = (struct rte_udp_hdr){ + .src_port = input->flow.udp4_flow.src_port, + .dst_port = input->flow.udp4_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_UDP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + attributes->l4.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = input->flow.tcp4_flow.src_port, + .dst_port = input->flow.tcp4_flow.dst_port, + }; + attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_TCP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + attributes->l4.udp.hdr = (struct rte_udp_hdr){ + .src_port = input->flow.udp6_flow.src_port, + .dst_port = input->flow.udp6_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct rte_udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_UDP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + attributes->l4.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = input->flow.tcp6_flow.src_port, + .dst_port = input->flow.tcp6_flow.dst_port, + }; + attributes->l4_mask.tcp.hdr = (struct rte_tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, + }; + attributes->items[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_TCP, + .spec = &attributes->l4, + .mask = &attributes->l4_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + return 0; +} + +#define FLOW_FDIR_CMP(f1, f2, fld) \ + memcmp(&(f1)->fld, &(f2)->fld, sizeof(f1->fld)) + +/** + * Compare two FDIR flows. If items and actions are identical, the two flows are + * regarded as same. + * + * @param dev + * Pointer to Ethernet device. + * @param f1 + * FDIR flow to compare. + * @param f2 + * FDIR flow to compare. + * + * @return + * Zero on match, 1 otherwise. + */ +static int +flow_fdir_cmp(const struct mlx5_fdir *f1, const struct mlx5_fdir *f2) +{ + if (FLOW_FDIR_CMP(f1, f2, attr) || + FLOW_FDIR_CMP(f1, f2, l2) || + FLOW_FDIR_CMP(f1, f2, l2_mask) || + FLOW_FDIR_CMP(f1, f2, l3) || + FLOW_FDIR_CMP(f1, f2, l3_mask) || + FLOW_FDIR_CMP(f1, f2, l4) || + FLOW_FDIR_CMP(f1, f2, l4_mask) || + FLOW_FDIR_CMP(f1, f2, actions[0].type)) + return 1; + if (f1->actions[0].type == RTE_FLOW_ACTION_TYPE_QUEUE && + FLOW_FDIR_CMP(f1, f2, queue)) + return 1; + return 0; +} + +/** + * Search device flow list to find out a matched FDIR flow. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_flow + * FDIR flow to lookup. + * + * @return + * Index of flow if found, 0 otherwise. 
+ */ +static uint32_t +flow_fdir_filter_lookup(struct rte_eth_dev *dev, struct mlx5_fdir *fdir_flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t flow_idx = 0; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + + MLX5_ASSERT(fdir_flow); + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + if (!flow_fdir_cmp(priv_fdir_flow->fdir, fdir_flow)) { + DRV_LOG(DEBUG, "port %u found FDIR flow %u", + dev->data->port_id, flow_idx); + flow_idx = priv_fdir_flow->rix_flow; + break; + } + } + return flow_idx; +} + +/** + * Add new flow director filter and store it in list. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Flow director filter to add. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_add(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_fdir *fdir_flow; + struct rte_flow *flow; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + uint32_t flow_idx; + int ret; + + fdir_flow = rte_zmalloc(__func__, sizeof(*fdir_flow), 0); + if (!fdir_flow) { + rte_errno = ENOMEM; + return -rte_errno; + } + ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow); + if (ret) + goto error; + flow_idx = flow_fdir_filter_lookup(dev, fdir_flow); + if (flow_idx) { + rte_errno = EEXIST; + goto error; + } + priv_fdir_flow = rte_zmalloc(__func__, sizeof(struct mlx5_fdir_flow), + 0); + if (!priv_fdir_flow) { + rte_errno = ENOMEM; + goto error; + } + flow_idx = flow_list_create(dev, &priv->flows, &fdir_flow->attr, + fdir_flow->items, fdir_flow->actions, true, + NULL); + flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx); + if (!flow) + goto error; + flow->fdir = 1; + priv_fdir_flow->fdir = fdir_flow; + priv_fdir_flow->rix_flow = flow_idx; + LIST_INSERT_HEAD(&priv->fdir_flows, priv_fdir_flow, next); + DRV_LOG(DEBUG, "port %u created FDIR flow %p", + dev->data->port_id, (void *)flow); + return 0; +error: + rte_free(priv_fdir_flow); + rte_free(fdir_flow); + return -rte_errno; +} + +/** + * Delete specific filter. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Filter to be deleted. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_filter_delete(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t flow_idx; + struct mlx5_fdir fdir_flow = { + .attr.group = 0, + }; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + int ret; + + ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow); + if (ret) + return -rte_errno; + LIST_FOREACH(priv_fdir_flow, &priv->fdir_flows, next) { + /* Find the fdir in priv list */ + if (!flow_fdir_cmp(priv_fdir_flow->fdir, &fdir_flow)) + break; + } + if (!priv_fdir_flow) + return 0; + LIST_REMOVE(priv_fdir_flow, next); + flow_idx = priv_fdir_flow->rix_flow; + flow_list_destroy(dev, &priv->flows, flow_idx); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + DRV_LOG(DEBUG, "port %u deleted FDIR flow %u", + dev->data->port_id, flow_idx); + return 0; +} + +/** + * Update queue for specific filter. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_filter + * Filter to be updated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_fdir_filter_update(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) +{ + int ret; + + ret = flow_fdir_filter_delete(dev, fdir_filter); + if (ret) + return ret; + return flow_fdir_filter_add(dev, fdir_filter); +} + +/** + * Flush all filters. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_fdir_filter_flush(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_fdir_flow *priv_fdir_flow = NULL; + + while (!LIST_EMPTY(&priv->fdir_flows)) { + priv_fdir_flow = LIST_FIRST(&priv->fdir_flows); + LIST_REMOVE(priv_fdir_flow, next); + flow_list_destroy(dev, &priv->flows, priv_fdir_flow->rix_flow); + rte_free(priv_fdir_flow->fdir); + rte_free(priv_fdir_flow); + } +} + +/** + * Get flow director information. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] fdir_info + * Resulting flow director information. + */ +static void +flow_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) +{ + struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; + + fdir_info->mode = dev->data->dev_conf.fdir_conf.mode; + fdir_info->guarant_spc = 0; + rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask)); + fdir_info->max_flexpayload = 0; + fdir_info->flow_types_mask[0] = 0; + fdir_info->flex_payload_unit = 0; + fdir_info->max_flex_payload_segment_num = 0; + fdir_info->flex_payload_limit = 0; + memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf)); +} + +/** + * Deal with flow director operations. + * + * @param dev + * Pointer to Ethernet device. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, + void *arg) +{ + enum rte_fdir_mode fdir_mode = + dev->data->dev_conf.fdir_conf.mode; + + if (filter_op == RTE_ETH_FILTER_NOP) + return 0; + if (fdir_mode != RTE_FDIR_MODE_PERFECT && + fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) { + DRV_LOG(ERR, "port %u flow director mode %d not supported", + dev->data->port_id, fdir_mode); + rte_errno = EINVAL; + return -rte_errno; + } + switch (filter_op) { + case RTE_ETH_FILTER_ADD: + return flow_fdir_filter_add(dev, arg); + case RTE_ETH_FILTER_UPDATE: + return flow_fdir_filter_update(dev, arg); + case RTE_ETH_FILTER_DELETE: + return flow_fdir_filter_delete(dev, arg); + case RTE_ETH_FILTER_FLUSH: + flow_fdir_filter_flush(dev); + break; + case RTE_ETH_FILTER_INFO: + flow_fdir_info_get(dev, arg); + break; + default: + DRV_LOG(DEBUG, "port %u unknown operation %u", + dev->data->port_id, filter_op); + rte_errno = EINVAL; + return -rte_errno; + } + return 0; +} + +/** + * Manage filter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) +{ + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) { + rte_errno = EINVAL; + return -rte_errno; + } + *(const void **)arg = &mlx5_flow_ops; + return 0; + case RTE_ETH_FILTER_FDIR: + return flow_fdir_ctrl_func(dev, filter_op, arg); + default: + DRV_LOG(ERR, "port %u filter type (%d) not supported", + dev->data->port_id, filter_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + return 0; +} + +/** + * Create the needed meter and suffix tables. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to the flow meter. + * + * @return + * Pointer to table set on success, NULL otherwise. + */ +struct mlx5_meter_domains_infos * +mlx5_flow_create_mtr_tbls(struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->create_mtr_tbls(dev, fm); +} + +/** + * Destroy the meter table set. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] tbl + * Pointer to the meter table set. + * + * @return + * 0 on success. + */ +int +mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbls) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->destroy_mtr_tbls(dev, tbls); +} + +/** + * Create policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. + */ +int +mlx5_flow_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->create_policer_rules(dev, fm, attr); +} + +/** + * Destroy policer rules. + * + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. + */ +int +mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + const struct mlx5_flow_driver_ops *fops; + + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->destroy_policer_rules(dev, fm, attr); +} + +/** + * Allocate a counter. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * + * @return + * Index to allocated counter on success, 0 otherwise. + */ +uint32_t +mlx5_counter_alloc(struct rte_eth_dev *dev) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->counter_alloc(dev); + } + DRV_LOG(ERR, + "port %u counter allocate is not supported.", + dev->data->port_id); + return 0; +} + +/** + * Free a counter. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] cnt + * Index to counter to be free. 
+ */ +void +mlx5_counter_free(struct rte_eth_dev *dev, uint32_t cnt) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + fops->counter_free(dev, cnt); + return; + } + DRV_LOG(ERR, + "port %u counter free is not supported.", + dev->data->port_id); +} + +/** + * Query counter statistics. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] cnt + * Index to counter to query. + * @param[in] clear + * Set to clear counter statistics. + * @param[out] pkts + * The counter hits packets number to save. + * @param[out] bytes + * The counter hits bytes number to save. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +int +mlx5_counter_query(struct rte_eth_dev *dev, uint32_t cnt, + bool clear, uint64_t *pkts, uint64_t *bytes) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->counter_query(dev, cnt, clear, pkts, bytes); + } + DRV_LOG(ERR, + "port %u counter query is not supported.", + dev->data->port_id); + return -ENOTSUP; +} + +#define MLX5_POOL_QUERY_FREQ_US 1000000 + +/** + * Get number of all validate pools. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * + * @return + * The number of all validate pools. + */ +static uint32_t +mlx5_get_all_valid_pool_count(struct mlx5_ibv_shared *sh) +{ + int i; + uint32_t pools_n = 0; + + for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) + pools_n += rte_atomic16_read(&sh->cmng.ccont[i].n_valid); + return pools_n; +} + +/** + * Set the periodic procedure for triggering asynchronous batch queries for all + * the counter pools. + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + */ +void +mlx5_set_query_alarm(struct mlx5_ibv_shared *sh) +{ + uint32_t pools_n, us; + + pools_n = mlx5_get_all_valid_pool_count(sh); + us = MLX5_POOL_QUERY_FREQ_US / pools_n; + DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us); + if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) { + sh->cmng.query_thread_on = 0; + DRV_LOG(ERR, "Cannot reinitialize query alarm"); + } else { + sh->cmng.query_thread_on = 1; + } +} + +/** + * The periodic procedure for triggering asynchronous batch queries for all the + * counter pools. This function is probably called by the host thread. + * + * @param[in] arg + * The parameter for the alarm process. + */ +void +mlx5_flow_query_alarm(void *arg) +{ + struct mlx5_ibv_shared *sh = arg; + struct mlx5_devx_obj *dcs; + uint16_t offset; + int ret; + uint8_t batch = sh->cmng.batch; + uint8_t age = sh->cmng.age; + uint16_t pool_index = sh->cmng.pool_index; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + int cont_loop = MLX5_CCONT_TYPE_MAX; + + if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES) + goto set_alarm; +next_container: + cont = MLX5_CNT_CONTAINER(sh, batch, age); + rte_spinlock_lock(&cont->resize_sl); + if (!cont->pools) { + rte_spinlock_unlock(&cont->resize_sl); + /* Check if all the containers are empty. 
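+		 * Once every container has been visited without finding pools, give up and re-arm the query alarm.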
*/ + if (unlikely(--cont_loop == 0)) + goto set_alarm; + batch ^= 0x1; + pool_index = 0; + if (batch == 0 && pool_index == 0) { + age ^= 0x1; + sh->cmng.batch = batch; + sh->cmng.age = age; + } + goto next_container; + } + pool = cont->pools[pool_index]; + rte_spinlock_unlock(&cont->resize_sl); + if (pool->raw_hw) + /* There is a pool query in progress. */ + goto set_alarm; + pool->raw_hw = + LIST_FIRST(&sh->cmng.free_stat_raws); + if (!pool->raw_hw) + /* No free counter statistics raw memory. */ + goto set_alarm; + dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read + (&pool->a64_dcs); + offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL; + /* + * Identify the counters released between query trigger and query + * handle more effiecntly. The counter released in this gap period + * should wait for a new round of query as the new arrived packets + * will not be taken into account. + */ + rte_atomic64_add(&pool->start_query_gen, 1); + ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL - + offset, NULL, NULL, + pool->raw_hw->mem_mng->dm->id, + (void *)(uintptr_t) + (pool->raw_hw->data + offset), + sh->devx_comp, + (uint64_t)(uintptr_t)pool); + if (ret) { + rte_atomic64_sub(&pool->start_query_gen, 1); + DRV_LOG(ERR, "Failed to trigger asynchronous query for dcs ID" + " %d", pool->min_dcs->id); + pool->raw_hw = NULL; + goto set_alarm; + } + pool->raw_hw->min_dcs_id = dcs->id; + LIST_REMOVE(pool->raw_hw, next); + sh->cmng.pending_queries++; + pool_index++; + if (pool_index >= rte_atomic16_read(&cont->n_valid)) { + batch ^= 0x1; + pool_index = 0; + if (batch == 0 && pool_index == 0) + age ^= 0x1; + } +set_alarm: + sh->cmng.batch = batch; + sh->cmng.pool_index = pool_index; + sh->cmng.age = age; + mlx5_set_query_alarm(sh); +} + +/** + * Check and callback event for new aged flow in the counter pool + * + * @param[in] sh + * Pointer to mlx5_ibv_shared object. + * @param[in] pool + * Pointer to Current counter pool. + */ +static void +mlx5_flow_aging_check(struct mlx5_ibv_shared *sh, + struct mlx5_flow_counter_pool *pool) +{ + struct mlx5_priv *priv; + struct mlx5_flow_counter *cnt; + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_counter_stats_raw *cur = pool->raw_hw; + struct mlx5_counter_stats_raw *prev = pool->raw; + uint16_t curr = rte_rdtsc() / (rte_get_tsc_hz() / 10); + uint32_t i; + + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + age_param = MLX5_CNT_TO_AGE(cnt); + if (rte_atomic16_read(&age_param->state) != AGE_CANDIDATE) + continue; + if (cur->data[i].hits != prev->data[i].hits) { + age_param->expire = curr + age_param->timeout; + continue; + } + if ((uint16_t)(curr - age_param->expire) >= (UINT16_MAX / 2)) + continue; + /** + * Hold the lock first, or if between the + * state AGE_TMOUT and tailq operation the + * release happened, the release procedure + * may delete a non-existent tailq node. + */ + priv = rte_eth_devices[age_param->port_id].data->dev_private; + age_info = GET_PORT_AGE_INFO(priv); + rte_spinlock_lock(&age_info->aged_sl); + /* If the cpmset fails, release happens. 
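+		 * Only a successful AGE_CANDIDATE -> AGE_TMOUT transition queues the counter on the aged list.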
*/ + if (rte_atomic16_cmpset((volatile uint16_t *) + &age_param->state, + AGE_CANDIDATE, + AGE_TMOUT) == + AGE_CANDIDATE) { + TAILQ_INSERT_TAIL(&age_info->aged_counters, cnt, next); + MLX5_AGE_SET(age_info, MLX5_AGE_EVENT_NEW); + } + rte_spinlock_unlock(&age_info->aged_sl); + } + for (i = 0; i < sh->max_port; i++) { + age_info = &sh->port[i].age_info; + if (!MLX5_AGE_GET(age_info, MLX5_AGE_EVENT_NEW)) + continue; + if (MLX5_AGE_GET(age_info, MLX5_AGE_TRIGGER)) + _rte_eth_dev_callback_process + (&rte_eth_devices[sh->port[i].devx_ih_port_id], + RTE_ETH_EVENT_FLOW_AGED, NULL); + age_info->flags = 0; + } +} + +/** + * Handler for the HW respond about ready values from an asynchronous batch + * query. This function is probably called by the host thread. + * + * @param[in] sh + * The pointer to the shared IB device context. + * @param[in] async_id + * The Devx async ID. + * @param[in] status + * The status of the completion. + */ +void +mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + uint64_t async_id, int status) +{ + struct mlx5_flow_counter_pool *pool = + (struct mlx5_flow_counter_pool *)(uintptr_t)async_id; + struct mlx5_counter_stats_raw *raw_to_free; + + if (unlikely(status)) { + rte_atomic64_sub(&pool->start_query_gen, 1); + raw_to_free = pool->raw_hw; + } else { + raw_to_free = pool->raw; + if (IS_AGE_POOL(pool)) + mlx5_flow_aging_check(sh, pool); + rte_spinlock_lock(&pool->sl); + pool->raw = pool->raw_hw; + rte_spinlock_unlock(&pool->sl); + MLX5_ASSERT(rte_atomic64_read(&pool->end_query_gen) + 1 == + rte_atomic64_read(&pool->start_query_gen)); + rte_atomic64_set(&pool->end_query_gen, + rte_atomic64_read(&pool->start_query_gen)); + /* Be sure the new raw counters data is updated in memory. */ + rte_cio_wmb(); + } + LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next); + pool->raw_hw = NULL; + sh->cmng.pending_queries--; +} + +/** + * Translate the rte_flow group index to HW table value. + * + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * Value is part of flow rule created by request external to PMD. + * @param[in] group + * rte_flow group index value. + * @param[out] fdb_def_rule + * Whether fdb jump to table 1 is configured. + * @param[out] table + * HW table value. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, bool external, + uint32_t group, bool fdb_def_rule, uint32_t *table, + struct rte_flow_error *error) +{ + if (attributes->transfer && external && fdb_def_rule) { + if (group == UINT32_MAX) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "group index not supported"); + *table = group + 1; + } else { + *table = group; + } + return 0; +} + +/** + * Discover availability of metadata reg_c's. + * + * Iteratively use test flows to check availability. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_discover_mreg_c(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + enum modify_reg idx; + int n = 0; + + /* reg_c[0] and reg_c[1] are reserved. */ + config->flow_mreg_c[n++] = REG_C_0; + config->flow_mreg_c[n++] = REG_C_1; + /* Discover availability of other reg_c's. 
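+	 * A test flow that copies REG_C_1 into the candidate register is created; the register is reported available only if that flow can be created and applied.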
*/ + for (idx = REG_C_2; idx <= REG_C_7; ++idx) { + struct rte_flow_attr attr = { + .group = MLX5_FLOW_MREG_CP_TABLE_GROUP, + .priority = MLX5_FLOW_PRIO_RSVD, + .ingress = 1, + }; + struct rte_flow_item items[] = { + [0] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + struct rte_flow_action actions[] = { + [0] = { + .type = (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &(struct mlx5_flow_action_copy_mreg){ + .src = REG_C_1, + .dst = idx, + }, + }, + [1] = { + .type = RTE_FLOW_ACTION_TYPE_JUMP, + .conf = &(struct rte_flow_action_jump){ + .group = MLX5_FLOW_MREG_ACT_TABLE_GROUP, + }, + }, + [2] = { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + uint32_t flow_idx; + struct rte_flow *flow; + struct rte_flow_error error; + + if (!config->dv_flow_en) + break; + /* Create internal flow, validation skips copy action. */ + flow_idx = flow_list_create(dev, NULL, &attr, items, + actions, false, &error); + flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], + flow_idx); + if (!flow) + continue; + if (dev->data->dev_started || !flow_drv_apply(dev, flow, NULL)) + config->flow_mreg_c[n++] = idx; + flow_list_destroy(dev, NULL, flow_idx); + } + for (; n < MLX5_MREG_C_NUM; ++n) + config->flow_mreg_c[n] = REG_NONE; + return 0; +} + +/** + * Dump flow raw hw data to file + * + * @param[in] dev + * The pointer to Ethernet device. + * @param[in] file + * A pointer to a file for output. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * @return + * 0 on success, a nagative value otherwise. + */ +int +mlx5_flow_dev_dump(struct rte_eth_dev *dev, + FILE *file, + struct rte_flow_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + return mlx5_devx_cmd_flow_dump(sh->fdb_domain, sh->rx_domain, + sh->tx_domain, file); +} + +/** + * Get aged-out flows. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] context + * The address of an array of pointers to the aged-out flows contexts. + * @param[in] nb_countexts + * The length of context array pointers. + * @param[out] error + * Perform verbose error reporting if not NULL. Initialized in case of + * error only. + * + * @return + * how many contexts get in success, otherwise negative errno value. + * if nb_contexts is 0, return the amount of all aged contexts. + * if nb_contexts is not 0 , return the amount of aged flows reported + * in the context array. 
+ */ +int +mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts, + uint32_t nb_contexts, struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + struct rte_flow_attr attr = { .transfer = 0 }; + + if (flow_get_drv_type(dev, &attr) == MLX5_FLOW_TYPE_DV) { + fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); + return fops->get_aged_flows(dev, contexts, nb_contexts, + error); + } + DRV_LOG(ERR, + "port %u get aged flows is not supported.", + dev->data->port_id); + return -ENOTSUP; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h new file mode 100644 index 000000000..2c9667756 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow.h @@ -0,0 +1,1034 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_FLOW_H_ +#define RTE_PMD_MLX5_FLOW_H_ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_atomic.h> +#include <rte_alarm.h> +#include <rte_mtr.h> + +#include <mlx5_prm.h> + +#include "mlx5.h" + +/* Private rte flow items. */ +enum mlx5_rte_flow_item_type { + MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN, + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, + MLX5_RTE_FLOW_ITEM_TYPE_VLAN, +}; + +/* Private (internal) rte flow actions. */ +enum mlx5_rte_flow_action_type { + MLX5_RTE_FLOW_ACTION_TYPE_END = INT_MIN, + MLX5_RTE_FLOW_ACTION_TYPE_TAG, + MLX5_RTE_FLOW_ACTION_TYPE_MARK, + MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, +}; + +/* Matches on selected register. */ +struct mlx5_rte_flow_item_tag { + enum modify_reg id; + uint32_t data; +}; + +/* Modify selected register. */ +struct mlx5_rte_flow_action_set_tag { + enum modify_reg id; + uint32_t data; +}; + +struct mlx5_flow_action_copy_mreg { + enum modify_reg dst; + enum modify_reg src; +}; + +/* Matches on source queue. */ +struct mlx5_rte_flow_item_tx_queue { + uint32_t queue; +}; + +/* Feature name to allocate metadata register. */ +enum mlx5_feature_name { + MLX5_HAIRPIN_RX, + MLX5_HAIRPIN_TX, + MLX5_METADATA_RX, + MLX5_METADATA_TX, + MLX5_METADATA_FDB, + MLX5_FLOW_MARK, + MLX5_APP_TAG, + MLX5_COPY_MARK, + MLX5_MTR_COLOR, + MLX5_MTR_SFX, +}; + +/* Pattern outer Layer bits. */ +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2) +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3) +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4) +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5) + +/* Pattern inner Layer bits. */ +#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6) +#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7) +#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8) +#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9) +#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10) +#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11) + +/* Pattern tunnel Layer bits. */ +#define MLX5_FLOW_LAYER_VXLAN (1u << 12) +#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13) +#define MLX5_FLOW_LAYER_GRE (1u << 14) +#define MLX5_FLOW_LAYER_MPLS (1u << 15) +/* List of tunnel Layer bits continued below. */ + +/* General pattern items bits. 
*/ +#define MLX5_FLOW_ITEM_METADATA (1u << 16) +#define MLX5_FLOW_ITEM_PORT_ID (1u << 17) +#define MLX5_FLOW_ITEM_TAG (1u << 18) +#define MLX5_FLOW_ITEM_MARK (1u << 19) + +/* Pattern MISC bits. */ +#define MLX5_FLOW_LAYER_ICMP (1u << 20) +#define MLX5_FLOW_LAYER_ICMP6 (1u << 21) +#define MLX5_FLOW_LAYER_GRE_KEY (1u << 22) + +/* Pattern tunnel Layer bits (continued). */ +#define MLX5_FLOW_LAYER_IPIP (1u << 23) +#define MLX5_FLOW_LAYER_IPV6_ENCAP (1u << 24) +#define MLX5_FLOW_LAYER_NVGRE (1u << 25) +#define MLX5_FLOW_LAYER_GENEVE (1u << 26) + +/* Queue items. */ +#define MLX5_FLOW_ITEM_TX_QUEUE (1u << 27) + +/* Pattern tunnel Layer bits (continued). */ +#define MLX5_FLOW_LAYER_GTP (1u << 28) + +/* Outer Masks. */ +#define MLX5_FLOW_LAYER_OUTER_L3 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6) +#define MLX5_FLOW_LAYER_OUTER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP) +#define MLX5_FLOW_LAYER_OUTER \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \ + MLX5_FLOW_LAYER_OUTER_L4) + +/* Tunnel Masks. */ +#define MLX5_FLOW_LAYER_TUNNEL \ + (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \ + MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_NVGRE | MLX5_FLOW_LAYER_MPLS | \ + MLX5_FLOW_LAYER_IPIP | MLX5_FLOW_LAYER_IPV6_ENCAP | \ + MLX5_FLOW_LAYER_GENEVE | MLX5_FLOW_LAYER_GTP) + +/* Inner Masks. */ +#define MLX5_FLOW_LAYER_INNER_L3 \ + (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_INNER_L4 \ + (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP) +#define MLX5_FLOW_LAYER_INNER \ + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \ + MLX5_FLOW_LAYER_INNER_L4) + +/* Layer Masks. */ +#define MLX5_FLOW_LAYER_L2 \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_INNER_L2) +#define MLX5_FLOW_LAYER_L3_IPV4 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV4) +#define MLX5_FLOW_LAYER_L3_IPV6 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV6 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_L3 \ + (MLX5_FLOW_LAYER_L3_IPV4 | MLX5_FLOW_LAYER_L3_IPV6) +#define MLX5_FLOW_LAYER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4 | MLX5_FLOW_LAYER_INNER_L4) + +/* Actions */ +#define MLX5_FLOW_ACTION_DROP (1u << 0) +#define MLX5_FLOW_ACTION_QUEUE (1u << 1) +#define MLX5_FLOW_ACTION_RSS (1u << 2) +#define MLX5_FLOW_ACTION_FLAG (1u << 3) +#define MLX5_FLOW_ACTION_MARK (1u << 4) +#define MLX5_FLOW_ACTION_COUNT (1u << 5) +#define MLX5_FLOW_ACTION_PORT_ID (1u << 6) +#define MLX5_FLOW_ACTION_OF_POP_VLAN (1u << 7) +#define MLX5_FLOW_ACTION_OF_PUSH_VLAN (1u << 8) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_VID (1u << 9) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_PCP (1u << 10) +#define MLX5_FLOW_ACTION_SET_IPV4_SRC (1u << 11) +#define MLX5_FLOW_ACTION_SET_IPV4_DST (1u << 12) +#define MLX5_FLOW_ACTION_SET_IPV6_SRC (1u << 13) +#define MLX5_FLOW_ACTION_SET_IPV6_DST (1u << 14) +#define MLX5_FLOW_ACTION_SET_TP_SRC (1u << 15) +#define MLX5_FLOW_ACTION_SET_TP_DST (1u << 16) +#define MLX5_FLOW_ACTION_JUMP (1u << 17) +#define MLX5_FLOW_ACTION_SET_TTL (1u << 18) +#define MLX5_FLOW_ACTION_DEC_TTL (1u << 19) +#define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20) +#define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21) +#define MLX5_FLOW_ACTION_ENCAP (1u << 22) +#define MLX5_FLOW_ACTION_DECAP (1u << 23) +#define MLX5_FLOW_ACTION_INC_TCP_SEQ (1u << 24) +#define MLX5_FLOW_ACTION_DEC_TCP_SEQ (1u << 25) +#define MLX5_FLOW_ACTION_INC_TCP_ACK (1u << 26) +#define MLX5_FLOW_ACTION_DEC_TCP_ACK (1u << 27) +#define MLX5_FLOW_ACTION_SET_TAG (1ull << 28) +#define 
MLX5_FLOW_ACTION_MARK_EXT (1ull << 29) +#define MLX5_FLOW_ACTION_SET_META (1ull << 30) +#define MLX5_FLOW_ACTION_METER (1ull << 31) +#define MLX5_FLOW_ACTION_SET_IPV4_DSCP (1ull << 32) +#define MLX5_FLOW_ACTION_SET_IPV6_DSCP (1ull << 33) +#define MLX5_FLOW_ACTION_AGE (1ull << 34) + +#define MLX5_FLOW_FATE_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \ + MLX5_FLOW_ACTION_RSS | MLX5_FLOW_ACTION_JUMP) + +#define MLX5_FLOW_FATE_ESWITCH_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ + MLX5_FLOW_ACTION_JUMP) + + +#define MLX5_FLOW_MODIFY_HDR_ACTIONS (MLX5_FLOW_ACTION_SET_IPV4_SRC | \ + MLX5_FLOW_ACTION_SET_IPV4_DST | \ + MLX5_FLOW_ACTION_SET_IPV6_SRC | \ + MLX5_FLOW_ACTION_SET_IPV6_DST | \ + MLX5_FLOW_ACTION_SET_TP_SRC | \ + MLX5_FLOW_ACTION_SET_TP_DST | \ + MLX5_FLOW_ACTION_SET_TTL | \ + MLX5_FLOW_ACTION_DEC_TTL | \ + MLX5_FLOW_ACTION_SET_MAC_SRC | \ + MLX5_FLOW_ACTION_SET_MAC_DST | \ + MLX5_FLOW_ACTION_INC_TCP_SEQ | \ + MLX5_FLOW_ACTION_DEC_TCP_SEQ | \ + MLX5_FLOW_ACTION_INC_TCP_ACK | \ + MLX5_FLOW_ACTION_DEC_TCP_ACK | \ + MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \ + MLX5_FLOW_ACTION_SET_TAG | \ + MLX5_FLOW_ACTION_MARK_EXT | \ + MLX5_FLOW_ACTION_SET_META | \ + MLX5_FLOW_ACTION_SET_IPV4_DSCP | \ + MLX5_FLOW_ACTION_SET_IPV6_DSCP) + +#define MLX5_FLOW_VLAN_ACTIONS (MLX5_FLOW_ACTION_OF_POP_VLAN | \ + MLX5_FLOW_ACTION_OF_PUSH_VLAN) + +#define MLX5_FLOW_XCAP_ACTIONS (MLX5_FLOW_ACTION_ENCAP | MLX5_FLOW_ACTION_DECAP) + +#ifndef IPPROTO_MPLS +#define IPPROTO_MPLS 137 +#endif + +/* UDP port number for MPLS */ +#define MLX5_UDP_PORT_MPLS 6635 + +/* UDP port numbers for VxLAN. */ +#define MLX5_UDP_PORT_VXLAN 4789 +#define MLX5_UDP_PORT_VXLAN_GPE 4790 + +/* UDP port numbers for GENEVE. */ +#define MLX5_UDP_PORT_GENEVE 6081 + +/* Priority reserved for default flows. */ +#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1) + +/* + * Number of sub priorities. + * For each kind of pattern matching i.e. L2, L3, L4 to have a correct + * matching on the NIC (firmware dependent) L4 most have the higher priority + * followed by L3 and ending with L2. + */ +#define MLX5_PRIORITY_MAP_L2 2 +#define MLX5_PRIORITY_MAP_L3 1 +#define MLX5_PRIORITY_MAP_L4 0 +#define MLX5_PRIORITY_MAP_MAX 3 + +/* Valid layer type for IPV4 RSS. */ +#define MLX5_IPV4_LAYER_TYPES \ + (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_TCP | ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_NONFRAG_IPV4_OTHER) + +/* IBV hash source bits for IPV4. */ +#define MLX5_IPV4_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4) + +/* Valid layer type for IPV6 RSS. */ +#define MLX5_IPV6_LAYER_TYPES \ + (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_NONFRAG_IPV6_UDP | ETH_RSS_IPV6_EX | ETH_RSS_IPV6_TCP_EX | \ + ETH_RSS_IPV6_UDP_EX | ETH_RSS_NONFRAG_IPV6_OTHER) + +/* IBV hash source bits for IPV6. */ +#define MLX5_IPV6_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6) + +/* IBV hash bits for L3 SRC. */ +#define MLX5_L3_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_SRC_IPV6) + +/* IBV hash bits for L3 DST. */ +#define MLX5_L3_DST_IBV_RX_HASH (IBV_RX_HASH_DST_IPV4 | IBV_RX_HASH_DST_IPV6) + +/* IBV hash bits for TCP. */ +#define MLX5_TCP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ + IBV_RX_HASH_DST_PORT_TCP) + +/* IBV hash bits for UDP. */ +#define MLX5_UDP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_UDP | \ + IBV_RX_HASH_DST_PORT_UDP) + +/* IBV hash bits for L4 SRC. */ +#define MLX5_L4_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ + IBV_RX_HASH_SRC_PORT_UDP) + +/* IBV hash bits for L4 DST. 
*/ +#define MLX5_L4_DST_IBV_RX_HASH (IBV_RX_HASH_DST_PORT_TCP | \ + IBV_RX_HASH_DST_PORT_UDP) + +/* Geneve header first 16Bit */ +#define MLX5_GENEVE_VER_MASK 0x3 +#define MLX5_GENEVE_VER_SHIFT 14 +#define MLX5_GENEVE_VER_VAL(a) \ + (((a) >> (MLX5_GENEVE_VER_SHIFT)) & (MLX5_GENEVE_VER_MASK)) +#define MLX5_GENEVE_OPTLEN_MASK 0x3F +#define MLX5_GENEVE_OPTLEN_SHIFT 7 +#define MLX5_GENEVE_OPTLEN_VAL(a) \ + (((a) >> (MLX5_GENEVE_OPTLEN_SHIFT)) & (MLX5_GENEVE_OPTLEN_MASK)) +#define MLX5_GENEVE_OAMF_MASK 0x1 +#define MLX5_GENEVE_OAMF_SHIFT 7 +#define MLX5_GENEVE_OAMF_VAL(a) \ + (((a) >> (MLX5_GENEVE_OAMF_SHIFT)) & (MLX5_GENEVE_OAMF_MASK)) +#define MLX5_GENEVE_CRITO_MASK 0x1 +#define MLX5_GENEVE_CRITO_SHIFT 6 +#define MLX5_GENEVE_CRITO_VAL(a) \ + (((a) >> (MLX5_GENEVE_CRITO_SHIFT)) & (MLX5_GENEVE_CRITO_MASK)) +#define MLX5_GENEVE_RSVD_MASK 0x3F +#define MLX5_GENEVE_RSVD_VAL(a) ((a) & (MLX5_GENEVE_RSVD_MASK)) +/* + * The length of the Geneve options fields, expressed in four byte multiples, + * not including the eight byte fixed tunnel. + */ +#define MLX5_GENEVE_OPT_LEN_0 14 +#define MLX5_GENEVE_OPT_LEN_1 63 + +#define MLX5_ENCAPSULATION_DECISION_SIZE (sizeof(struct rte_flow_item_eth) + \ + sizeof(struct rte_flow_item_ipv4)) + +/* Software header modify action numbers of a flow. */ +#define MLX5_ACT_NUM_MDF_IPV4 1 +#define MLX5_ACT_NUM_MDF_IPV6 4 +#define MLX5_ACT_NUM_MDF_MAC 2 +#define MLX5_ACT_NUM_MDF_VID 1 +#define MLX5_ACT_NUM_MDF_PORT 2 +#define MLX5_ACT_NUM_MDF_TTL 1 +#define MLX5_ACT_NUM_DEC_TTL MLX5_ACT_NUM_MDF_TTL +#define MLX5_ACT_NUM_MDF_TCPSEQ 1 +#define MLX5_ACT_NUM_MDF_TCPACK 1 +#define MLX5_ACT_NUM_SET_REG 1 +#define MLX5_ACT_NUM_SET_TAG 1 +#define MLX5_ACT_NUM_CPY_MREG MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_MARK MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_META MLX5_ACT_NUM_SET_TAG +#define MLX5_ACT_NUM_SET_DSCP 1 + +enum mlx5_flow_drv_type { + MLX5_FLOW_TYPE_MIN, + MLX5_FLOW_TYPE_DV, + MLX5_FLOW_TYPE_VERBS, + MLX5_FLOW_TYPE_MAX, +}; + +/* Fate action type. */ +enum mlx5_flow_fate_type { + MLX5_FLOW_FATE_NONE, /* Egress flow. */ + MLX5_FLOW_FATE_QUEUE, + MLX5_FLOW_FATE_JUMP, + MLX5_FLOW_FATE_PORT_ID, + MLX5_FLOW_FATE_DROP, + MLX5_FLOW_FATE_MAX, +}; + +/* Matcher PRM representation */ +struct mlx5_flow_dv_match_params { + size_t size; + /**< Size of match value. Do NOT split size and key! */ + uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)]; + /**< Matcher value. This value is used as the mask or as a key. */ +}; + +/* Matcher structure. */ +struct mlx5_flow_dv_matcher { + LIST_ENTRY(mlx5_flow_dv_matcher) next; + /**< Pointer to the next element. */ + struct mlx5_flow_tbl_resource *tbl; + /**< Pointer to the table(group) the matcher associated with. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *matcher_object; /**< Pointer to DV matcher */ + uint16_t crc; /**< CRC of key. */ + uint16_t priority; /**< Priority of matcher. */ + struct mlx5_flow_dv_match_params mask; /**< Matcher mask. */ +}; + +#define MLX5_ENCAP_MAX_LEN 132 + +/* Encap/decap resource structure. */ +struct mlx5_flow_dv_encap_decap_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *verbs_action; + /**< Verbs encap/decap action object. */ + uint8_t buf[MLX5_ENCAP_MAX_LEN]; + size_t size; + uint8_t reformat_type; + uint8_t ft_type; + uint64_t flags; /**< Flags for RDMA API. */ +}; + +/* Tag resource structure. 
*/ +struct mlx5_flow_dv_tag_resource { + struct mlx5_hlist_entry entry; + /**< hash list entry for tag resource, tag value as the key. */ + void *action; + /**< Verbs tag action object. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + uint32_t idx; /**< Index for the index memory pool. */ +}; + +/* + * Number of modification commands. + * The maximal actions amount in FW is some constant, and it is 16 in the + * latest releases. In some old releases, it will be limited to 8. + * Since there is no interface to query the capacity, the maximal value should + * be used to allow PMD to create the flow. The validation will be done in the + * lower driver layer or FW. A failure will be returned if exceeds the maximal + * supported actions number on the root table. + * On non-root tables, there is no limitation, but 32 is enough right now. + */ +#define MLX5_MAX_MODIFY_NUM 32 +#define MLX5_ROOT_TBL_MODIFY_NUM 16 + +/* Modify resource structure */ +struct mlx5_flow_dv_modify_hdr_resource { + LIST_ENTRY(mlx5_flow_dv_modify_hdr_resource) next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + struct ibv_flow_action *verbs_action; + /**< Verbs modify header action object. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + uint32_t actions_num; /**< Number of modification actions. */ + uint64_t flags; /**< Flags for RDMA API. */ + struct mlx5_modification_cmd actions[]; + /**< Modification actions. */ +}; + +/* Jump action resource structure. */ +struct mlx5_flow_dv_jump_tbl_resource { + rte_atomic32_t refcnt; /**< Reference counter. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + void *action; /**< Pointer to the rdma core action. */ +}; + +/* Port ID resource structure. */ +struct mlx5_flow_dv_port_id_action_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *action; + /**< Verbs tag action object. */ + uint32_t port_id; /**< Port ID value. */ +}; + +/* Push VLAN action resource structure */ +struct mlx5_flow_dv_push_vlan_action_resource { + ILIST_ENTRY(uint32_t)next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *action; /**< Direct verbs action object. */ + uint8_t ft_type; /**< Flow table type, Rx, Tx or FDB. */ + rte_be32_t vlan_tag; /**< VLAN tag value. */ +}; + +/* Metadata register copy table entry. */ +struct mlx5_flow_mreg_copy_resource { + /* + * Hash list entry for copy table. + * - Key is 32/64-bit MARK action ID. + * - MUST be the first entry. + */ + struct mlx5_hlist_entry hlist_ent; + LIST_ENTRY(mlx5_flow_mreg_copy_resource) next; + /* List entry for device flows. */ + uint32_t refcnt; /* Reference counter. */ + uint32_t appcnt; /* Apply/Remove counter. */ + uint32_t idx; + uint32_t rix_flow; /* Built flow for copy. */ +}; + +/* Table data structure of the hash organization. */ +struct mlx5_flow_tbl_data_entry { + struct mlx5_hlist_entry entry; + /**< hash list entry, 64-bits key inside. */ + struct mlx5_flow_tbl_resource tbl; + /**< flow table resource. */ + LIST_HEAD(matchers, mlx5_flow_dv_matcher) matchers; + /**< matchers' header associated with the flow table. */ + struct mlx5_flow_dv_jump_tbl_resource jump; + /**< jump resource, at most one for each table created. */ + uint32_t idx; /**< index for the indexed mempool. */ +}; + +/* Verbs specification header. */ +struct ibv_spec_header { + enum ibv_flow_spec_type type; + uint16_t size; +}; + +/* RSS description. 
*/ +struct mlx5_flow_rss_desc { + uint32_t level; + uint32_t queue_num; /**< Number of entries in @p queue. */ + uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */ + uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ + uint16_t queue[]; /**< Destination queues to redirect traffic to. */ +}; + + +/** Device flow handle structure for DV mode only. */ +struct mlx5_flow_handle_dv { + /* Flow DV api: */ + struct mlx5_flow_dv_matcher *matcher; /**< Cache to matcher. */ + struct mlx5_flow_dv_modify_hdr_resource *modify_hdr; + /**< Pointer to modify header resource in cache. */ + uint32_t rix_encap_decap; + /**< Index to encap/decap resource in cache. */ + uint32_t rix_push_vlan; + /**< Index to push VLAN action resource in cache. */ + uint32_t rix_tag; + /**< Index to the tag action. */ +} __rte_packed; + +/** Device flow handle structure: used both for creating & destroying. */ +struct mlx5_flow_handle { + SILIST_ENTRY(uint32_t)next; + struct mlx5_vf_vlan vf_vlan; /**< Structure for VF VLAN workaround. */ + /**< Index to next device flow handle. */ + uint64_t layers; + /**< Bit-fields of present layers, see MLX5_FLOW_LAYER_*. */ + void *ib_flow; /**< Verbs flow pointer. */ + uint32_t split_flow_id:28; /**< Sub flow unique match flow id. */ + uint32_t mark:1; /**< Metadate rxq mark flag. */ + uint32_t fate_action:3; /**< Fate action type. */ + union { + uint32_t rix_hrxq; /**< Hash Rx queue object index. */ + uint32_t rix_jump; /**< Index to the jump action resource. */ + uint32_t rix_port_id_action; + /**< Index to port ID action resource. */ + uint32_t rix_fate; + /**< Generic value indicates the fate action. */ + }; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_flow_handle_dv dvh; +#endif +} __rte_packed; + +/* + * Size for Verbs device flow handle structure only. Do not use the DV only + * structure in Verbs. No DV flows attributes will be accessed. + * Macro offsetof() could also be used here. + */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +#define MLX5_FLOW_HANDLE_VERBS_SIZE \ + (sizeof(struct mlx5_flow_handle) - sizeof(struct mlx5_flow_handle_dv)) +#else +#define MLX5_FLOW_HANDLE_VERBS_SIZE (sizeof(struct mlx5_flow_handle)) +#endif + +/* + * Max number of actions per DV flow. + * See CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED + * in rdma-core file providers/mlx5/verbs.c. + */ +#define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8 + +/** Device flow structure only for DV flow creation. */ +struct mlx5_flow_dv_workspace { + uint32_t group; /**< The group index. */ + uint8_t transfer; /**< 1 if the flow is E-Switch flow. */ + int actions_n; /**< number of actions. */ + void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS]; /**< Action list. */ + struct mlx5_flow_dv_encap_decap_resource *encap_decap; + /**< Pointer to encap/decap resource in cache. */ + struct mlx5_flow_dv_push_vlan_action_resource *push_vlan_res; + /**< Pointer to push VLAN action resource in cache. */ + struct mlx5_flow_dv_tag_resource *tag_resource; + /**< pointer to the tag action. */ + struct mlx5_flow_dv_port_id_action_resource *port_id_action; + /**< Pointer to port ID action resource. */ + struct mlx5_flow_dv_jump_tbl_resource *jump; + /**< Pointer to the jump action resource. */ + struct mlx5_flow_dv_match_params value; + /**< Holds the value that the packet is compared to. */ +}; + +/* + * Maximal Verbs flow specifications & actions size. + * Some elements are mutually exclusive, but enough space should be allocated. + * Tunnel cases: 1. Max 2 Ethernet + IP(v6 len > v4 len) + TCP/UDP headers. + * 2. 
One tunnel header (exception: GRE + MPLS), + * SPEC length: GRE == tunnel. + * Actions: 1. 1 Mark OR Flag. + * 2. 1 Drop (if any). + * 3. No limitation for counters, but it makes no sense to support too + * many counters in a single device flow. + */ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT +#define MLX5_VERBS_MAX_SPEC_SIZE \ + ( \ + (2 * (sizeof(struct ibv_flow_spec_eth) + \ + sizeof(struct ibv_flow_spec_ipv6) + \ + sizeof(struct ibv_flow_spec_tcp_udp)) + \ + sizeof(struct ibv_flow_spec_gre) + \ + sizeof(struct ibv_flow_spec_mpls)) \ + ) +#else +#define MLX5_VERBS_MAX_SPEC_SIZE \ + ( \ + (2 * (sizeof(struct ibv_flow_spec_eth) + \ + sizeof(struct ibv_flow_spec_ipv6) + \ + sizeof(struct ibv_flow_spec_tcp_udp)) + \ + sizeof(struct ibv_flow_spec_tunnel)) \ + ) +#endif + +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) +#define MLX5_VERBS_MAX_ACT_SIZE \ + ( \ + sizeof(struct ibv_flow_spec_action_tag) + \ + sizeof(struct ibv_flow_spec_action_drop) + \ + sizeof(struct ibv_flow_spec_counter_action) * 4 \ + ) +#else +#define MLX5_VERBS_MAX_ACT_SIZE \ + ( \ + sizeof(struct ibv_flow_spec_action_tag) + \ + sizeof(struct ibv_flow_spec_action_drop) \ + ) +#endif + +#define MLX5_VERBS_MAX_SPEC_ACT_SIZE \ + (MLX5_VERBS_MAX_SPEC_SIZE + MLX5_VERBS_MAX_ACT_SIZE) + +/** Device flow structure only for Verbs flow creation. */ +struct mlx5_flow_verbs_workspace { + unsigned int size; /**< Size of the attribute. */ + struct ibv_flow_attr attr; /**< Verbs flow attribute buffer. */ + uint8_t specs[MLX5_VERBS_MAX_SPEC_ACT_SIZE]; + /**< Specifications & actions buffer of verbs flow. */ +}; + +/** Maximal number of device sub-flows supported. */ +#define MLX5_NUM_MAX_DEV_FLOWS 32 + +/** Device flow structure. */ +struct mlx5_flow { + struct rte_flow *flow; /**< Pointer to the main flow. */ + uint32_t flow_idx; /**< The memory pool index to the main flow. */ + uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */ + uint64_t act_flags; + /**< Bit-fields of detected actions, see MLX5_FLOW_ACTION_*. */ + bool external; /**< true if the flow is created external to PMD. */ + uint8_t ingress; /**< 1 if the flow is ingress. */ + union { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_flow_dv_workspace dv; +#endif + struct mlx5_flow_verbs_workspace verbs; + }; + struct mlx5_flow_handle *handle; + uint32_t handle_idx; /* Index of the mlx5 flow handle memory. */ +}; + +/* Flow meter state. */ +#define MLX5_FLOW_METER_DISABLE 0 +#define MLX5_FLOW_METER_ENABLE 1 + +#define MLX5_MAN_WIDTH 8 +/* Modify this value if enum rte_mtr_color changes. */ +#define RTE_MTR_DROPPED RTE_COLORS + +/* Meter policer statistics */ +struct mlx5_flow_policer_stats { + uint32_t cnt[RTE_COLORS + 1]; + /**< Color counter, extra for drop. */ + uint64_t stats_mask; + /**< Statistics mask for the colors. */ +}; + +/* Meter table structure. */ +struct mlx5_meter_domain_info { + struct mlx5_flow_tbl_resource *tbl; + /**< Meter table. */ + struct mlx5_flow_tbl_resource *sfx_tbl; + /**< Meter suffix table. */ + void *any_matcher; + /**< Meter color not match default criteria. */ + void *color_matcher; + /**< Meter color match criteria. */ + void *jump_actn; + /**< Meter match action. */ + void *policer_rules[RTE_MTR_DROPPED + 1]; + /**< Meter policer for the match. */ +}; + +/* Meter table set for TX RX FDB. */ +struct mlx5_meter_domains_infos { + uint32_t ref_cnt; + /**< Table user count. */ + struct mlx5_meter_domain_info egress; + /**< TX meter table. 
*/ + struct mlx5_meter_domain_info ingress; + /**< RX meter table. */ + struct mlx5_meter_domain_info transfer; + /**< FDB meter table. */ + void *drop_actn; + /**< Drop action as not matched. */ + void *count_actns[RTE_MTR_DROPPED + 1]; + /**< Counters for match and unmatched statistics. */ + uint32_t fmp[MLX5_ST_SZ_DW(flow_meter_parameters)]; + /**< Flow meter parameter. */ + size_t fmp_size; + /**< Flow meter parameter size. */ + void *meter_action; + /**< Flow meter action. */ +}; + +/* Meter parameter structure. */ +struct mlx5_flow_meter { + TAILQ_ENTRY(mlx5_flow_meter) next; + /**< Pointer to the next flow meter structure. */ + uint32_t idx; /* Index to meter object. */ + uint32_t meter_id; + /**< Meter id. */ + struct mlx5_flow_meter_profile *profile; + /**< Meter profile parameters. */ + + /** Policer actions (per meter output color). */ + enum rte_mtr_policer_action action[RTE_COLORS]; + + /** Set of stats counters to be enabled. + * @see enum rte_mtr_stats_type + */ + uint64_t stats_mask; + + /**< Rule applies to ingress traffic. */ + uint32_t ingress:1; + + /**< Rule applies to egress traffic. */ + uint32_t egress:1; + /** + * Instead of simply matching the properties of traffic as it would + * appear on a given DPDK port ID, enabling this attribute transfers + * a flow rule to the lowest possible level of any device endpoints + * found in the pattern. + * + * When supported, this effectively enables an application to + * re-route traffic not necessarily intended for it (e.g. coming + * from or addressed to different physical ports, VFs or + * applications) at the device level. + * + * It complements the behavior of some pattern items such as + * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them. + * + * When transferring flow rules, ingress and egress attributes keep + * their original meaning, as if processing traffic emitted or + * received by the application. + */ + uint32_t transfer:1; + struct mlx5_meter_domains_infos *mfts; + /**< Flow table created for this meter. */ + struct mlx5_flow_policer_stats policer_stats; + /**< Meter policer statistics. */ + uint32_t ref_cnt; + /**< Use count. */ + uint32_t active_state:1; + /**< Meter state. */ + uint32_t shared:1; + /**< Meter shared or not. */ +}; + +/* RFC2697 parameter structure. */ +struct mlx5_flow_meter_srtcm_rfc2697_prm { + /* green_saturation_value = cbs_mantissa * 2^cbs_exponent */ + uint32_t cbs_exponent:5; + uint32_t cbs_mantissa:8; + /* cir = 8G * cir_mantissa * 1/(2^cir_exponent) Bytes/Sec */ + uint32_t cir_exponent:5; + uint32_t cir_mantissa:8; + /* yellow _saturation_value = ebs_mantissa * 2^ebs_exponent */ + uint32_t ebs_exponent:5; + uint32_t ebs_mantissa:8; +}; + +/* Flow meter profile structure. */ +struct mlx5_flow_meter_profile { + TAILQ_ENTRY(mlx5_flow_meter_profile) next; + /**< Pointer to the next flow meter structure. */ + uint32_t meter_profile_id; /**< Profile id. */ + struct rte_mtr_meter_profile profile; /**< Profile detail. */ + union { + struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm_prm; + /**< srtcm_rfc2697 struct. */ + }; + uint32_t ref_cnt; /**< Use count. */ +}; + +/* Fdir flow structure */ +struct mlx5_fdir_flow { + LIST_ENTRY(mlx5_fdir_flow) next; /* Pointer to the next element. */ + struct mlx5_fdir *fdir; /* Pointer to fdir. */ + uint32_t rix_flow; /* Index to flow. */ +}; + +#define HAIRPIN_FLOW_ID_BITS 28 + +/* Flow structure. */ +struct rte_flow { + ILIST_ENTRY(uint32_t)next; /**< Index to the next flow structure. 
*/ + uint32_t dev_handles; + /**< Device flow handles that are part of the flow. */ + uint32_t drv_type:2; /**< Driver type. */ + uint32_t fdir:1; /**< Identifier of associated FDIR if any. */ + uint32_t hairpin_flow_id:HAIRPIN_FLOW_ID_BITS; + /**< The flow id used for hairpin. */ + uint32_t copy_applied:1; /**< The MARK copy Flow os applied. */ + uint32_t rix_mreg_copy; + /**< Index to metadata register copy table resource. */ + uint32_t counter; /**< Holds flow counter. */ + uint16_t meter; /**< Holds flow meter id. */ +} __rte_packed; + +typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, + int hairpin, + struct rte_flow_error *error); +typedef struct mlx5_flow *(*mlx5_flow_prepare_t) + (struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], struct rte_flow_error *error); +typedef int (*mlx5_flow_translate_t)(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +typedef int (*mlx5_flow_apply_t)(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +typedef void (*mlx5_flow_remove_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef void (*mlx5_flow_destroy_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef int (*mlx5_flow_query_t)(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error); +typedef struct mlx5_meter_domains_infos *(*mlx5_flow_create_mtr_tbls_t) + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm); +typedef int (*mlx5_flow_destroy_mtr_tbls_t)(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbls); +typedef int (*mlx5_flow_create_policer_rules_t) + (struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +typedef int (*mlx5_flow_destroy_policer_rules_t) + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +typedef uint32_t (*mlx5_flow_counter_alloc_t) + (struct rte_eth_dev *dev); +typedef void (*mlx5_flow_counter_free_t)(struct rte_eth_dev *dev, + uint32_t cnt); +typedef int (*mlx5_flow_counter_query_t)(struct rte_eth_dev *dev, + uint32_t cnt, + bool clear, uint64_t *pkts, + uint64_t *bytes); +typedef int (*mlx5_flow_get_aged_flows_t) + (struct rte_eth_dev *dev, + void **context, + uint32_t nb_contexts, + struct rte_flow_error *error); +struct mlx5_flow_driver_ops { + mlx5_flow_validate_t validate; + mlx5_flow_prepare_t prepare; + mlx5_flow_translate_t translate; + mlx5_flow_apply_t apply; + mlx5_flow_remove_t remove; + mlx5_flow_destroy_t destroy; + mlx5_flow_query_t query; + mlx5_flow_create_mtr_tbls_t create_mtr_tbls; + mlx5_flow_destroy_mtr_tbls_t destroy_mtr_tbls; + mlx5_flow_create_policer_rules_t create_policer_rules; + mlx5_flow_destroy_policer_rules_t destroy_policer_rules; + mlx5_flow_counter_alloc_t counter_alloc; + mlx5_flow_counter_free_t counter_free; + mlx5_flow_counter_query_t counter_query; + mlx5_flow_get_aged_flows_t get_aged_flows; +}; + +/* mlx5_flow.c */ + +struct mlx5_flow_id_pool *mlx5_flow_id_pool_alloc(uint32_t max_id); +void mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool); +uint32_t mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, 
uint32_t *id); +uint32_t mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, + uint32_t id); +int mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, + bool external, uint32_t group, bool fdb_def_rule, + uint32_t *table, struct rte_flow_error *error); +uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc, + int tunnel, uint64_t layer_types, + uint64_t hash_fields); +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority); +int mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, + struct rte_flow_error *error); +const struct rte_flow_action *mlx5_flow_find_action + (const struct rte_flow_action *actions, + enum rte_flow_action_type action); +int mlx5_flow_validate_action_count(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_drop(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error); +int mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error); +int mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_gre_key(const struct rte_flow_item *item, + uint64_t item_flags, + const struct rte_flow_item *gre_item, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv4 *acc_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t last_item, + uint16_t ether_type, + const struct rte_flow_item_ipv6 *acc_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + uint64_t item_flags, + uint64_t prev_layer, + struct rte_flow_error *error); +int mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + 
struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error); +int mlx5_flow_validate_item_icmp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_icmp6(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_nvgre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_geneve(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error); +struct mlx5_meter_domains_infos *mlx5_flow_create_mtr_tbls + (struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm); +int mlx5_flow_destroy_mtr_tbls(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbl); +int mlx5_flow_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr); +int mlx5_flow_meter_flush(struct rte_eth_dev *dev, + struct rte_mtr_error *error); +#endif /* RTE_PMD_MLX5_FLOW_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c new file mode 100644 index 000000000..e48183195 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_dv.c @@ -0,0 +1,9666 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_cycles.h> +#include <rte_ip.h> +#include <rte_gre.h> +#include <rte_vxlan.h> +#include <rte_gtp.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + +#ifndef HAVE_IBV_FLOW_DEVX_COUNTERS +#define MLX5DV_FLOW_ACTION_COUNTERS_DEVX 0 +#endif + +#ifndef HAVE_MLX5DV_DR_ESWITCH +#ifndef MLX5DV_FLOW_TABLE_TYPE_FDB +#define MLX5DV_FLOW_TABLE_TYPE_FDB 0 +#endif +#endif + +#ifndef HAVE_MLX5DV_DR +#define MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL 1 +#endif + +/* VLAN header definitions */ +#define MLX5DV_FLOW_VLAN_PCP_SHIFT 13 +#define MLX5DV_FLOW_VLAN_PCP_MASK (0x7 << MLX5DV_FLOW_VLAN_PCP_SHIFT) +#define MLX5DV_FLOW_VLAN_VID_MASK 0x0fff +#define MLX5DV_FLOW_VLAN_PCP_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK) +#define MLX5DV_FLOW_VLAN_VID_MASK_BE RTE_BE16(MLX5DV_FLOW_VLAN_VID_MASK) + +union flow_dv_attr { + struct { + uint32_t valid:1; + uint32_t ipv4:1; + uint32_t ipv6:1; + uint32_t tcp:1; + uint32_t udp:1; + uint32_t reserved:27; + }; + uint32_t attr; +}; + +static int +flow_dv_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_tbl_resource *tbl); + +/** + * Initialize flow attributes structure according to flow items' types. + * + * flow_dv_validate() avoids multiple L3/L4 layers cases other than tunnel + * mode. For tunnel mode, the items to be modified are the outermost ones. + * + * @param[in] item + * Pointer to item specification. + * @param[out] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + */ +static void +flow_dv_attr_init(const struct rte_flow_item *item, union flow_dv_attr *attr, + struct mlx5_flow *dev_flow, bool tunnel_decap) +{ + uint64_t layers = dev_flow->handle->layers; + + /* + * If layers is already initialized, it means this dev_flow is the + * suffix flow, the layers flags is set by the prefix flow. Need to + * use the layer flags from prefix flow as the suffix flow may not + * have the user defined items as the flow is split. 
+ */ + if (layers) { + if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + attr->ipv4 = 1; + else if (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6) + attr->ipv6 = 1; + if (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP) + attr->tcp = 1; + else if (layers & MLX5_FLOW_LAYER_OUTER_L4_UDP) + attr->udp = 1; + attr->valid = 1; + return; + } + for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + uint8_t next_protocol = 0xff; + switch (item->type) { + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + case RTE_FLOW_ITEM_TYPE_VXLAN: + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + case RTE_FLOW_ITEM_TYPE_GENEVE: + case RTE_FLOW_ITEM_TYPE_MPLS: + if (tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + if (!attr->ipv6) + attr->ipv4 = 1; + if (item->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + item->mask)->hdr.next_proto_id) + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (item->spec))->hdr.next_proto_id & + ((const struct rte_flow_item_ipv4 *) + (item->mask))->hdr.next_proto_id; + if ((next_protocol == IPPROTO_IPIP || + next_protocol == IPPROTO_IPV6) && tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + if (!attr->ipv4) + attr->ipv6 = 1; + if (item->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + item->mask)->hdr.proto) + next_protocol = + ((const struct rte_flow_item_ipv6 *) + (item->spec))->hdr.proto & + ((const struct rte_flow_item_ipv6 *) + (item->mask))->hdr.proto; + if ((next_protocol == IPPROTO_IPIP || + next_protocol == IPPROTO_IPV6) && tunnel_decap) + attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + if (!attr->tcp) + attr->udp = 1; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + if (!attr->udp) + attr->tcp = 1; + break; + default: + break; + } + } + attr->valid = 1; +} + +/** + * Convert rte_mtr_color to mlx5 color. + * + * @param[in] rcol + * rte_mtr_color. + * + * @return + * mlx5 color. + */ +static int +rte_col_2_mlx5_col(enum rte_color rcol) +{ + switch (rcol) { + case RTE_COLOR_GREEN: + return MLX5_FLOW_COLOR_GREEN; + case RTE_COLOR_YELLOW: + return MLX5_FLOW_COLOR_YELLOW; + case RTE_COLOR_RED: + return MLX5_FLOW_COLOR_RED; + default: + break; + } + return MLX5_FLOW_COLOR_UNDEFINED; +} + +struct field_modify_info { + uint32_t size; /* Size of field in protocol header, in bytes. */ + uint32_t offset; /* Offset of field in protocol header, in bytes. */ + enum mlx5_modification_field id; +}; + +struct field_modify_info modify_eth[] = { + {4, 0, MLX5_MODI_OUT_DMAC_47_16}, + {2, 4, MLX5_MODI_OUT_DMAC_15_0}, + {4, 6, MLX5_MODI_OUT_SMAC_47_16}, + {2, 10, MLX5_MODI_OUT_SMAC_15_0}, + {0, 0, 0}, +}; + +struct field_modify_info modify_vlan_out_first_vid[] = { + /* Size in bits !!! 
*/ + {12, 0, MLX5_MODI_OUT_FIRST_VID}, + {0, 0, 0}, +}; + +struct field_modify_info modify_ipv4[] = { + {1, 1, MLX5_MODI_OUT_IP_DSCP}, + {1, 8, MLX5_MODI_OUT_IPV4_TTL}, + {4, 12, MLX5_MODI_OUT_SIPV4}, + {4, 16, MLX5_MODI_OUT_DIPV4}, + {0, 0, 0}, +}; + +struct field_modify_info modify_ipv6[] = { + {1, 0, MLX5_MODI_OUT_IP_DSCP}, + {1, 7, MLX5_MODI_OUT_IPV6_HOPLIMIT}, + {4, 8, MLX5_MODI_OUT_SIPV6_127_96}, + {4, 12, MLX5_MODI_OUT_SIPV6_95_64}, + {4, 16, MLX5_MODI_OUT_SIPV6_63_32}, + {4, 20, MLX5_MODI_OUT_SIPV6_31_0}, + {4, 24, MLX5_MODI_OUT_DIPV6_127_96}, + {4, 28, MLX5_MODI_OUT_DIPV6_95_64}, + {4, 32, MLX5_MODI_OUT_DIPV6_63_32}, + {4, 36, MLX5_MODI_OUT_DIPV6_31_0}, + {0, 0, 0}, +}; + +struct field_modify_info modify_udp[] = { + {2, 0, MLX5_MODI_OUT_UDP_SPORT}, + {2, 2, MLX5_MODI_OUT_UDP_DPORT}, + {0, 0, 0}, +}; + +struct field_modify_info modify_tcp[] = { + {2, 0, MLX5_MODI_OUT_TCP_SPORT}, + {2, 2, MLX5_MODI_OUT_TCP_DPORT}, + {4, 4, MLX5_MODI_OUT_TCP_SEQ_NUM}, + {4, 8, MLX5_MODI_OUT_TCP_ACK_NUM}, + {0, 0, 0}, +}; + +static void +mlx5_flow_tunnel_ip_check(const struct rte_flow_item *item __rte_unused, + uint8_t next_protocol, uint64_t *item_flags, + int *tunnel) +{ + MLX5_ASSERT(item->type == RTE_FLOW_ITEM_TYPE_IPV4 || + item->type == RTE_FLOW_ITEM_TYPE_IPV6); + if (next_protocol == IPPROTO_IPIP) { + *item_flags |= MLX5_FLOW_LAYER_IPIP; + *tunnel = 1; + } + if (next_protocol == IPPROTO_IPV6) { + *item_flags |= MLX5_FLOW_LAYER_IPV6_ENCAP; + *tunnel = 1; + } +} + +/** + * Acquire the synchronizing object to protect multithreaded access + * to shared dv context. Lock occurs only if context is actually + * shared, i.e. we have multiport IB device and representors are + * created. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + */ +static void +flow_dv_shared_lock(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (sh->dv_refcnt > 1) { + int ret; + + ret = pthread_mutex_lock(&sh->dv_mutex); + MLX5_ASSERT(!ret); + (void)ret; + } +} + +static void +flow_dv_shared_unlock(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + + if (sh->dv_refcnt > 1) { + int ret; + + ret = pthread_mutex_unlock(&sh->dv_mutex); + MLX5_ASSERT(!ret); + (void)ret; + } +} + +/* Update VLAN's VID/PCP based on input rte_flow_action. + * + * @param[in] action + * Pointer to struct rte_flow_action. + * @param[out] vlan + * Pointer to struct rte_vlan_hdr. + */ +static void +mlx5_update_vlan_vid_pcp(const struct rte_flow_action *action, + struct rte_vlan_hdr *vlan) +{ + uint16_t vlan_tci; + if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) { + vlan_tci = + ((const struct rte_flow_action_of_set_vlan_pcp *) + action->conf)->vlan_pcp; + vlan_tci = vlan_tci << MLX5DV_FLOW_VLAN_PCP_SHIFT; + vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK; + vlan->vlan_tci |= vlan_tci; + } else if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) { + vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK; + vlan->vlan_tci |= rte_be_to_cpu_16 + (((const struct rte_flow_action_of_set_vlan_vid *) + action->conf)->vlan_vid); + } +} + +/** + * Fetch 1, 2, 3 or 4 byte field from the byte array + * and return as unsigned integer in host-endian format. + * + * @param[in] data + * Pointer to data array. + * @param[in] size + * Size of field to extract. + * + * @return + * converted field in host endian format. 
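The TCI handling in mlx5_update_vlan_vid_pcp() above packs the 3-bit PCP field at bit 13 and the 12-bit VID in the low bits of the same 16-bit tag control word. Below is a minimal standalone sketch of that masking, with the MLX5DV_FLOW_VLAN_* constants re-declared locally; it is an illustration only, not driver code.

#include <stdint.h>
#include <stdio.h>

#define VLAN_PCP_SHIFT 13
#define VLAN_PCP_MASK  (0x7 << VLAN_PCP_SHIFT)
#define VLAN_VID_MASK  0x0fff

static uint16_t set_pcp(uint16_t tci, uint16_t pcp)
{
	/* Clear the 3 PCP bits, then insert the new priority. */
	tci &= ~VLAN_PCP_MASK;
	return tci | (uint16_t)(pcp << VLAN_PCP_SHIFT);
}

static uint16_t set_vid(uint16_t tci, uint16_t vid)
{
	/* Clear the 12 VID bits, then insert the new VLAN ID. */
	tci &= ~VLAN_VID_MASK;
	return tci | (vid & VLAN_VID_MASK);
}

int main(void)
{
	uint16_t tci = 0;

	tci = set_pcp(tci, 5);		/* priority 5 */
	tci = set_vid(tci, 100);	/* VLAN 100 */
	printf("tci=0x%04x\n", (unsigned int)tci);	/* 0xa064 */
	return 0;
}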
+ */ +static inline uint32_t +flow_dv_fetch_field(const uint8_t *data, uint32_t size) +{ + uint32_t ret; + + switch (size) { + case 1: + ret = *data; + break; + case 2: + ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data); + break; + case 3: + ret = rte_be_to_cpu_16(*(const unaligned_uint16_t *)data); + ret = (ret << 8) | *(data + sizeof(uint16_t)); + break; + case 4: + ret = rte_be_to_cpu_32(*(const unaligned_uint32_t *)data); + break; + default: + MLX5_ASSERT(false); + ret = 0; + break; + } + return ret; +} + +/** + * Convert modify-header action to DV specification. + * + * Data length of each action is determined by provided field description + * and the item mask. Data bit offset and width of each action is determined + * by provided item mask. + * + * @param[in] item + * Pointer to item specification. + * @param[in] field + * Pointer to field modification information. + * For MLX5_MODIFICATION_TYPE_SET specifies destination field. + * For MLX5_MODIFICATION_TYPE_ADD specifies destination field. + * For MLX5_MODIFICATION_TYPE_COPY specifies source field. + * @param[in] dcopy + * Destination field info for MLX5_MODIFICATION_TYPE_COPY in @type. + * Negative offset value sets the same offset as source offset. + * size field is ignored, value is taken from source field. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] type + * Type of modification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_modify_action(struct rte_flow_item *item, + struct field_modify_info *field, + struct field_modify_info *dcopy, + struct mlx5_flow_dv_modify_hdr_resource *resource, + uint32_t type, struct rte_flow_error *error) +{ + uint32_t i = resource->actions_num; + struct mlx5_modification_cmd *actions = resource->actions; + + /* + * The item and mask are provided in big-endian format. + * The fields should be presented as in big-endian format either. + * Mask must be always present, it defines the actual field width. + */ + MLX5_ASSERT(item->mask); + MLX5_ASSERT(field->size); + do { + unsigned int size_b; + unsigned int off_b; + uint32_t mask; + uint32_t data; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + /* Fetch variable byte size mask from the array. */ + mask = flow_dv_fetch_field((const uint8_t *)item->mask + + field->offset, field->size); + if (!mask) { + ++field; + continue; + } + /* Deduce actual data width in bits from mask value. */ + off_b = rte_bsf32(mask); + size_b = sizeof(uint32_t) * CHAR_BIT - + off_b - __builtin_clz(mask); + MLX5_ASSERT(size_b); + size_b = size_b == sizeof(uint32_t) * CHAR_BIT ? 0 : size_b; + actions[i] = (struct mlx5_modification_cmd) { + .action_type = type, + .field = field->id, + .offset = off_b, + .length = size_b, + }; + /* Convert entire record to expected big-endian format. */ + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + if (type == MLX5_MODIFICATION_TYPE_COPY) { + MLX5_ASSERT(dcopy); + actions[i].dst_field = dcopy->id; + actions[i].dst_offset = + (int)dcopy->offset < 0 ? off_b : dcopy->offset; + /* Convert entire record to big-endian format. */ + actions[i].data1 = rte_cpu_to_be_32(actions[i].data1); + } else { + MLX5_ASSERT(item->spec); + data = flow_dv_fetch_field((const uint8_t *)item->spec + + field->offset, field->size); + /* Shift out the trailing masked bits from data. 
*/ + data = (data & mask) >> off_b; + actions[i].data1 = rte_cpu_to_be_32(data); + } + ++i; + ++field; + } while (field->size); + if (resource->actions_num == i) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "invalid modification flow item"); + resource->actions_num = i; + return 0; +} + +/** + * Convert modify-header set IPv4 address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv4 + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ipv4 *conf = + (const struct rte_flow_action_set_ipv4 *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 }; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC) { + ipv4.hdr.src_addr = conf->ipv4_addr; + ipv4_mask.hdr.src_addr = rte_flow_item_ipv4_mask.hdr.src_addr; + } else { + ipv4.hdr.dst_addr = conf->ipv4_addr; + ipv4_mask.hdr.dst_addr = rte_flow_item_ipv4_mask.hdr.dst_addr; + } + item.spec = &ipv4; + item.mask = &ipv4_mask; + return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv6 address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv6 + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ipv6 *conf = + (const struct rte_flow_action_set_ipv6 *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 }; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC) { + memcpy(&ipv6.hdr.src_addr, &conf->ipv6_addr, + sizeof(ipv6.hdr.src_addr)); + memcpy(&ipv6_mask.hdr.src_addr, + &rte_flow_item_ipv6_mask.hdr.src_addr, + sizeof(ipv6.hdr.src_addr)); + } else { + memcpy(&ipv6.hdr.dst_addr, &conf->ipv6_addr, + sizeof(ipv6.hdr.dst_addr)); + memcpy(&ipv6_mask.hdr.dst_addr, + &rte_flow_item_ipv6_mask.hdr.dst_addr, + sizeof(ipv6.hdr.dst_addr)); + } + item.spec = &ipv6; + item.mask = &ipv6_mask; + return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set MAC address action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
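flow_dv_convert_modify_action() above derives each modification command's bit offset and width purely from the item mask: the offset is the position of the lowest set bit (rte_bsf32()) and the width is 32 minus the leading and trailing zero counts, with a full 32-bit width then encoded as 0. A small host-side sketch of that arithmetic, using the GCC builtins in place of the DPDK helpers (illustration only, assumes a non-zero mask):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Deduce the bit offset and width of the field selected by a non-zero mask,
 * mirroring the per-field arithmetic in flow_dv_convert_modify_action(). */
static void mask_to_off_len(uint32_t mask, unsigned int *off, unsigned int *len)
{
	*off = (unsigned int)__builtin_ctz(mask);	/* lowest set bit */
	*len = (unsigned int)(sizeof(uint32_t) * CHAR_BIT
			      - *off - (unsigned int)__builtin_clz(mask));
}

int main(void)
{
	unsigned int off, len;

	mask_to_off_len(0x00fff000u, &off, &len);
	printf("offset=%u length=%u\n", off, len);	/* offset=12 length=12 */
	return 0;
}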
+ */ +static int +flow_dv_convert_action_modify_mac + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_mac *conf = + (const struct rte_flow_action_set_mac *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_ETH }; + struct rte_flow_item_eth eth; + struct rte_flow_item_eth eth_mask; + + memset(&eth, 0, sizeof(eth)); + memset(&eth_mask, 0, sizeof(eth_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC) { + memcpy(&eth.src.addr_bytes, &conf->mac_addr, + sizeof(eth.src.addr_bytes)); + memcpy(&eth_mask.src.addr_bytes, + &rte_flow_item_eth_mask.src.addr_bytes, + sizeof(eth_mask.src.addr_bytes)); + } else { + memcpy(&eth.dst.addr_bytes, &conf->mac_addr, + sizeof(eth.dst.addr_bytes)); + memcpy(&eth_mask.dst.addr_bytes, + &rte_flow_item_eth_mask.dst.addr_bytes, + sizeof(eth_mask.dst.addr_bytes)); + } + item.spec = &eth; + item.mask = &eth_mask; + return flow_dv_convert_modify_action(&item, modify_eth, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set VLAN VID action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_vlan_vid + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_of_set_vlan_vid *conf = + (const struct rte_flow_action_of_set_vlan_vid *)(action->conf); + int i = resource->actions_num; + struct mlx5_modification_cmd *actions = resource->actions; + struct field_modify_info *field = modify_vlan_out_first_vid; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + actions[i] = (struct mlx5_modification_cmd) { + .action_type = MLX5_MODIFICATION_TYPE_SET, + .field = field->id, + .length = field->size, + .offset = field->offset, + }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = conf->vlan_vid; + actions[i].data1 = actions[i].data1 << 16; + resource->actions_num = ++i; + return 0; +} + +/** + * Convert modify-header set TP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */ +static int +flow_dv_convert_action_modify_tp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + const struct rte_flow_action_set_tp *conf = + (const struct rte_flow_action_set_tp *)(action->conf); + struct rte_flow_item item; + struct rte_flow_item_udp udp; + struct rte_flow_item_udp udp_mask; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->udp) { + memset(&udp, 0, sizeof(udp)); + memset(&udp_mask, 0, sizeof(udp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) { + udp.hdr.src_port = conf->port; + udp_mask.hdr.src_port = + rte_flow_item_udp_mask.hdr.src_port; + } else { + udp.hdr.dst_port = conf->port; + udp_mask.hdr.dst_port = + rte_flow_item_udp_mask.hdr.dst_port; + } + item.type = RTE_FLOW_ITEM_TYPE_UDP; + item.spec = &udp; + item.mask = &udp_mask; + field = modify_udp; + } else { + MLX5_ASSERT(attr->tcp); + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) { + tcp.hdr.src_port = conf->port; + tcp_mask.hdr.src_port = + rte_flow_item_tcp_mask.hdr.src_port; + } else { + tcp.hdr.dst_port = conf->port; + tcp_mask.hdr.dst_port = + rte_flow_item_tcp_mask.hdr.dst_port; + } + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + field = modify_tcp; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set TTL action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_convert_action_modify_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + const struct rte_flow_action_set_ttl *conf = + (const struct rte_flow_action_set_ttl *)(action->conf); + struct rte_flow_item item; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.time_to_live = conf->ttl_value; + ipv4_mask.hdr.time_to_live = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV4; + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; + } else { + MLX5_ASSERT(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = conf->ttl_value; + ipv6_mask.hdr.hop_limits = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV6; + item.spec = &ipv6; + item.mask = &ipv6_mask; + field = modify_ipv6; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header decrement TTL action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. + * @param[in] dev_flow + * Pointer to the sub flow. + * @param[in] tunnel_decap + * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_dec_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_item *items, + union flow_dv_attr *attr, struct mlx5_flow *dev_flow, + bool tunnel_decap, struct rte_flow_error *error) +{ + struct rte_flow_item item; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + struct field_modify_info *field; + + if (!attr->valid) + flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.time_to_live = 0xFF; + ipv4_mask.hdr.time_to_live = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV4; + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; + } else { + MLX5_ASSERT(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = 0xFF; + ipv6_mask.hdr.hop_limits = 0xFF; + item.type = RTE_FLOW_ITEM_TYPE_IPV6; + item.spec = &ipv6; + item.mask = &ipv6_mask; + field = modify_ipv6; + } + return flow_dv_convert_modify_action(&item, field, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +/** + * Convert modify-header increment/decrement TCP Sequence number + * to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_tcp_seq + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const rte_be32_t *conf = (const rte_be32_t *)(action->conf); + uint64_t value = rte_be_to_cpu_32(*conf); + struct rte_flow_item item; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ) + /* + * The HW has no decrement operation, only increment operation. + * To simulate decrement X from Y using increment operation + * we need to add UINT32_MAX X times to Y. + * Each adding of UINT32_MAX decrements Y by 1. + */ + value *= UINT32_MAX; + tcp.hdr.sent_seq = rte_cpu_to_be_32((uint32_t)value); + tcp_mask.hdr.sent_seq = RTE_BE32(UINT32_MAX); + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +/** + * Convert modify-header increment/decrement TCP Acknowledgment number + * to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_tcp_ack + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const rte_be32_t *conf = (const rte_be32_t *)(action->conf); + uint64_t value = rte_be_to_cpu_32(*conf); + struct rte_flow_item item; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_tcp tcp_mask; + + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK) + /* + * The HW has no decrement operation, only increment operation. + * To simulate decrement X from Y using increment operation + * we need to add UINT32_MAX X times to Y. + * Each adding of UINT32_MAX decrements Y by 1. + */ + value *= UINT32_MAX; + tcp.hdr.recv_ack = rte_cpu_to_be_32((uint32_t)value); + tcp_mask.hdr.recv_ack = RTE_BE32(UINT32_MAX); + item.type = RTE_FLOW_ITEM_TYPE_TCP; + item.spec = &tcp; + item.mask = &tcp_mask; + return flow_dv_convert_modify_action(&item, modify_tcp, NULL, resource, + MLX5_MODIFICATION_TYPE_ADD, error); +} + +static enum mlx5_modification_field reg_to_field[] = { + [REG_NONE] = MLX5_MODI_OUT_NONE, + [REG_A] = MLX5_MODI_META_DATA_REG_A, + [REG_B] = MLX5_MODI_META_DATA_REG_B, + [REG_C_0] = MLX5_MODI_META_REG_C_0, + [REG_C_1] = MLX5_MODI_META_REG_C_1, + [REG_C_2] = MLX5_MODI_META_REG_C_2, + [REG_C_3] = MLX5_MODI_META_REG_C_3, + [REG_C_4] = MLX5_MODI_META_REG_C_4, + [REG_C_5] = MLX5_MODI_META_REG_C_5, + [REG_C_6] = MLX5_MODI_META_REG_C_6, + [REG_C_7] = MLX5_MODI_META_REG_C_7, +}; + +/** + * Convert register set to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
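The comments in flow_dv_convert_action_modify_tcp_seq() and ..._tcp_ack() above describe emulating subtraction with the hardware ADD operation: multiplying the decrement by UINT32_MAX yields its additive inverse modulo 2^32, so a single ADD of that value decreases the field by the requested amount. A tiny standalone demonstration of the arithmetic (not driver code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t seq = 1000;	/* current TCP sequence number */
	uint32_t dec = 3;	/* amount to subtract */
	/* (2^32 - 1) * dec is congruent to -dec modulo 2^32, so one
	 * hardware ADD of this value decrements the field by 'dec'. */
	uint32_t add = (uint32_t)((uint64_t)UINT32_MAX * dec);

	printf("%" PRIu32 "\n", seq + add);	/* prints 997 */
	return 0;
}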
+ */ +static int +flow_dv_convert_action_set_reg + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct mlx5_rte_flow_action_set_tag *conf = action->conf; + struct mlx5_modification_cmd *actions = resource->actions; + uint32_t i = resource->actions_num; + + if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + MLX5_ASSERT(conf->id != REG_NONE); + MLX5_ASSERT(conf->id < RTE_DIM(reg_to_field)); + actions[i] = (struct mlx5_modification_cmd) { + .action_type = MLX5_MODIFICATION_TYPE_SET, + .field = reg_to_field[conf->id], + }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = rte_cpu_to_be_32(conf->data); + ++i; + resource->actions_num = i; + return 0; +} + +/** + * Convert SET_TAG action to DV specification. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] conf + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_set_tag + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action_set_tag *conf, + struct rte_flow_error *error) +{ + rte_be32_t data = rte_cpu_to_be_32(conf->data); + rte_be32_t mask = rte_cpu_to_be_32(conf->mask); + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + [1] = {0, 0, 0}, + }; + enum mlx5_modification_field reg_type; + int ret; + + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret != REG_NONE); + MLX5_ASSERT((unsigned int)ret < RTE_DIM(reg_to_field)); + reg_type = reg_to_field[ret]; + MLX5_ASSERT(reg_type > 0); + reg_c_x[0] = (struct field_modify_info){4, 0, reg_type}; + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert internal COPY_REG action to DV specification. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] res + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_copy_mreg(struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *res, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct mlx5_flow_action_copy_mreg *conf = action->conf; + rte_be32_t mask = RTE_BE32(UINT32_MAX); + struct rte_flow_item item = { + .spec = NULL, + .mask = &mask, + }; + struct field_modify_info reg_src[] = { + {4, 0, reg_to_field[conf->src]}, + {0, 0, 0}, + }; + struct field_modify_info reg_dst = { + .offset = 0, + .id = reg_to_field[conf->dst], + }; + /* Adjust reg_c[0] usage according to reported mask. */ + if (conf->dst == REG_C_0 || conf->src == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t reg_c0 = priv->sh->dv_regc0_mask; + + MLX5_ASSERT(reg_c0); + MLX5_ASSERT(priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY); + if (conf->dst == REG_C_0) { + /* Copy to reg_c[0], within mask only. 
*/ + reg_dst.offset = rte_bsf32(reg_c0); + /* + * Mask is ignoring the endianness, because + * there is no conversion in datapath. + */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + /* Copy from destination lower bits to reg_c[0]. */ + mask = reg_c0 >> reg_dst.offset; +#else + /* Copy from destination upper bits to reg_c[0]. */ + mask = reg_c0 << (sizeof(reg_c0) * CHAR_BIT - + rte_fls_u32(reg_c0)); +#endif + } else { + mask = rte_cpu_to_be_32(reg_c0); +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + /* Copy from reg_c[0] to destination lower bits. */ + reg_dst.offset = 0; +#else + /* Copy from reg_c[0] to destination upper bits. */ + reg_dst.offset = sizeof(reg_c0) * CHAR_BIT - + (rte_fls_u32(reg_c0) - + rte_bsf32(reg_c0)); +#endif + } + } + return flow_dv_convert_modify_action(&item, + reg_src, &reg_dst, res, + MLX5_MODIFICATION_TYPE_COPY, + error); +} + +/** + * Convert MARK action to DV specification. This routine is used + * in extensive metadata only and requires metadata register to be + * handled. In legacy mode hardware tag resource is engaged. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] conf + * Pointer to MARK action specification. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_mark(struct rte_eth_dev *dev, + const struct rte_flow_action_mark *conf, + struct mlx5_flow_dv_modify_hdr_resource *resource, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + rte_be32_t mask = rte_cpu_to_be_32(MLX5_FLOW_MARK_MASK & + priv->sh->dv_mark_mask); + rte_be32_t data = rte_cpu_to_be_32(conf->id) & mask; + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + {4, 0, 0}, /* dynamic instead of MLX5_MODI_META_REG_C_1. */ + {0, 0, 0}, + }; + int reg; + + if (!mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "zero mark action mask"); + reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (reg < 0) + return reg; + MLX5_ASSERT(reg > 0); + if (reg == REG_C_0) { + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + data = rte_cpu_to_be_32(rte_cpu_to_be_32(data) << shl_c0); + mask = rte_cpu_to_be_32(mask) & msk_c0; + mask = rte_cpu_to_be_32(mask << shl_c0); + } + reg_c_x[0].id = reg_to_field[reg]; + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Get metadata register index for specified steering domain. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Attributes of flow to determine steering domain. + * @param[out] error + * Pointer to the error structure. + * + * @return + * positive index on success, a negative errno value otherwise + * and rte_errno is set. + */ +static enum modify_reg +flow_dv_get_metadata_reg(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + int reg = + mlx5_flow_get_reg_id(dev, attr->transfer ? + MLX5_METADATA_FDB : + attr->egress ? + MLX5_METADATA_TX : + MLX5_METADATA_RX, 0, error); + if (reg < 0) + return rte_flow_error_set(error, + ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unavailable " + "metadata register"); + return reg; +} + +/** + * Convert SET_META action to DV specification.
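Both flow_dv_convert_action_copy_mreg() and flow_dv_convert_action_mark() above have to confine their writes to the software-owned bits of reg_c[0] reported in dv_regc0_mask, which means shifting the value up to the mask's lowest set bit before applying it. A simplified host-order sketch of that packing step follows; the driver additionally performs big-endian conversion, which is omitted here, and the example mask is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Place 'val' into the sub-field of a 32-bit register described by a
 * non-zero 'mask' (rte_bsf32() replaced by the ctz builtin). */
static uint32_t pack_into_mask(uint32_t val, uint32_t mask)
{
	unsigned int shl = (unsigned int)__builtin_ctz(mask);

	return (val << shl) & mask;
}

int main(void)
{
	uint32_t regc0_mask = 0x00ffff00u;	/* example: bits 8..23 usable */

	printf("0x%08x\n", pack_into_mask(0xabcd, regc0_mask)); /* 0x00abcd00 */
	return 0;
}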
+ * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] conf + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_set_meta + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_attr *attr, + const struct rte_flow_action_set_meta *conf, + struct rte_flow_error *error) +{ + uint32_t data = conf->data; + uint32_t mask = conf->mask; + struct rte_flow_item item = { + .spec = &data, + .mask = &mask, + }; + struct field_modify_info reg_c_x[] = { + [1] = {0, 0, 0}, + }; + int reg = flow_dv_get_metadata_reg(dev, attr, error); + + if (reg < 0) + return reg; + /* + * In datapath code there is no endianness + * coversions for perfromance reasons, all + * pattern conversions are done in rte_flow. + */ + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0; + + MLX5_ASSERT(msk_c0); +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + shl_c0 = rte_bsf32(msk_c0); +#else + shl_c0 = sizeof(msk_c0) * CHAR_BIT - rte_fls_u32(msk_c0); +#endif + mask <<= shl_c0; + data <<= shl_c0; + MLX5_ASSERT(!(~msk_c0 & rte_cpu_to_be_32(mask))); + } + reg_c_x[0] = (struct field_modify_info){4, 0, reg_to_field[reg]}; + /* The routine expects parameters in memory as big-endian ones. */ + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv4 DSCP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_action_modify_ipv4_dscp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_dscp *conf = + (const struct rte_flow_action_set_dscp *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV4 }; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv4 ipv4_mask; + + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + ipv4.hdr.type_of_service = conf->dscp; + ipv4_mask.hdr.type_of_service = RTE_IPV4_HDR_DSCP_MASK >> 2; + item.spec = &ipv4; + item.mask = &ipv4_mask; + return flow_dv_convert_modify_action(&item, modify_ipv4, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Convert modify-header set IPv6 DSCP action to DV specification. + * + * @param[in,out] resource + * Pointer to the modify-header resource. + * @param[in] action + * Pointer to action specification. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_convert_action_modify_ipv6_dscp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_dscp *conf = + (const struct rte_flow_action_set_dscp *)(action->conf); + struct rte_flow_item item = { .type = RTE_FLOW_ITEM_TYPE_IPV6 }; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_ipv6 ipv6_mask; + + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + /* + * Even though the DSCP bits offset of IPv6 is not byte aligned, + * rdma-core only accept the DSCP bits byte aligned start from + * bit 0 to 5 as to be compatible with IPv4. No need to shift the + * bits in IPv6 case as rdma-core requires byte aligned value. + */ + ipv6.hdr.vtc_flow = conf->dscp; + ipv6_mask.hdr.vtc_flow = RTE_IPV6_HDR_DSCP_MASK >> 22; + item.spec = &ipv6; + item.mask = &ipv6_mask; + return flow_dv_convert_modify_action(&item, modify_ipv6, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +} + +/** + * Validate MARK item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_mark(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr __rte_unused, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_item_mark *spec = item->spec; + const struct rte_flow_item_mark *mask = item->mask; + const struct rte_flow_item_mark nic_mask = { + .id = priv->sh->dv_mark_mask, + }; + int ret; + + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata feature" + " isn't enabled"); + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't supported"); + if (!nic_mask.id) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (spec->id >= (MLX5_FLOW_MARK_MAX & nic_mask.id)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &spec->id, + "mark id exceeds the limit"); + if (!mask) + mask = &nic_mask; + if (!mask->id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_mark), + error); + if (ret < 0) + return ret; + return 0; +} + +/** + * Validate META item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
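The DSCP converters above rely on the fact that shifting the DPDK header masks right by 2 (IPv4 ToS) and by 22 (IPv6 traffic class) both leave the six DSCP bits aligned at bit 0, the layout rdma-core expects for the modify-header action. A quick standalone check of those shifts; the literal mask values are copied from rte_ip.h as I read them, so treat them as an assumption rather than normative.

#include <stdio.h>

/* Local copies of the rte_ip.h masks (assumed values). */
#define IPV4_HDR_DSCP_MASK 0xfcu		/* DSCP in the top 6 bits of ToS */
#define IPV6_HDR_DSCP_MASK 0x0fc00000u	/* DSCP in the top 6 bits of TC  */

int main(void)
{
	/* Both shifts realign the 6 DSCP bits to bit 0. */
	printf("ipv4: 0x%02x\n", IPV4_HDR_DSCP_MASK >> 2);	/* 0x3f */
	printf("ipv6: 0x%02x\n", IPV6_HDR_DSCP_MASK >> 22);	/* 0x3f */
	return 0;
}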
+ */ +static int +flow_dv_validate_item_meta(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_item_meta *spec = item->spec; + const struct rte_flow_item_meta *mask = item->mask; + struct rte_flow_item_meta nic_mask = { + .data = UINT32_MAX + }; + int reg; + int ret; + + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (config->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extended metadata register" + " isn't supported"); + reg = flow_dv_get_metadata_reg(dev, attr, error); + if (reg < 0) + return reg; + if (reg == REG_B) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "match on reg_b " + "isn't supported"); + if (reg != REG_A) + nic_mask.data = priv->sh->dv_meta_mask; + } + if (!mask) + mask = &rte_flow_item_meta_mask; + if (!mask->data) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_meta), + error); + return ret; +} + +/** + * Validate TAG item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_tag(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr __rte_unused, + struct rte_flow_error *error) +{ + const struct rte_flow_item_tag *spec = item->spec; + const struct rte_flow_item_tag *mask = item->mask; + const struct rte_flow_item_tag nic_mask = { + .data = RTE_BE32(UINT32_MAX), + .index = 0xff, + }; + int ret; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "extensive metadata register" + " isn't supported"); + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (!mask) + mask = &rte_flow_item_tag_mask; + if (!mask->data) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "mask cannot be zero"); + + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_tag), + error); + if (ret < 0) + return ret; + if (mask->index != 0xff) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, + "partial mask for tag index" + " is not supported"); + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, spec->index, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret != REG_NONE); + return 0; +} + +/** + * Validate vport item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_port_id(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr, + uint64_t item_flags, + struct rte_flow_error *error) +{ + const struct rte_flow_item_port_id *spec = item->spec; + const struct rte_flow_item_port_id *mask = item->mask; + const struct rte_flow_item_port_id switch_mask = { + .id = 0xffffffff, + }; + struct mlx5_priv *esw_priv; + struct mlx5_priv *dev_priv; + int ret; + + if (!attr->transfer) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "match on port id is valid only" + " when transfer flag is enabled"); + if (item_flags & MLX5_FLOW_ITEM_PORT_ID) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple source ports are not" + " supported"); + if (!mask) + mask = &switch_mask; + if (mask->id != 0xffffffff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask, + "no support for partial mask on" + " \"id\" field"); + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_port_id_mask, + sizeof(struct rte_flow_item_port_id), + error); + if (ret) + return ret; + if (!spec) + return 0; + esw_priv = mlx5_port_to_eswitch_info(spec->id, false); + if (!esw_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec, + "failed to obtain E-Switch info for" + " port"); + dev_priv = mlx5_dev_to_eswitch_info(dev); + if (!dev_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "failed to obtain E-Switch info"); + if (esw_priv->domain_id != dev_priv->domain_id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, spec, + "cannot match on a port from a" + " different E-Switch"); + return 0; +} + +/* + * GTP flags are contained in 1 byte of the format: + * ------------------------------------------- + * | bit | 0 - 2 | 3 | 4 | 5 | 6 | 7 | + * |-----------------------------------------| + * | value | Version | PT | Res | E | S | PN | + * ------------------------------------------- + * + * Matching is supported only for GTP flags E, S, PN. + */ +#define MLX5_GTP_FLAGS_MASK 0x07 + +/** + * Validate VLAN item. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] dev + * Ethernet device flow is being created on. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_vlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vlan *mask = item->mask; + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(UINT16_MAX), + .inner_type = RTE_BE16(UINT16_MAX), + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret; + const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + MLX5_FLOW_LAYER_INNER_L4) : + (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4); + const uint64_t vlanm = tunnel ? 
MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + + if (item_flags & vlanm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple VLAN layers not supported"); + else if ((item_flags & l34m) != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "VLAN cannot follow L3/L4 layer"); + if (!mask) + mask = &rte_flow_item_vlan_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), + error); + if (ret) + return ret; + if (!tunnel && mask->tci != RTE_BE16(0x0fff)) { + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->vmwa_context) { + /* + * Non-NULL context means we have a virtual machine + * and SR-IOV enabled, we have to create VLAN interface + * to make hypervisor to setup E-Switch vport + * context correctly. We avoid creating the multiple + * VLAN interfaces, so we cannot support VLAN tag mask. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VLAN tag mask is not" + " supported in virtual" + " environment"); + } + } + return 0; +} + +/** + * Validate GTP item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_gtp(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_gtp *spec = item->spec; + const struct rte_flow_item_gtp *mask = item->mask; + const struct rte_flow_item_gtp nic_mask = { + .v_pt_rsv_flags = MLX5_GTP_FLAGS_MASK, + .msg_type = 0xff, + .teid = RTE_BE32(0xffffffff), + }; + + if (!priv->config.hca_attr.tunnel_stateless_gtp) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "GTP support is not enabled"); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "multiple tunnel layers not" + " supported"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_gtp_mask; + if (spec && spec->v_pt_rsv_flags & ~MLX5_GTP_FLAGS_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "Match is supported for GTP" + " flags only"); + return mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_gtp), + error); +} + +/** + * Validate the pop VLAN action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the pop vlan action. + * @param[in] item_flags + * The items found in this flow rule. + * @param[in] attr + * Pointer to flow attributes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
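+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary) of an action list this check accepts on a transfer rule
+ * whose pattern matches an outer VLAN; the port_id fate action has to
+ * come after the pop:
+ *
+ * @code
+ * struct rte_flow_action_port_id pid = { .id = 1 };
+ * struct rte_flow_action actions[] = {
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_POP_VLAN },
+ *     { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &pid },
+ *     { .type = RTE_FLOW_ACTION_TYPE_END },
+ * };
+ * @endcode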
+ */
+static int
+flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev,
+ uint64_t action_flags,
+ const struct rte_flow_action *action,
+ uint64_t item_flags,
+ const struct rte_flow_attr *attr,
+ struct rte_flow_error *error)
+{
+ const struct mlx5_priv *priv = dev->data->dev_private;
+
+ (void)action;
+ (void)attr;
+ if (!priv->sh->pop_vlan_action)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "pop vlan action is not supported");
+ if (attr->egress)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+ NULL,
+ "pop vlan action not supported for "
+ "egress");
+ if (action_flags & MLX5_FLOW_VLAN_ACTIONS)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "no support for multiple VLAN "
+ "actions");
+ if (!(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN))
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL,
+ "cannot pop vlan without a "
+ "match on (outer) vlan in the flow");
+ if (action_flags & MLX5_FLOW_ACTION_PORT_ID)
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "wrong action order, port_id should "
+ "be after pop VLAN action");
+ if (!attr->transfer && priv->representor)
+ return rte_flow_error_set(error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "pop vlan action for VF representor "
+ "not supported on NIC table");
+ return 0;
+}
+
+/**
+ * Get the default VLAN info from the VLAN match info.
+ *
+ * @param[in] items
+ * The list of item specifications.
+ * @param[out] vlan
+ * Pointer to the VLAN info to fill.
+ */
+static void
+flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items,
+ struct rte_vlan_hdr *vlan)
+{
+ const struct rte_flow_item_vlan nic_mask = {
+ .tci = RTE_BE16(MLX5DV_FLOW_VLAN_PCP_MASK |
+ MLX5DV_FLOW_VLAN_VID_MASK),
+ .inner_type = RTE_BE16(0xffff),
+ };
+
+ if (items == NULL)
+ return;
+ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+ int type = items->type;
+
+ if (type == RTE_FLOW_ITEM_TYPE_VLAN ||
+ type == MLX5_RTE_FLOW_ITEM_TYPE_VLAN)
+ break;
+ }
+ if (items->type != RTE_FLOW_ITEM_TYPE_END) {
+ const struct rte_flow_item_vlan *vlan_m = items->mask;
+ const struct rte_flow_item_vlan *vlan_v = items->spec;
+
+ /* If VLAN item in pattern doesn't contain data, return here. */
+ if (!vlan_v)
+ return;
+ if (!vlan_m)
+ vlan_m = &nic_mask;
+ /* Only full match values are accepted. */
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_PCP_MASK_BE);
+ }
+ if ((vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) ==
+ MLX5DV_FLOW_VLAN_VID_MASK_BE) {
+ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_VID_MASK;
+ vlan->vlan_tci |=
+ rte_be_to_cpu_16(vlan_v->tci &
+ MLX5DV_FLOW_VLAN_VID_MASK_BE);
+ }
+ if (vlan_m->inner_type == nic_mask.inner_type)
+ vlan->eth_proto = rte_be_to_cpu_16(vlan_v->inner_type &
+ vlan_m->inner_type);
+ }
+}
+
+/**
+ * Validate the push VLAN action.
+ *
+ * @param[in] dev
+ * Pointer to the rte_eth_dev structure.
+ * @param[in] action_flags
+ * Holds the actions detected until now.
+ * @param[in] vlan_m
+ * Pointer to the VLAN mask from the VLAN item of the flow pattern,
+ * can be NULL.
+ * @param[in] action
+ * Pointer to the action structure.
+ * @param[in] attr
+ * Pointer to flow attributes
+ * @param[out] error
+ * Pointer to error structure.
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_push_vlan(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_item_vlan *vlan_m, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_of_push_vlan *push_vlan = action->conf; + const struct mlx5_priv *priv = dev->data->dev_private; + + if (!attr->transfer && attr->ingress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "push VLAN action not supported for " + "ingress"); + if (push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_VLAN) && + push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_QINQ)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "invalid vlan ethertype"); + if (action_flags & MLX5_FLOW_VLAN_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no support for multiple VLAN " + "actions"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after push VLAN"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "push vlan action for VF representor " + "not supported on NIC table"); + if (vlan_m && + (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) && + (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) != + MLX5DV_FLOW_VLAN_PCP_MASK_BE && + !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) && + !(mlx5_flow_find_action + (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "not full match mask on VLAN PCP and " + "there is no of_set_vlan_pcp action, " + "push VLAN action cannot figure out " + "PCP value"); + if (vlan_m && + (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) && + (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) != + MLX5DV_FLOW_VLAN_VID_MASK_BE && + !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) && + !(mlx5_flow_find_action + (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "not full match mask on VLAN VID and " + "there is no of_set_vlan_vid action, " + "push VLAN action cannot figure out " + "VID value"); + (void)attr; + return 0; +} + +/** + * Validate the set VLAN PCP. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
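+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary): set VLAN PCP is only accepted when it follows a push VLAN
+ * action in the same action list and appears before any port_id action:
+ *
+ * @code
+ * struct rte_flow_action_of_push_vlan push = {
+ *     .ethertype = RTE_BE16(RTE_ETHER_TYPE_VLAN),
+ * };
+ * struct rte_flow_action_of_set_vlan_pcp pcp = { .vlan_pcp = 3 };
+ * struct rte_flow_action actions[] = {
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN, .conf = &push },
+ *     { .type = RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP, .conf = &pcp },
+ *     { .type = RTE_FLOW_ACTION_TYPE_END },
+ * };
+ * @endcode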
+ */ +static int +flow_dv_validate_action_set_vlan_pcp(uint64_t action_flags, + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct rte_flow_action *action = actions; + const struct rte_flow_action_of_set_vlan_pcp *conf = action->conf; + + if (conf->vlan_pcp > 7) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "VLAN PCP value is too big"); + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set VLAN PCP action must follow " + "the push VLAN action"); + if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Multiple VLAN PCP modification are " + "not supported"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after set VLAN PCP"); + return 0; +} + +/** + * Validate the set VLAN VID. + * + * @param[in] item_flags + * Holds the items detected in this rule. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_set_vlan_vid(uint64_t item_flags, + uint64_t action_flags, + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct rte_flow_action *action = actions; + const struct rte_flow_action_of_set_vlan_vid *conf = action->conf; + + if (rte_be_to_cpu_16(conf->vlan_vid) > 0xFFE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "VLAN VID value is too big"); + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && + !(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set VLAN VID action must follow push" + " VLAN action or match on VLAN item"); + if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Multiple VLAN VID modifications are " + "not supported"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after set VLAN VID"); + return 0; +} + +/* + * Validate the FLAG action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_flag(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + int ret; + + /* Fall back if no extended metadata register support. */ + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return mlx5_flow_validate_action_flag(action_flags, attr, + error); + /* Extensive metadata mode requires registers. 
*/ + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "no metadata registers " + "to support flag action"); + if (!(priv->sh->dv_mark_mask & MLX5_FLOW_MARK_DEFAULT)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret > 0); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't mark and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 flag" + " actions in same flow"); + return 0; +} + +/** + * Validate MARK action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to action. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_mark(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + const struct rte_flow_action_mark *mark = action->conf; + int ret; + + /* Fall back if no extended metadata register support. */ + if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY) + return mlx5_flow_validate_action_mark(action, action_flags, + attr, error); + /* Extensive metadata mode requires registers. */ + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "no metadata registers " + "to support mark action"); + if (!priv->sh->dv_mark_mask) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "extended metadata register" + " isn't available"); + ret = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, error); + if (ret < 0) + return ret; + MLX5_ASSERT(ret > 0); + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (mark->id >= (MLX5_FLOW_MARK_MAX & priv->sh->dv_mark_mask)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id exceeds the limit"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't flag and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 mark actions in same" + " flow"); + return 0; +} + +/** + * Validate SET_META action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
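+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary): the configuration must carry a non-zero mask that fits the
+ * metadata register selected for the flow:
+ *
+ * @code
+ * struct rte_flow_action_set_meta meta = {
+ *     .data = 0xcafe,
+ *     .mask = 0xffff,
+ * };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_SET_META,
+ *     .conf = &meta,
+ * };
+ * @endcode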
+ */ +static int +flow_dv_validate_action_set_meta(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_meta *conf; + uint32_t nic_mask = UINT32_MAX; + int reg; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "extended metadata register" + " isn't supported"); + reg = flow_dv_get_metadata_reg(dev, attr, error); + if (reg < 0) + return reg; + if (reg != REG_A && reg != REG_B) { + struct mlx5_priv *priv = dev->data->dev_private; + + nic_mask = priv->sh->dv_meta_mask; + } + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + conf = (const struct rte_flow_action_set_meta *)action->conf; + if (!conf->mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "zero mask doesn't have any effect"); + if (conf->mask & ~nic_mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "meta data must be within reg C0"); + return 0; +} + +/** + * Validate SET_TAG action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action + * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_set_tag(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_set_tag *conf; + const uint64_t terminal_action_flags = + MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | + MLX5_FLOW_ACTION_RSS; + int ret; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "extensive metadata register" + " isn't supported"); + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + conf = (const struct rte_flow_action_set_tag *)action->conf; + if (!conf->mask) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "zero mask doesn't have any effect"); + ret = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, conf->index, error); + if (ret < 0) + return ret; + if (!attr->transfer && attr->ingress && + (action_flags & terminal_action_flags)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "set_tag has no effect" + " with terminal actions"); + return 0; +} + +/** + * Validate count action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
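+ *
+ * A minimal illustrative sketch (not part of the driver, values are
+ * arbitrary) of the action this check guards; it is only accepted when
+ * DevX flow counters are available:
+ *
+ * @code
+ * struct rte_flow_action_count cnt = { .shared = 0, .id = 0 };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_COUNT,
+ *     .conf = &cnt,
+ * };
+ * @endcode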
+ */ +static int +flow_dv_validate_action_count(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!priv->config.devx) + goto notsup_err; +#ifdef HAVE_IBV_FLOW_DEVX_COUNTERS + return 0; +#endif +notsup_err: + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "count action not supported"); +} + +/** + * Validate the L2 encap action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the action structure. + * @param[in] attr + * Pointer to flow attributes. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_l2_encap(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single encap action " + "in a flow"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "encap action for VF representor " + "not supported on NIC table"); + return 0; +} + +/** + * Validate a decap action. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_decap(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + + if (action_flags & MLX5_FLOW_XCAP_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + action_flags & + MLX5_FLOW_ACTION_DECAP ? "can only " + "have a single decap action" : "decap " + "after encap is not supported"); + if (action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have decap action after" + " modify action"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, + "decap action not supported for " + "egress"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "decap action for VF representor " + "not supported on NIC table"); + return 0; +} + +const struct rte_flow_action_raw_decap empty_decap = {.data = NULL, .size = 0,}; + +/** + * Validate the raw encap and decap actions. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] decap + * Pointer to the decap action. + * @param[in] encap + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes + * @param[in/out] action_flags + * Holds the actions detected until now. + * @param[out] actions_n + * pointer to the number of actions counter. 
+ * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_raw_encap_decap + (struct rte_eth_dev *dev, + const struct rte_flow_action_raw_decap *decap, + const struct rte_flow_action_raw_encap *encap, + const struct rte_flow_attr *attr, uint64_t *action_flags, + int *actions_n, struct rte_flow_error *error) +{ + const struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + if (encap && (!encap->size || !encap->data)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "raw encap data cannot be empty"); + if (decap && encap) { + if (decap->size <= MLX5_ENCAPSULATION_DECISION_SIZE && + encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) + /* L3 encap. */ + decap = NULL; + else if (encap->size <= + MLX5_ENCAPSULATION_DECISION_SIZE && + decap->size > + MLX5_ENCAPSULATION_DECISION_SIZE) + /* L3 decap. */ + encap = NULL; + else if (encap->size > + MLX5_ENCAPSULATION_DECISION_SIZE && + decap->size > + MLX5_ENCAPSULATION_DECISION_SIZE) + /* 2 L2 actions: encap and decap. */ + ; + else + return rte_flow_error_set(error, + ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "unsupported too small " + "raw decap and too small raw " + "encap combination"); + } + if (decap) { + ret = flow_dv_validate_action_decap(dev, *action_flags, attr, + error); + if (ret < 0) + return ret; + *action_flags |= MLX5_FLOW_ACTION_DECAP; + ++(*actions_n); + } + if (encap) { + if (encap->size <= MLX5_ENCAPSULATION_DECISION_SIZE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "small raw encap size"); + if (*action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "more than one encap action"); + if (!attr->transfer && priv->representor) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "encap action for VF representor " + "not supported on NIC table"); + *action_flags |= MLX5_FLOW_ACTION_ENCAP; + ++(*actions_n); + } + return 0; +} + +/** + * Find existing encap/decap resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] resource + * Pointer to encap/decap resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_encap_decap_resource_register + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_encap_decap_resource *resource, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_encap_decap_resource *cache_resource; + struct mlx5dv_dr_domain *domain; + uint32_t idx = 0; + + resource->flags = dev_flow->dv.group ? 0 : 1; + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + domain = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX) + domain = sh->rx_domain; + else + domain = sh->tx_domain; + /* Lookup a matching resource from cache. 
*/ + ILIST_FOREACH(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], sh->encaps_decaps, idx, + cache_resource, next) { + if (resource->reformat_type == cache_resource->reformat_type && + resource->ft_type == cache_resource->ft_type && + resource->flags == cache_resource->flags && + resource->size == cache_resource->size && + !memcmp((const void *)resource->buf, + (const void *)cache_resource->buf, + resource->size)) { + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_encap_decap = idx; + dev_flow->dv.encap_decap = cache_resource; + return 0; + } + } + /* Register new encap/decap resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + &dev_flow->handle->dvh.rix_encap_decap); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_packet_reformat + (sh->ctx, cache_resource->reformat_type, + cache_resource->ft_type, domain, cache_resource->flags, + cache_resource->size, + (cache_resource->size ? cache_resource->buf : NULL)); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + ILIST_INSERT(sh->ipool[MLX5_IPOOL_DECAP_ENCAP], &sh->encaps_decaps, + dev_flow->handle->dvh.rix_encap_decap, cache_resource, + next); + dev_flow->dv.encap_decap = cache_resource; + DRV_LOG(DEBUG, "new encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Find existing table jump resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] tbl + * Pointer to flow table resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_jump_tbl_resource_register + (struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow_tbl_resource *tbl, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_tbl_data_entry *tbl_data = + container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + int cnt; + + MLX5_ASSERT(tbl); + cnt = rte_atomic32_read(&tbl_data->jump.refcnt); + if (!cnt) { + tbl_data->jump.action = + mlx5_glue->dr_create_flow_action_dest_flow_tbl + (tbl->obj); + if (!tbl_data->jump.action) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create jump action"); + DRV_LOG(DEBUG, "new jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); + } else { + /* old jump should not make the table ref++. */ + flow_dv_tbl_resource_release(dev, &tbl_data->tbl); + MLX5_ASSERT(tbl_data->jump.action); + DRV_LOG(DEBUG, "existed jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); + } + rte_atomic32_inc(&tbl_data->jump.refcnt); + dev_flow->handle->rix_jump = tbl_data->idx; + dev_flow->dv.jump = &tbl_data->jump; + return 0; +} + +/** + * Find existing table port ID resource or create and register a new one. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. 
+ * @param[in, out] resource
+ * Pointer to port ID action resource.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_port_id_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_port_id_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_port_id_action_resource *cache_resource;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache. */
+ ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PORT_ID], sh->port_id_action_list,
+ idx, cache_resource, next) {
+ if (resource->port_id == cache_resource->port_id) {
+ DRV_LOG(DEBUG, "port id action resource %p: "
+ "refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ rte_atomic32_inc(&cache_resource->refcnt);
+ dev_flow->handle->rix_port_id_action = idx;
+ dev_flow->dv.port_id_action = cache_resource;
+ return 0;
+ }
+ }
+ /* Register new port id action resource. */
+ cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PORT_ID],
+ &dev_flow->handle->rix_port_id_action);
+ if (!cache_resource)
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot allocate resource memory");
+ *cache_resource = *resource;
+ /*
+ * Depending on rdma_core version the glue routine calls
+ * either mlx5dv_dr_action_create_dest_ib_port(domain, ibv_port)
+ * or mlx5dv_dr_action_create_dest_vport(domain, vport_id).
+ */
+ cache_resource->action =
+ mlx5_glue->dr_create_flow_action_dest_port
+ (priv->sh->fdb_domain, resource->port_id);
+ if (!cache_resource->action) {
+ rte_free(cache_resource);
+ return rte_flow_error_set(error, ENOMEM,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "cannot create action");
+ }
+ rte_atomic32_init(&cache_resource->refcnt);
+ rte_atomic32_inc(&cache_resource->refcnt);
+ ILIST_INSERT(sh->ipool[MLX5_IPOOL_PORT_ID], &sh->port_id_action_list,
+ dev_flow->handle->rix_port_id_action, cache_resource,
+ next);
+ dev_flow->dv.port_id_action = cache_resource;
+ DRV_LOG(DEBUG, "new port id action resource %p: refcnt %d++",
+ (void *)cache_resource,
+ rte_atomic32_read(&cache_resource->refcnt));
+ return 0;
+}
+
+/**
+ * Find existing push VLAN resource or create and register a new one.
+ *
+ * @param[in, out] dev
+ * Pointer to rte_eth_dev structure.
+ * @param[in, out] resource
+ * Pointer to push VLAN action resource.
+ * @param[in, out] dev_flow
+ * Pointer to the dev_flow.
+ * @param[out] error
+ * pointer to error structure.
+ *
+ * @return
+ * 0 on success otherwise -errno and errno is set.
+ */
+static int
+flow_dv_push_vlan_action_resource_register
+ (struct rte_eth_dev *dev,
+ struct mlx5_flow_dv_push_vlan_action_resource *resource,
+ struct mlx5_flow *dev_flow,
+ struct rte_flow_error *error)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_flow_dv_push_vlan_action_resource *cache_resource;
+ struct mlx5dv_dr_domain *domain;
+ uint32_t idx = 0;
+
+ /* Lookup a matching resource from cache.
*/ + ILIST_FOREACH(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + sh->push_vlan_action_list, idx, cache_resource, next) { + if (resource->vlan_tag == cache_resource->vlan_tag && + resource->ft_type == cache_resource->ft_type) { + DRV_LOG(DEBUG, "push-VLAN action resource resource %p: " + "refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_push_vlan = idx; + dev_flow->dv.push_vlan_res = cache_resource; + return 0; + } + } + /* Register new push_vlan action resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &dev_flow->handle->dvh.rix_push_vlan); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + domain = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_RX) + domain = sh->rx_domain; + else + domain = sh->tx_domain; + cache_resource->action = + mlx5_glue->dr_create_flow_action_push_vlan(domain, + resource->vlan_tag); + if (!cache_resource->action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + ILIST_INSERT(sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &sh->push_vlan_action_list, + dev_flow->handle->dvh.rix_push_vlan, + cache_resource, next); + dev_flow->dv.push_vlan_res = cache_resource; + DRV_LOG(DEBUG, "new push vlan action resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} +/** + * Get the size of specific rte_flow_item_type + * + * @param[in] item_type + * Tested rte_flow_item_type. + * + * @return + * sizeof struct item_type, 0 if void or irrelevant. + */ +static size_t +flow_dv_get_item_len(const enum rte_flow_item_type item_type) +{ + size_t retval; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_ETH: + retval = sizeof(struct rte_flow_item_eth); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + retval = sizeof(struct rte_flow_item_vlan); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + retval = sizeof(struct rte_flow_item_ipv4); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + retval = sizeof(struct rte_flow_item_ipv6); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + retval = sizeof(struct rte_flow_item_udp); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + retval = sizeof(struct rte_flow_item_tcp); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + retval = sizeof(struct rte_flow_item_vxlan); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + retval = sizeof(struct rte_flow_item_gre); + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + retval = sizeof(struct rte_flow_item_nvgre); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + retval = sizeof(struct rte_flow_item_vxlan_gpe); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + retval = sizeof(struct rte_flow_item_mpls); + break; + case RTE_FLOW_ITEM_TYPE_VOID: /* Fall through. 
*/ + default: + retval = 0; + break; + } + return retval; +} + +#define MLX5_ENCAP_IPV4_VERSION 0x40 +#define MLX5_ENCAP_IPV4_IHL_MIN 0x05 +#define MLX5_ENCAP_IPV4_TTL_DEF 0x40 +#define MLX5_ENCAP_IPV6_VTC_FLOW 0x60000000 +#define MLX5_ENCAP_IPV6_HOP_LIMIT 0xff +#define MLX5_ENCAP_VXLAN_FLAGS 0x08000000 +#define MLX5_ENCAP_VXLAN_GPE_FLAGS 0x04 + +/** + * Convert the encap action data from list of rte_flow_item to raw buffer + * + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[out] buf + * Pointer to the output buffer. + * @param[out] size + * Pointer to the output buffer size. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_encap_data(const struct rte_flow_item *items, uint8_t *buf, + size_t *size, struct rte_flow_error *error) +{ + struct rte_ether_hdr *eth = NULL; + struct rte_vlan_hdr *vlan = NULL; + struct rte_ipv4_hdr *ipv4 = NULL; + struct rte_ipv6_hdr *ipv6 = NULL; + struct rte_udp_hdr *udp = NULL; + struct rte_vxlan_hdr *vxlan = NULL; + struct rte_vxlan_gpe_hdr *vxlan_gpe = NULL; + struct rte_gre_hdr *gre = NULL; + size_t len; + size_t temp_size = 0; + + if (!items) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "invalid empty data"); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + len = flow_dv_get_item_len(items->type); + if (len + temp_size > MLX5_ENCAP_MAX_LEN) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "items total size is too big" + " for encap action"); + rte_memcpy((void *)&buf[temp_size], items->spec, len); + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_ETH: + eth = (struct rte_ether_hdr *)&buf[temp_size]; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + vlan = (struct rte_vlan_hdr *)&buf[temp_size]; + if (!eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "eth header not found"); + if (!eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ipv4 = (struct rte_ipv4_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV4); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4); + if (!ipv4->version_ihl) + ipv4->version_ihl = MLX5_ENCAP_IPV4_VERSION | + MLX5_ENCAP_IPV4_IHL_MIN; + if (!ipv4->time_to_live) + ipv4->time_to_live = MLX5_ENCAP_IPV4_TTL_DEF; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ipv6 = (struct rte_ipv6_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(RTE_ETHER_TYPE_IPV6); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6); + if (!ipv6->vtc_flow) + ipv6->vtc_flow = + RTE_BE32(MLX5_ENCAP_IPV6_VTC_FLOW); + if (!ipv6->hop_limits) + ipv6->hop_limits = MLX5_ENCAP_IPV6_HOP_LIMIT; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + udp = (struct rte_udp_hdr *)&buf[temp_size]; + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) 
+ ipv4->next_proto_id = IPPROTO_UDP; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + vxlan = (struct rte_vxlan_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!udp->dst_port) + udp->dst_port = RTE_BE16(MLX5_UDP_PORT_VXLAN); + if (!vxlan->vx_flags) + vxlan->vx_flags = + RTE_BE32(MLX5_ENCAP_VXLAN_FLAGS); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + vxlan_gpe = (struct rte_vxlan_gpe_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!vxlan_gpe->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!udp->dst_port) + udp->dst_port = + RTE_BE16(MLX5_UDP_PORT_VXLAN_GPE); + if (!vxlan_gpe->vx_flags) + vxlan_gpe->vx_flags = + MLX5_ENCAP_VXLAN_GPE_FLAGS; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + gre = (struct rte_gre_hdr *)&buf[temp_size]; + if (!gre->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) + ipv4->next_proto_id = IPPROTO_GRE; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VOID: + break; + default: + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "unsupported item type"); + break; + } + temp_size += len; + } + *size = temp_size; + return 0; +} + +static int +flow_dv_zero_encap_udp_csum(void *data, struct rte_flow_error *error) +{ + struct rte_ether_hdr *eth = NULL; + struct rte_vlan_hdr *vlan = NULL; + struct rte_ipv6_hdr *ipv6 = NULL; + struct rte_udp_hdr *udp = NULL; + char *next_hdr; + uint16_t proto; + + eth = (struct rte_ether_hdr *)data; + next_hdr = (char *)(eth + 1); + proto = RTE_BE16(eth->ether_type); + + /* VLAN skipping */ + while (proto == RTE_ETHER_TYPE_VLAN || proto == RTE_ETHER_TYPE_QINQ) { + vlan = (struct rte_vlan_hdr *)next_hdr; + proto = RTE_BE16(vlan->eth_proto); + next_hdr += sizeof(struct rte_vlan_hdr); + } + + /* HW calculates IPv4 csum. no need to proceed */ + if (proto == RTE_ETHER_TYPE_IPV4) + return 0; + + /* non IPv4/IPv6 header. not supported */ + if (proto != RTE_ETHER_TYPE_IPV6) { + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "Cannot offload non IPv4/IPv6"); + } + + ipv6 = (struct rte_ipv6_hdr *)next_hdr; + + /* ignore non UDP */ + if (ipv6->proto != IPPROTO_UDP) + return 0; + + udp = (struct rte_udp_hdr *)(ipv6 + 1); + udp->dgram_cksum = 0; + + return 0; +} + +/** + * Convert L2 encap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] transfer + * Mark if the flow is E-Switch flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
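+ *
+ * A minimal illustrative sketch (not part of the driver, addresses and
+ * VNI are arbitrary) of the item list a VXLAN_ENCAP action hands to this
+ * path; header fields left at zero are filled with defaults by
+ * flow_dv_convert_encap_data():
+ *
+ * @code
+ * struct rte_flow_item_eth eth = {
+ *     .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
+ * };
+ * struct rte_flow_item_ipv4 ipv4 = {
+ *     .hdr.src_addr = RTE_BE32(RTE_IPV4(192, 168, 0, 1)),
+ *     .hdr.dst_addr = RTE_BE32(RTE_IPV4(192, 168, 0, 2)),
+ * };
+ * struct rte_flow_item_udp udp = { .hdr.dst_port = RTE_BE16(4789) };
+ * struct rte_flow_item_vxlan vxlan = { .vni = { 0x12, 0x34, 0x56 } };
+ * struct rte_flow_item definition[] = {
+ *     { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
+ *     { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
+ *     { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
+ *     { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
+ *     { .type = RTE_FLOW_ITEM_TYPE_END },
+ * };
+ * struct rte_flow_action_vxlan_encap conf = { .definition = definition };
+ * @endcode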
+ */ +static int +flow_dv_create_action_l2_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + uint8_t transfer, + struct rte_flow_error *error) +{ + const struct rte_flow_item *encap_data; + const struct rte_flow_action_raw_encap *raw_encap_data; + struct mlx5_flow_dv_encap_decap_resource res = { + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL, + .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB : + MLX5DV_FLOW_TABLE_TYPE_NIC_TX, + }; + + if (action->type == RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + raw_encap_data = + (const struct rte_flow_action_raw_encap *)action->conf; + res.size = raw_encap_data->size; + memcpy(res.buf, raw_encap_data->data, res.size); + } else { + if (action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP) + encap_data = + ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + else + encap_data = + ((const struct rte_flow_action_nvgre_encap *) + action->conf)->definition; + if (flow_dv_convert_encap_data(encap_data, res.buf, + &res.size, error)) + return -rte_errno; + } + if (flow_dv_zero_encap_udp_csum(res.buf, error)) + return -rte_errno; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 encap action"); + return 0; +} + +/** + * Convert L2 decap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] transfer + * Mark if the flow is E-Switch flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_l2_decap(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + uint8_t transfer, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_encap_decap_resource res = { + .size = 0, + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2, + .ft_type = transfer ? MLX5DV_FLOW_TABLE_TYPE_FDB : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX, + }; + + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 decap action"); + return 0; +} + +/** + * Convert raw decap/encap (L3 tunnel) action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_raw_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_raw_encap *encap_data; + struct mlx5_flow_dv_encap_decap_resource res; + + memset(&res, 0, sizeof(res)); + encap_data = (const struct rte_flow_action_raw_encap *)action->conf; + res.size = encap_data->size; + memcpy(res.buf, encap_data->data, res.size); + res.reformat_type = res.size < MLX5_ENCAPSULATION_DECISION_SIZE ? 
+ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 : + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + if (attr->transfer) + res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + else + res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create encap action"); + return 0; +} + +/** + * Create action push VLAN. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] vlan + * Pointer to the vlan to push to the Ethernet header. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_push_vlan(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_vlan_hdr *vlan, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_push_vlan_action_resource res; + + memset(&res, 0, sizeof(res)); + res.vlan_tag = + rte_cpu_to_be_32(((uint32_t)vlan->eth_proto) << 16 | + vlan->vlan_tci); + if (attr->transfer) + res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + else + res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + return flow_dv_push_vlan_action_resource_register + (dev, &res, dev_flow, error); +} + +/** + * Validate the modify-header actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_hdr(const uint64_t action_flags, + const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + if (action->type != RTE_FLOW_ACTION_TYPE_DEC_TTL && !action->conf) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "action configuration not set"); + if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have encap action before" + " modify action"); + return 0; +} + +/** + * Validate the modify-header MAC address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_mac(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L2)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no L2 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv4 address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. 
+ * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv4(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv4 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv6 address actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv6(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv6 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header TP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no transport layer " + "in pattern"); + } + return ret; +} + +/** + * Validate the modify-header actions of increment/decrement + * TCP Sequence-number. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tcp_seq(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? 
+ MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" + " pattern"); + if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ && + (action_flags & MLX5_FLOW_ACTION_DEC_TCP_SEQ)) || + (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ && + (action_flags & MLX5_FLOW_ACTION_INC_TCP_SEQ))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot decrease and increase" + " TCP sequence number" + " at the same time"); + } + return ret; +} + +/** + * Validate the modify-header actions of increment/decrement + * TCP Acknowledgment number. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_tcp_ack(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" + " pattern"); + if ((action->type == RTE_FLOW_ACTION_TYPE_INC_TCP_ACK && + (action_flags & MLX5_FLOW_ACTION_DEC_TCP_ACK)) || + (action->type == RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK && + (action_flags & MLX5_FLOW_ACTION_INC_TCP_ACK))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot decrease and increase" + " TCP acknowledgment number" + " at the same time"); + } + return ret; +} + +/** + * Validate the modify-header TTL actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ttl(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? + MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no IP protocol in pattern"); + } + return ret; +} + +/** + * Validate jump action. + * + * @param[in] action + * Pointer to the jump action. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * Action belongs to flow rule created by request external to PMD. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
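+ *
+ * A minimal illustrative sketch (not part of the driver, group numbers
+ * are arbitrary): the jump target group has to differ from the group the
+ * rule itself is created in:
+ *
+ * @code
+ * struct rte_flow_attr attr = { .group = 0, .ingress = 1 };
+ * struct rte_flow_action_jump jump = { .group = 1 };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_JUMP,
+ *     .conf = &jump,
+ * };
+ * @endcode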
+ */ +static int +flow_dv_validate_action_jump(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attributes, + bool external, struct rte_flow_error *error) +{ + uint32_t target_group, table; + int ret = 0; + + if (action_flags & (MLX5_FLOW_FATE_ACTIONS | + MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (action_flags & MLX5_FLOW_ACTION_METER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "jump with meter not support"); + if (!action->conf) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "action configuration not set"); + target_group = + ((const struct rte_flow_action_jump *)action->conf)->group; + ret = mlx5_flow_group_to_table(attributes, external, target_group, + true, &table, error); + if (ret) + return ret; + if (attributes->group == target_group) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "target group must be other than" + " the current flow group"); + return 0; +} + +/* + * Validate the port_id action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] action + * Port_id RTE action structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_port_id(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_port_id *port_id; + struct mlx5_priv *act_priv; + struct mlx5_priv *dev_priv; + uint16_t port; + + if (!attr->transfer) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "port id action is valid in transfer" + " mode only"); + if (!action || !action->conf) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, + "port id action parameters must be" + " specified"); + if (action_flags & (MLX5_FLOW_FATE_ACTIONS | + MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can have only one fate actions in" + " a flow"); + dev_priv = mlx5_dev_to_eswitch_info(dev); + if (!dev_priv) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "failed to obtain E-Switch info"); + port_id = action->conf; + port = port_id->original ? dev->data->port_id : port_id->id; + act_priv = mlx5_port_to_eswitch_info(port, false); + if (!act_priv) + return rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, port_id, + "failed to obtain E-Switch port id for port"); + if (act_priv->domain_id != dev_priv->domain_id) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "port does not belong to" + " E-Switch being configured"); + return 0; +} + +/** + * Get the maximum number of modify header actions. + * + * @param dev + * Pointer to rte_eth_dev structure. + * @param flags + * Flags bits to check if root level. + * + * @return + * Max number of modify header actions device can support. 
+ */ +static inline unsigned int +flow_dv_modify_hdr_action_max(struct rte_eth_dev *dev __rte_unused, + uint64_t flags) +{ + /* + * There's no way to directly query the max capacity from FW. + * The maximal value on root table should be assumed to be supported. + */ + if (!(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) + return MLX5_MAX_MODIFY_NUM; + else + return MLX5_ROOT_TBL_MODIFY_NUM; +} + +/** + * Validate the meter action. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] action + * Pointer to the meter action. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +mlx5_flow_validate_action_meter(struct rte_eth_dev *dev, + uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_meter *am = action->conf; + struct mlx5_flow_meter *fm; + + if (!am) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter action conf is NULL"); + + if (action_flags & MLX5_FLOW_ACTION_METER) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter chaining not support"); + if (action_flags & MLX5_FLOW_ACTION_JUMP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "meter with jump not support"); + if (!priv->mtr_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "meter action not supported"); + fm = mlx5_flow_meter_find(priv, am->mtr_id); + if (!fm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Meter not found"); + if (fm->ref_cnt && (!(fm->transfer == attr->transfer || + (!fm->ingress && !attr->ingress && attr->egress) || + (!fm->egress && !attr->egress && attr->ingress)))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Flow attributes are either invalid " + "or have a conflict with current " + "meter attributes"); + return 0; +} + +/** + * Validate the age action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the age action. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_validate_action_age(uint64_t action_flags, + const struct rte_flow_action *action, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_age *age = action->conf; + + if (!priv->config.devx || priv->counter_fallback) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "age action not supported"); + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (age->timeout >= UINT16_MAX / 2 / 10) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Max age time: 3275 seconds"); + if (action_flags & MLX5_FLOW_ACTION_AGE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Duplicate age ctions set"); + return 0; +} + +/** + * Validate the modify-header IPv4 DSCP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv4_dscp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv4 item in pattern"); + } + return ret; +} + +/** + * Validate the modify-header IPv6 DSCP actions. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the modify action. + * @param[in] item_flags + * Holds the items detected. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_modify_ipv6_dscp(const uint64_t action_flags, + const struct rte_flow_action *action, + const uint64_t item_flags, + struct rte_flow_error *error) +{ + int ret = 0; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { + if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV6)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no ipv6 item in pattern"); + } + return ret; +} + +/** + * Find existing modify-header resource or create and register a new one. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] resource + * Pointer to modify-header resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_modify_hdr_resource_register + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_modify_hdr_resource *resource, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_modify_hdr_resource *cache_resource; + struct mlx5dv_dr_domain *ns; + uint32_t actions_len; + + resource->flags = dev_flow->dv.group ? 
0 : + MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; + if (resource->actions_num > flow_dv_modify_hdr_action_max(dev, + resource->flags)) + return rte_flow_error_set(error, EOVERFLOW, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many modify header items"); + if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_FDB) + ns = sh->fdb_domain; + else if (resource->ft_type == MLX5DV_FLOW_TABLE_TYPE_NIC_TX) + ns = sh->tx_domain; + else + ns = sh->rx_domain; + /* Lookup a matching resource from cache. */ + actions_len = resource->actions_num * sizeof(resource->actions[0]); + LIST_FOREACH(cache_resource, &sh->modify_cmds, next) { + if (resource->ft_type == cache_resource->ft_type && + resource->actions_num == cache_resource->actions_num && + resource->flags == cache_resource->flags && + !memcmp((const void *)resource->actions, + (const void *)cache_resource->actions, + actions_len)) { + DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.modify_hdr = cache_resource; + return 0; + } + } + /* Register new modify-header resource. */ + cache_resource = rte_calloc(__func__, 1, + sizeof(*cache_resource) + actions_len, 0); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + rte_memcpy(cache_resource->actions, resource->actions, actions_len); + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_modify_header + (sh->ctx, cache_resource->ft_type, ns, + cache_resource->flags, actions_len, + (uint64_t *)cache_resource->actions); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + LIST_INSERT_HEAD(&sh->modify_cmds, cache_resource, next); + dev_flow->handle->dvh.modify_hdr = cache_resource; + DRV_LOG(DEBUG, "new modify-header resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Get DV flow counter by index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] idx + * mlx5 flow counter index in the container. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * Pointer to the counter, NULL otherwise. + */ +static struct mlx5_flow_counter * +flow_dv_counter_get_by_idx(struct rte_eth_dev *dev, + uint32_t idx, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + uint32_t batch = 0, age = 0; + + idx--; + age = MLX_CNT_IS_AGE(idx); + idx = age ? idx - MLX5_CNT_AGE_OFFSET : idx; + if (idx >= MLX5_CNT_BATCH_OFFSET) { + idx -= MLX5_CNT_BATCH_OFFSET; + batch = 1; + } + cont = MLX5_CNT_CONTAINER(priv->sh, batch, age); + MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n); + pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL]; + MLX5_ASSERT(pool); + if (ppool) + *ppool = pool; + return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL); +} + +/** + * Get a pool by devx counter ID. + * + * @param[in] cont + * Pointer to the counter container. + * @param[in] id + * The counter devx ID. 
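/*
 * Editor's note (illustrative sketch, not driver code): the modify-header
 * resource registration above reuses a cached resource when the table type,
 * action count, flags and the raw action bytes all match, and only bumps a
 * reference count on a hit. The stand-alone C below shows that
 * lookup-or-insert pattern with <sys/queue.h>; names such as "res_cache"
 * and "res_cache_register" are invented for illustration.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

struct res_cache {
	LIST_ENTRY(res_cache) next;
	uint32_t refcnt;
	size_t len;
	uint8_t blob[];          /* opaque payload, compared bytewise */
};

static LIST_HEAD(, res_cache) res_list = LIST_HEAD_INITIALIZER(res_list);

/* Return a matching entry with an extra reference, or insert a new one. */
static struct res_cache *
res_cache_register(const uint8_t *blob, size_t len)
{
	struct res_cache *rc;

	LIST_FOREACH(rc, &res_list, next) {
		if (rc->len == len && !memcmp(rc->blob, blob, len)) {
			rc->refcnt++;      /* cache hit: share the entry */
			return rc;
		}
	}
	rc = calloc(1, sizeof(*rc) + len);
	if (rc == NULL)
		return NULL;
	rc->refcnt = 1;
	rc->len = len;
	memcpy(rc->blob, blob, len);
	LIST_INSERT_HEAD(&res_list, rc, next);
	return rc;
}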
+ * + * @return + * The counter pool pointer if exists, NULL otherwise, + */ +static struct mlx5_flow_counter_pool * +flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id) +{ + uint32_t i; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + + for (i = 0; i < n_valid; i++) { + struct mlx5_flow_counter_pool *pool = cont->pools[i]; + int base = (pool->min_dcs->id / MLX5_COUNTERS_PER_POOL) * + MLX5_COUNTERS_PER_POOL; + + if (id >= base && id < base + MLX5_COUNTERS_PER_POOL) { + /* + * Move the pool to the head, as counter allocate + * always gets the first pool in the container. + */ + if (pool != TAILQ_FIRST(&cont->pool_list)) { + TAILQ_REMOVE(&cont->pool_list, pool, next); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + } + return pool; + } + } + return NULL; +} + +/** + * Allocate a new memory for the counter values wrapped by all the needed + * management. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] raws_n + * The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters. + * + * @return + * The new memory management pointer on success, otherwise NULL and rte_errno + * is set. + */ +static struct mlx5_counter_stats_mem_mng * +flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_devx_mkey_attr mkey_attr; + struct mlx5_counter_stats_mem_mng *mem_mng; + volatile struct flow_counter_stats *raw_data; + int size = (sizeof(struct flow_counter_stats) * + MLX5_COUNTERS_PER_POOL + + sizeof(struct mlx5_counter_stats_raw)) * raws_n + + sizeof(struct mlx5_counter_stats_mem_mng); + uint8_t *mem = rte_calloc(__func__, 1, size, sysconf(_SC_PAGESIZE)); + int i; + + if (!mem) { + rte_errno = ENOMEM; + return NULL; + } + mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1; + size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n; + mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size, + IBV_ACCESS_LOCAL_WRITE); + if (!mem_mng->umem) { + rte_errno = errno; + rte_free(mem); + return NULL; + } + mkey_attr.addr = (uintptr_t)mem; + mkey_attr.size = size; + mkey_attr.umem_id = mem_mng->umem->umem_id; + mkey_attr.pd = sh->pdn; + mkey_attr.log_entity_size = 0; + mkey_attr.pg_access = 0; + mkey_attr.klm_array = NULL; + mkey_attr.klm_num = 0; + if (priv->config.hca_attr.relaxed_ordering_write && + priv->config.hca_attr.relaxed_ordering_read && + !haswell_broadwell_cpu) + mkey_attr.relaxed_ordering = 1; + mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr); + if (!mem_mng->dm) { + mlx5_glue->devx_umem_dereg(mem_mng->umem); + rte_errno = errno; + rte_free(mem); + return NULL; + } + mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size); + raw_data = (volatile struct flow_counter_stats *)mem; + for (i = 0; i < raws_n; ++i) { + mem_mng->raws[i].mem_mng = mem_mng; + mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL; + } + LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next); + return mem_mng; +} + +/** + * Resize a counter container. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for Aging counter. + * + * @return + * 0 on success, otherwise negative errno value and rte_errno is set. 
+ */ +static int +flow_dv_container_resize(struct rte_eth_dev *dev, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + struct mlx5_counter_stats_mem_mng *mem_mng = NULL; + void *old_pools = cont->pools; + uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE; + uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize; + void *pools = rte_calloc(__func__, 1, mem_size, 0); + + if (!pools) { + rte_errno = ENOMEM; + return -ENOMEM; + } + if (old_pools) + memcpy(pools, old_pools, cont->n * + sizeof(struct mlx5_flow_counter_pool *)); + /* + * Fallback mode query the counter directly, no background query + * resources are needed. + */ + if (!priv->counter_fallback) { + int i; + + mem_mng = flow_dv_create_counter_stat_mem_mng(dev, + MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES); + if (!mem_mng) { + rte_free(pools); + return -ENOMEM; + } + for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i) + LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws, + mem_mng->raws + + MLX5_CNT_CONTAINER_RESIZE + + i, next); + } + rte_spinlock_lock(&cont->resize_sl); + cont->n = resize; + cont->mem_mng = mem_mng; + cont->pools = pools; + rte_spinlock_unlock(&cont->resize_sl); + if (old_pools) + rte_free(old_pools); + return 0; +} + +/** + * Query a devx flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] cnt + * Index to the flow counter. + * @param[out] pkts + * The statistics value of packets. + * @param[out] bytes + * The statistics value of bytes. + * + * @return + * 0 on success, otherwise a negative errno value and rte_errno is set. + */ +static inline int +_flow_dv_query_count(struct rte_eth_dev *dev, uint32_t counter, uint64_t *pkts, + uint64_t *bytes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + int offset; + + cnt = flow_dv_counter_get_by_idx(dev, counter, &pool); + MLX5_ASSERT(pool); + if (counter < MLX5_CNT_BATCH_OFFSET) { + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (priv->counter_fallback) + return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0, + 0, pkts, bytes, 0, NULL, NULL, 0); + } + + rte_spinlock_lock(&pool->sl); + /* + * The single counters allocation may allocate smaller ID than the + * current allocated in parallel to the host reading. + * In this case the new counter values must be reported as 0. + */ + if (unlikely(cnt_ext && cnt_ext->dcs->id < pool->raw->min_dcs_id)) { + *pkts = 0; + *bytes = 0; + } else { + offset = MLX5_CNT_ARRAY_IDX(pool, cnt); + *pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits); + *bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes); + } + rte_spinlock_unlock(&pool->sl); + return 0; +} + +/** + * Create and initialize a new counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] dcs + * The devX counter handle. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for counter that was allocated for aging. + * @param[in/out] cont_cur + * Pointer to the container pointer, it will be update in pool resize. + * + * @return + * The pool container pointer on success, NULL otherwise and rte_errno is set. 
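/*
 * Editor's note (illustrative sketch, not driver code): flow_dv_container_resize()
 * above grows the array of pool pointers by a fixed step, copies the existing
 * pointers into the larger array and publishes it before freeing the old one
 * (in the driver the swap happens under a spinlock so concurrent readers see
 * a consistent pair). The minimal, single-threaded version of that
 * grow-and-swap pattern below uses a hypothetical "pool_container" type.
 */
#include <stdlib.h>
#include <string.h>

#define RESIZE_STEP 8

struct pool_container {
	unsigned int n;      /* current capacity, in pool slots */
	void **pools;        /* array of pool pointers */
};

static int
container_resize(struct pool_container *cont)
{
	unsigned int new_n = cont->n + RESIZE_STEP;
	void **pools = calloc(new_n, sizeof(*pools));

	if (pools == NULL)
		return -1;
	if (cont->pools != NULL)
		memcpy(pools, cont->pools, cont->n * sizeof(*pools));
	/* Publish the larger array, then release the old one. */
	free(cont->pools);
	cont->pools = pools;
	cont->n = new_n;
	return 0;
}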
+ */ +static struct mlx5_flow_counter_pool * +flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + int16_t n_valid = rte_atomic16_read(&cont->n_valid); + uint32_t size = sizeof(*pool); + + if (cont->n == n_valid && flow_dv_container_resize(dev, batch, age)) + return NULL; + size += MLX5_COUNTERS_PER_POOL * CNT_SIZE; + size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE); + size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE); + pool = rte_calloc(__func__, 1, size, 0); + if (!pool) { + rte_errno = ENOMEM; + return NULL; + } + pool->min_dcs = dcs; + if (!priv->counter_fallback) + pool->raw = cont->mem_mng->raws + n_valid % + MLX5_CNT_CONTAINER_RESIZE; + pool->raw_hw = NULL; + pool->type = 0; + pool->type |= (batch ? 0 : CNT_POOL_TYPE_EXT); + pool->type |= (!age ? 0 : CNT_POOL_TYPE_AGE); + rte_spinlock_init(&pool->sl); + /* + * The generation of the new allocated counters in this pool is 0, 2 in + * the pool generation makes all the counters valid for allocation. + * The start and end query generation protect the counters be released + * between the query and update gap period will not be reallocated + * without the last query finished and stats updated to the memory. + */ + rte_atomic64_set(&pool->start_query_gen, 0x2); + /* + * There's no background query thread for fallback mode, set the + * end_query_gen to the maximum value since no need to wait for + * statistics update. + */ + rte_atomic64_set(&pool->end_query_gen, priv->counter_fallback ? + INT64_MAX : 0x2); + TAILQ_INIT(&pool->counters); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + pool->index = n_valid; + cont->pools[n_valid] = pool; + /* Pool initialization must be updated before host thread access. */ + rte_cio_wmb(); + rte_atomic16_add(&cont->n_valid, 1); + return pool; +} + +/** + * Update the minimum dcs-id for aged or no-aged counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] pool + * Current counter pool. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the counter is for aging. + */ +static void +flow_dv_counter_update_min_dcs(struct rte_eth_dev *dev, + struct mlx5_flow_counter_pool *pool, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *other; + struct mlx5_pools_container *cont; + + cont = MLX5_CNT_CONTAINER(priv->sh, batch, (age ^ 0x1)); + other = flow_dv_find_pool_by_id(cont, pool->min_dcs->id); + if (!other) + return; + if (pool->min_dcs->id < other->min_dcs->id) { + rte_atomic64_set(&other->a64_dcs, + rte_atomic64_read(&pool->a64_dcs)); + } else { + rte_atomic64_set(&pool->a64_dcs, + rte_atomic64_read(&other->a64_dcs)); + } +} +/** + * Prepare a new counter and/or a new counter pool. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[out] cnt_free + * Where to put the pointer of a new counter. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. + * @param[in] age + * Whether the pool is for counter that was allocated for aging. + * + * @return + * The counter pool pointer and @p cnt_free is set on success, + * NULL otherwise and rte_errno is set. 
+ */ +static struct mlx5_flow_counter_pool * +flow_dv_counter_pool_prepare(struct rte_eth_dev *dev, + struct mlx5_flow_counter **cnt_free, + uint32_t batch, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + struct mlx5_devx_obj *dcs = NULL; + struct mlx5_flow_counter *cnt; + uint32_t i; + + cont = MLX5_CNT_CONTAINER(priv->sh, batch, age); + if (!batch) { + /* bulk_bitmap must be 0 for single counter allocation. */ + dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0); + if (!dcs) + return NULL; + pool = flow_dv_find_pool_by_id(cont, dcs->id); + if (!pool) { + pool = flow_dv_pool_create(dev, dcs, batch, age); + if (!pool) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } + } else if (dcs->id < pool->min_dcs->id) { + rte_atomic64_set(&pool->a64_dcs, + (int64_t)(uintptr_t)dcs); + } + flow_dv_counter_update_min_dcs(dev, + pool, batch, age); + i = dcs->id % MLX5_COUNTERS_PER_POOL; + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs; + *cnt_free = cnt; + return pool; + } + /* bulk_bitmap is in 128 counters units. */ + if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) + dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4); + if (!dcs) { + rte_errno = ENODATA; + return NULL; + } + pool = flow_dv_pool_create(dev, dcs, batch, age); + if (!pool) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } + *cnt_free = MLX5_POOL_GET_CNT(pool, 0); + return pool; +} + +/** + * Search for existed shared counter. + * + * @param[in] cont + * Pointer to the relevant counter pool container. + * @param[in] id + * The shared counter ID to search. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * NULL if not existed, otherwise pointer to the shared extend counter. + */ +static struct mlx5_flow_counter_ext * +flow_dv_counter_shared_search(struct mlx5_pools_container *cont, uint32_t id, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_flow_counter_ext *cnt; + struct mlx5_flow_counter_pool *pool; + uint32_t i, j; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + + for (i = 0; i < n_valid; i++) { + pool = cont->pools[i]; + for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) { + cnt = MLX5_GET_POOL_CNT_EXT(pool, j); + if (cnt->ref_cnt && cnt->shared && cnt->id == id) { + if (ppool) + *ppool = cont->pools[i]; + return cnt; + } + } + } + return NULL; +} + +/** + * Allocate a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. + * @param[in] group + * Counter flow group. + * @param[in] age + * Whether the counter was allocated for aging. + * + * @return + * Index to flow counter on success, 0 otherwise and rte_errno is set. 
+ */ +static uint32_t +flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id, + uint16_t group, uint32_t age) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt_free = NULL; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + /* + * Currently group 0 flow counter cannot be assigned to a flow if it is + * not the first one in the batch counter allocation, so it is better + * to allocate counters one by one for these flows in a separate + * container. + * A counter can be shared between different groups so need to take + * shared counters from the single container. + */ + uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch, + age); + uint32_t cnt_idx; + + if (!priv->config.devx) { + rte_errno = ENOTSUP; + return 0; + } + if (shared) { + cnt_ext = flow_dv_counter_shared_search(cont, id, &pool); + if (cnt_ext) { + if (cnt_ext->ref_cnt + 1 == 0) { + rte_errno = E2BIG; + return 0; + } + cnt_ext->ref_cnt++; + cnt_idx = pool->index * MLX5_COUNTERS_PER_POOL + + (cnt_ext->dcs->id % MLX5_COUNTERS_PER_POOL) + + 1; + return cnt_idx; + } + } + /* Pools which has a free counters are in the start. */ + TAILQ_FOREACH(pool, &cont->pool_list, next) { + /* + * The free counter reset values must be updated between the + * counter release to the counter allocation, so, at least one + * query must be done in this time. ensure it by saving the + * query generation in the release time. + * The free list is sorted according to the generation - so if + * the first one is not updated, all the others are not + * updated too. + */ + cnt_free = TAILQ_FIRST(&pool->counters); + if (cnt_free && cnt_free->query_gen < + rte_atomic64_read(&pool->end_query_gen)) + break; + cnt_free = NULL; + } + if (!cnt_free) { + pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch, age); + if (!pool) + return 0; + } + if (!batch) + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free); + /* Create a DV counter action only in the first time usage. */ + if (!cnt_free->action) { + uint16_t offset; + struct mlx5_devx_obj *dcs; + + if (batch) { + offset = MLX5_CNT_ARRAY_IDX(pool, cnt_free); + dcs = pool->min_dcs; + } else { + offset = 0; + dcs = cnt_ext->dcs; + } + cnt_free->action = mlx5_glue->dv_create_flow_action_counter + (dcs->obj, offset); + if (!cnt_free->action) { + rte_errno = errno; + return 0; + } + } + cnt_idx = MLX5_MAKE_CNT_IDX(pool->index, + MLX5_CNT_ARRAY_IDX(pool, cnt_free)); + cnt_idx += batch * MLX5_CNT_BATCH_OFFSET; + cnt_idx += age * MLX5_CNT_AGE_OFFSET; + /* Update the counter reset values. */ + if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits, + &cnt_free->bytes)) + return 0; + if (cnt_ext) { + cnt_ext->shared = shared; + cnt_ext->ref_cnt = 1; + cnt_ext->id = id; + } + if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on) + /* Start the asynchronous batch query by the host thread. */ + mlx5_set_query_alarm(priv->sh); + TAILQ_REMOVE(&pool->counters, cnt_free, next); + if (TAILQ_EMPTY(&pool->counters)) { + /* Move the pool to the end of the container pool list. */ + TAILQ_REMOVE(&cont->pool_list, pool, next); + TAILQ_INSERT_TAIL(&cont->pool_list, pool, next); + } + return cnt_idx; +} + +/** + * Get age param from counter index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + * + * @return + * The aging parameter specified for the counter index. 
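/*
 * Editor's note (illustrative sketch, not driver code): the allocation path
 * above packs everything needed to locate a counter into one uint32_t index:
 * pool index, slot within the pool, a +1 bias so that 0 means "no counter",
 * and large fixed offsets that mark batch-allocated and aging counters. The
 * toy encode/decode pair below uses made-up SLOTS_PER_POOL/BATCH_OFFSET/
 * AGE_OFFSET values; the real MLX5_CNT_* constants differ.
 */
#include <stdint.h>

#define SLOTS_PER_POOL 512u          /* counters per pool (illustrative) */
#define BATCH_OFFSET   0x00800000u   /* marks batch-allocated counters */
#define AGE_OFFSET     0x80000000u   /* marks counters used for aging */

static uint32_t
cnt_index_encode(uint32_t pool, uint32_t slot, int batch, int age)
{
	uint32_t idx = pool * SLOTS_PER_POOL + slot + 1; /* 0 = invalid */

	if (batch)
		idx += BATCH_OFFSET;
	if (age)
		idx += AGE_OFFSET;
	return idx;
}

static void
cnt_index_decode(uint32_t idx, uint32_t *pool, uint32_t *slot,
		 int *batch, int *age)
{
	idx -= 1;
	*age = (idx & AGE_OFFSET) != 0;
	idx &= ~AGE_OFFSET;
	*batch = idx >= BATCH_OFFSET;
	if (*batch)
		idx -= BATCH_OFFSET;
	*pool = idx / SLOTS_PER_POOL;
	*slot = idx % SLOTS_PER_POOL;
}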
+ */ +static struct mlx5_age_param* +flow_dv_counter_idx_get_age(struct rte_eth_dev *dev, + uint32_t counter) +{ + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_pool *pool = NULL; + + flow_dv_counter_get_by_idx(dev, counter, &pool); + counter = (counter - 1) % MLX5_COUNTERS_PER_POOL; + cnt = MLX5_POOL_GET_CNT(pool, counter); + return MLX5_CNT_TO_AGE(cnt); +} + +/** + * Remove a flow counter from aged counter list. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + * @param[in] cnt + * Pointer to the counter handler. + */ +static void +flow_dv_counter_remove_from_age(struct rte_eth_dev *dev, + uint32_t counter, struct mlx5_flow_counter *cnt) +{ + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_priv *priv = dev->data->dev_private; + + age_info = GET_PORT_AGE_INFO(priv); + age_param = flow_dv_counter_idx_get_age(dev, counter); + if (rte_atomic16_cmpset((volatile uint16_t *) + &age_param->state, + AGE_CANDIDATE, AGE_FREE) + != AGE_CANDIDATE) { + /** + * We need the lock even it is age timeout, + * since counter may still in process. + */ + rte_spinlock_lock(&age_info->aged_sl); + TAILQ_REMOVE(&age_info->aged_counters, cnt, next); + rte_spinlock_unlock(&age_info->aged_sl); + } + rte_atomic16_set(&age_param->state, AGE_FREE); +} +/** + * Release a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + */ +static void +flow_dv_counter_release(struct rte_eth_dev *dev, uint32_t counter) +{ + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + + if (!counter) + return; + cnt = flow_dv_counter_get_by_idx(dev, counter, &pool); + MLX5_ASSERT(pool); + if (counter < MLX5_CNT_BATCH_OFFSET) { + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (cnt_ext && --cnt_ext->ref_cnt) + return; + } + if (IS_AGE_POOL(pool)) + flow_dv_counter_remove_from_age(dev, counter, cnt); + /* Put the counter in the end - the last updated one. */ + TAILQ_INSERT_TAIL(&pool->counters, cnt, next); + /* + * Counters released between query trigger and handler need + * to wait the next round of query. Since the packets arrive + * in the gap period will not be taken into account to the + * old counter. + */ + cnt->query_gen = rte_atomic64_read(&pool->start_query_gen); +} + +/** + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to dev struct. + * @param[in] attributes + * Pointer to flow attributes + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[out] error + * Pointer to error structure. + * + * @return + * - 0 on success and non root table. + * - 1 on success and root table. + * - a negative errno value otherwise and rte_errno is set. 
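/*
 * Editor's note (illustrative sketch, not driver code): the age handling
 * above drives a small free/candidate/aged state machine with compare-and-
 * swap so that the release path and the background aging scan never both act
 * on the same counter. A minimal C11 analogue of that hand-off, with
 * invented state names ("aged_entry", "age_release"):
 */
#include <stdatomic.h>
#include <stdbool.h>

enum age_state { AGE_FREE_S, AGE_CANDIDATE_S, AGE_AGED_S };

struct aged_entry {
	_Atomic int state;
};

/*
 * Returns true when the caller must also unlink the entry from the aged
 * list (the CAS lost because the aging scan already moved it to AGED).
 */
static bool
age_release(struct aged_entry *e)
{
	int expected = AGE_CANDIDATE_S;

	if (!atomic_compare_exchange_strong(&e->state, &expected,
					    AGE_FREE_S)) {
		/* Entry already aged out: remove it from the aged list
		 * (under the list lock in the driver), then mark it free. */
		atomic_store(&e->state, AGE_FREE_S);
		return true;
	}
	return false;
}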
+ */ +static int +flow_dv_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + bool external __rte_unused, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + int ret = 0; + +#ifndef HAVE_MLX5DV_DR + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "groups are not supported"); +#else + uint32_t table = 0; + + ret = mlx5_flow_group_to_table(attributes, external, + attributes->group, !!priv->fdb_def_rule, + &table, error); + if (ret) + return ret; + if (!table) + ret = MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; +#endif + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, + "priority out of range"); + if (attributes->transfer) { + if (!priv->config.dv_esw_en) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "E-Switch dr is not supported"); + if (!(priv->representor || priv->master)) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "E-Switch configuration can only be" + " done by a master or a representor device"); + if (attributes->egress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, attributes, + "egress is not supported"); + } + if (!(attributes->egress ^ attributes->ingress)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR, NULL, + "must specify exactly one of " + "ingress or egress"); + return ret; +} + +/** + * Internal validation function. For validating both actions and items. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, int hairpin, struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint8_t next_protocol = 0xff; + uint16_t ether_type = 0; + int actions_n = 0; + uint8_t item_ipv6_proto = 0; + const struct rte_flow_item *gre_item = NULL; + const struct rte_flow_action_raw_decap *decap; + const struct rte_flow_action_raw_encap *encap; + const struct rte_flow_action_rss *rss; + const struct rte_flow_item_tcp nic_tcp_mask = { + .hdr = { + .tcp_flags = 0xFF, + .src_port = RTE_BE16(UINT16_MAX), + .dst_port = RTE_BE16(UINT16_MAX), + } + }; + const struct rte_flow_item_ipv4 nic_ipv4_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + .time_to_live = 0xff, + }, + }; + const struct rte_flow_item_ipv6 nic_ipv6_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *dev_conf = &priv->config; + uint16_t queue_index = 0xFFFF; + const struct rte_flow_item_vlan *vlan_m = NULL; + int16_t rw_act_num = 0; + uint64_t is_root; + + if (items == NULL) + return -1; + ret = flow_dv_validate_attributes(dev, attr, external, error); + if (ret < 0) + return ret; + is_root = (uint64_t)ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int type = items->type; + + switch (type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_PORT_ID: + ret = flow_dv_validate_item_port_id + (dev, items, attr, item_flags, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_PORT_ID; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_eth *) + items->spec)->type; + ether_type &= + ((const struct rte_flow_item_eth *) + items->mask)->type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = flow_dv_validate_item_vlan(items, item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_vlan *) + items->spec)->inner_type; + ether_type &= + ((const struct rte_flow_item_vlan *) + items->mask)->inner_type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + /* Store outer VLAN mask for of_push_vlan action. */ + if (!tunnel) + vlan_m = items->mask; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + last_item, + ether_type, + &nic_ipv4_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + last_item, + ether_type, + &nic_ipv6_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + item_ipv6_proto = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &nic_tcp_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + gre_item = items; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + ret = mlx5_flow_validate_item_nvgre(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_NVGRE; + break; + case RTE_FLOW_ITEM_TYPE_GRE_KEY: + ret = mlx5_flow_validate_item_gre_key + (items, item_flags, gre_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GRE_KEY; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, dev, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GENEVE: + ret = mlx5_flow_validate_item_geneve(items, + item_flags, dev, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(dev, items, + item_flags, + last_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + + case RTE_FLOW_ITEM_TYPE_MARK: + ret = flow_dv_validate_item_mark(dev, items, attr, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_MARK; + break; + case RTE_FLOW_ITEM_TYPE_META: + ret = flow_dv_validate_item_meta(dev, items, attr, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_METADATA; + break; + case RTE_FLOW_ITEM_TYPE_ICMP: + ret = mlx5_flow_validate_item_icmp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_ICMP; + 
break; + case RTE_FLOW_ITEM_TYPE_ICMP6: + ret = mlx5_flow_validate_item_icmp6(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + item_ipv6_proto = IPPROTO_ICMPV6; + last_item = MLX5_FLOW_LAYER_ICMP6; + break; + case RTE_FLOW_ITEM_TYPE_TAG: + ret = flow_dv_validate_item_tag(dev, items, + attr, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TAG: + case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE: + break; + case RTE_FLOW_ITEM_TYPE_GTP: + ret = flow_dv_validate_item_gtp(dev, items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GTP; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + item_flags |= last_item; + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + int type = actions->type; + if (actions_n == MLX5_DV_MAX_NUMBER_OF_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, "too many actions"); + switch (type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + ret = flow_dv_validate_action_port_id(dev, + action_flags, + actions, + attr, + error); + if (ret) + return ret; + action_flags |= MLX5_FLOW_ACTION_PORT_ID; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = flow_dv_validate_action_flag(dev, action_flags, + attr, error); + if (ret < 0) + return ret; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + /* Count all modify-header actions as one. */ + if (!(action_flags & + MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_FLAG | + MLX5_FLOW_ACTION_MARK_EXT; + } else { + action_flags |= MLX5_FLOW_ACTION_FLAG; + ++actions_n; + } + rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = flow_dv_validate_action_mark(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + /* Count all modify-header actions as one. */ + if (!(action_flags & + MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_MARK | + MLX5_FLOW_ACTION_MARK_EXT; + } else { + action_flags |= MLX5_FLOW_ACTION_MARK; + ++actions_n; + } + rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + ret = flow_dv_validate_action_set_meta(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_META; + rw_act_num += MLX5_ACT_NUM_SET_META; + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + ret = flow_dv_validate_action_set_tag(dev, actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. 
*/ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, error); + if (ret < 0) + return ret; + queue_index = ((const struct rte_flow_action_queue *) + (actions->conf))->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, item_flags, + error); + if (ret < 0) + return ret; + if (rss != NULL && rss->queue_num) + queue_index = rss->queue[0]; + action_flags |= MLX5_FLOW_ACTION_RSS; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_dv_validate_action_count(dev, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + if (flow_dv_validate_action_pop_vlan(dev, + action_flags, + actions, + item_flags, attr, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + ret = flow_dv_validate_action_push_vlan(dev, + action_flags, + vlan_m, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + ret = flow_dv_validate_action_set_vlan_pcp + (action_flags, actions, error); + if (ret < 0) + return ret; + /* Count PCP with push_vlan command. */ + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + ret = flow_dv_validate_action_set_vlan_vid + (item_flags, action_flags, + actions, error); + if (ret < 0) + return ret; + /* Count VID with push_vlan command. */ + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + rw_act_num += MLX5_ACT_NUM_MDF_VID; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + ret = flow_dv_validate_action_l2_encap(dev, + action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_ENCAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + ret = flow_dv_validate_action_decap(dev, action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DECAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + ret = flow_dv_validate_action_raw_encap_decap + (dev, NULL, actions->conf, attr, &action_flags, + &actions_n, error); + if (ret < 0) + return ret; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + decap = actions->conf; + while ((++actions)->type == RTE_FLOW_ACTION_TYPE_VOID) + ; + if (actions->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + encap = NULL; + actions--; + } else { + encap = actions->conf; + } + ret = flow_dv_validate_action_raw_encap_decap + (dev, + decap ? 
decap : &empty_decap, encap, + attr, &action_flags, &actions_n, + error); + if (ret < 0) + return ret; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + ret = flow_dv_validate_action_modify_mac(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + MLX5_FLOW_ACTION_SET_MAC_SRC : + MLX5_FLOW_ACTION_SET_MAC_DST; + /* + * Even if the source and destination MAC addresses have + * overlap in the header with 4B alignment, the convert + * function will handle them separately and 4 SW actions + * will be created. And 2 actions will be added each + * time no matter how many bytes of address will be set. + */ + rw_act_num += MLX5_ACT_NUM_MDF_MAC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + ret = flow_dv_validate_action_modify_ipv4(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + MLX5_FLOW_ACTION_SET_IPV4_SRC : + MLX5_FLOW_ACTION_SET_IPV4_DST; + rw_act_num += MLX5_ACT_NUM_MDF_IPV4; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + ret = flow_dv_validate_action_modify_ipv6(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + if (item_ipv6_proto == IPPROTO_ICMPV6) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "Can't change header " + "with ICMPv6 proto"); + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + MLX5_FLOW_ACTION_SET_IPV6_SRC : + MLX5_FLOW_ACTION_SET_IPV6_DST; + rw_act_num += MLX5_ACT_NUM_MDF_IPV6; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + ret = flow_dv_validate_action_modify_tp(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + MLX5_FLOW_ACTION_SET_TP_SRC : + MLX5_FLOW_ACTION_SET_TP_DST; + rw_act_num += MLX5_ACT_NUM_MDF_PORT; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + case RTE_FLOW_ACTION_TYPE_SET_TTL: + ret = flow_dv_validate_action_modify_ttl(action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TTL ? 
+ MLX5_FLOW_ACTION_SET_TTL : + MLX5_FLOW_ACTION_DEC_TTL; + rw_act_num += MLX5_ACT_NUM_MDF_TTL; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + ret = flow_dv_validate_action_jump(actions, + action_flags, + attr, external, + error); + if (ret) + return ret; + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_JUMP; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: + ret = flow_dv_validate_action_modify_tcp_seq + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? + MLX5_FLOW_ACTION_INC_TCP_SEQ : + MLX5_FLOW_ACTION_DEC_TCP_SEQ; + rw_act_num += MLX5_ACT_NUM_MDF_TCPSEQ; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: + ret = flow_dv_validate_action_modify_tcp_ack + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? + MLX5_FLOW_ACTION_INC_TCP_ACK : + MLX5_FLOW_ACTION_DEC_TCP_ACK; + rw_act_num += MLX5_ACT_NUM_MDF_TCPACK; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_MARK: + break; + case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_METER: + ret = mlx5_flow_validate_action_meter(dev, + action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_METER; + ++actions_n; + /* Meter action will add one more TAG action. */ + rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_AGE: + ret = flow_dv_validate_action_age(action_flags, + actions, dev, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_AGE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP: + ret = flow_dv_validate_action_modify_ipv4_dscp + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP; + rw_act_num += MLX5_ACT_NUM_SET_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: + ret = flow_dv_validate_action_modify_ipv6_dscp + (action_flags, + actions, + item_flags, + error); + if (ret < 0) + return ret; + /* Count all modify-header actions as one action. */ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP; + rw_act_num += MLX5_ACT_NUM_SET_DSCP; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + /* + * Validate the drop action mutual exclusion with other actions. + * Drop action is mutually-exclusive with any other action, except for + * Count action. 
+ */ + if ((action_flags & MLX5_FLOW_ACTION_DROP) && + (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Drop action is mutually-exclusive " + "with any other action, except for " + "Count action"); + /* Eswitch has few restrictions on using items and actions */ + if (attr->transfer) { + if (!mlx5_flow_ext_mreg_supported(dev) && + action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action FLAG"); + if (!mlx5_flow_ext_mreg_supported(dev) && + action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action MARK"); + if (action_flags & MLX5_FLOW_ACTION_QUEUE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action QUEUE"); + if (action_flags & MLX5_FLOW_ACTION_RSS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "unsupported action RSS"); + if (!(action_flags & MLX5_FLOW_FATE_ESWITCH_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no fate action is found"); + } else { + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS) && attr->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no fate action is found"); + } + /* Continue validation for Xcap actions.*/ + if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) && (queue_index == 0xFFFF || + mlx5_rxq_get_type(dev, queue_index) != MLX5_RXQ_TYPE_HAIRPIN)) { + if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) == + MLX5_FLOW_XCAP_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "encap and decap " + "combination aren't supported"); + if (!attr->transfer && attr->ingress && (action_flags & + MLX5_FLOW_ACTION_ENCAP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "encap is not supported" + " for ingress traffic"); + } + /* Hairpin flow will add one more TAG action. */ + if (hairpin > 0) + rw_act_num += MLX5_ACT_NUM_SET_TAG; + /* extra metadata enabled: one more TAG action will be add. */ + if (dev_conf->dv_flow_en && + dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && + mlx5_flow_ext_mreg_supported(dev)) + rw_act_num += MLX5_ACT_NUM_SET_TAG; + if ((uint32_t)rw_act_num > + flow_dv_modify_hdr_action_max(dev, is_root)) { + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "too many header modify" + " actions to support"); + } + return 0; +} + +/** + * Internal preparation function. Allocates the DV flow size, + * this size is constant. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, + * otherwise NULL and rte_errno is set. 
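/*
 * Editor's note (illustrative sketch, not driver code): flow_dv_validate()
 * above folds every parsed action into an action_flags bit mask and then
 * checks combinations on the accumulated mask, e.g. DROP may only coexist
 * with COUNT and at least one fate action must be present. The toy
 * validator below shows the same bit-mask style of mutual-exclusion
 * checking with invented ACT_* flag names.
 */
#include <stdint.h>

#define ACT_DROP  (1u << 0)
#define ACT_QUEUE (1u << 1)
#define ACT_COUNT (1u << 2)
#define ACT_MARK  (1u << 3)

static int
check_action_flags(uint32_t action_flags)
{
	/* DROP is mutually exclusive with everything except COUNT. */
	if ((action_flags & ACT_DROP) &&
	    (action_flags & ~(ACT_DROP | ACT_COUNT)))
		return -1;
	/* At least one "fate" action (drop or queue) must be present. */
	if (!(action_flags & (ACT_DROP | ACT_QUEUE)))
		return -1;
	return 0;
}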
+ */ +static struct mlx5_flow * +flow_dv_prepare(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + uint32_t handle_idx = 0; + struct mlx5_flow *dev_flow; + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + /* In case of corrupting the memory. */ + if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) { + rte_flow_error_set(error, ENOSPC, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not free temporary device flow"); + return NULL; + } + dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + &handle_idx); + if (!dev_handle) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow handle"); + return NULL; + } + /* No multi-thread supporting. */ + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++]; + dev_flow->handle = dev_handle; + dev_flow->handle_idx = handle_idx; + dev_flow->dv.value.size = MLX5_ST_SZ_BYTES(fte_match_param); + /* + * The matching value needs to be cleared to 0 before using. In the + * past, it will be automatically cleared when using rte_*alloc + * API. The time consumption will be almost the same as before. + */ + memset(dev_flow->dv.value.buf, 0, MLX5_ST_SZ_BYTES(fte_match_param)); + dev_flow->ingress = attr->ingress; + dev_flow->dv.transfer = attr->transfer; + return dev_flow; +} + +#ifdef RTE_LIBRTE_MLX5_DEBUG +/** + * Sanity check for match mask and value. Similar to check_valid_spec() in + * kernel driver. If unmasked bit is present in value, it returns failure. + * + * @param match_mask + * pointer to match mask buffer. + * @param match_value + * pointer to match value buffer. + * + * @return + * 0 if valid, -EINVAL otherwise. + */ +static int +flow_dv_check_valid_spec(void *match_mask, void *match_value) +{ + uint8_t *m = match_mask; + uint8_t *v = match_value; + unsigned int i; + + for (i = 0; i < MLX5_ST_SZ_BYTES(fte_match_param); ++i) { + if (v[i] & ~m[i]) { + DRV_LOG(ERR, + "match_value differs from match_criteria" + " %p[%u] != %p[%u]", + match_value, i, match_mask, i); + return -EINVAL; + } + } + return 0; +} +#endif + +/** + * Add match of ip_version. + * + * @param[in] group + * Flow group. + * @param[in] headers_v + * Values header pointer. + * @param[in] headers_m + * Masks header pointer. + * @param[in] ip_version + * The IP version to set. + */ +static inline void +flow_dv_set_match_ip_version(uint32_t group, + void *headers_v, + void *headers_m, + uint8_t ip_version) +{ + if (group == 0) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0xf); + else + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, + ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, 0); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, 0); +} + +/** + * Add Ethernet item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
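+ * @param[in] group
+ * The group to insert the rule.
+ *
+ * A minimal call sketch (illustrative only, the buffer names are
+ * hypothetical; the actual caller passes the matcher mask buffer and the
+ * flow value buffer):
+ *
+ *   flow_dv_translate_item_eth(match_mask_buf, match_value_buf,
+ *                              item, tunnel, attr->group);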
+ */
+static void
+flow_dv_translate_item_eth(void *matcher, void *key,
+ const struct rte_flow_item *item, int inner,
+ uint32_t group)
+{
+ const struct rte_flow_item_eth *eth_m = item->mask;
+ const struct rte_flow_item_eth *eth_v = item->spec;
+ const struct rte_flow_item_eth nic_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ .type = RTE_BE16(0xffff),
+ };
+ void *headers_m;
+ void *headers_v;
+ char *l24_v;
+ unsigned int i;
+
+ if (!eth_v)
+ return;
+ if (!eth_m)
+ eth_m = &nic_mask;
+ if (inner) {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ inner_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers);
+ } else {
+ headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
+ outer_headers);
+ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ }
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, dmac_47_16),
+ &eth_m->dst, sizeof(eth_m->dst));
+ /* The value must be in the range of the mask. */
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, dmac_47_16);
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->dst.addr_bytes[i] & eth_v->dst.addr_bytes[i];
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, smac_47_16),
+ &eth_m->src, sizeof(eth_m->src));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, smac_47_16);
+ /* The value must be in the range of the mask. */
+ for (i = 0; i < sizeof(eth_m->dst); ++i)
+ l24_v[i] = eth_m->src.addr_bytes[i] & eth_v->src.addr_bytes[i];
+ if (eth_v->type) {
+ /* When ethertype is present set mask for tagged VLAN. */
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1);
+ /* Set value for tagged VLAN if ethertype is 802.1Q. */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
+ eth_v->type == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag,
+ 1);
+ /* Return here to avoid setting match on ethertype. */
+ return;
+ }
+ }
+ /*
+ * HW supports match on one Ethertype, the Ethertype following the last
+ * VLAN tag of the packet (see PRM).
+ * Set match on ethertype only if ETH header is not followed by VLAN.
+ * HW is optimized for IPv4/IPv6. In such cases, avoid setting
+ * ethertype, and use ip_version field instead.
+ */
+ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV4) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 4);
+ } else if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_IPV6) &&
+ eth_m->type == 0xFFFF) {
+ flow_dv_set_match_ip_version(group, headers_v, headers_m, 6);
+ } else {
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype,
+ rte_be_to_cpu_16(eth_m->type));
+ l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+ ethertype);
+ *(uint16_t *)(l24_v) = eth_m->type & eth_v->type;
+ }
+}
+
+/**
+ * Add VLAN item to matcher and to the value.
+ *
+ * @param[in, out] dev_flow
+ * Flow descriptor.
+ * @param[in, out] matcher
+ * Flow matcher.
+ * @param[in, out] key
+ * Flow matcher value.
+ * @param[in] item
+ * Flow pattern to translate.
+ * @param[in] inner
+ * Item is inner pattern.
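+ * @param[in] group
+ * The group to insert the rule.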
+ */ +static void +flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow, + void *matcher, void *key, + const struct rte_flow_item *item, + int inner, uint32_t group) +{ + const struct rte_flow_item_vlan *vlan_m = item->mask; + const struct rte_flow_item_vlan *vlan_v = item->spec; + void *headers_m; + void *headers_v; + uint16_t tci_m; + uint16_t tci_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + /* + * This is workaround, masks are not supported, + * and pre-validated. + */ + if (vlan_v) + dev_flow->handle->vf_vlan.tag = + rte_be_to_cpu_16(vlan_v->tci) & 0x0fff; + } + /* + * When VLAN item exists in flow, mark packet as tagged, + * even if TCI is not specified. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); + if (!vlan_v) + return; + if (!vlan_m) + vlan_m = &rte_flow_item_vlan_mask; + tci_m = rte_be_to_cpu_16(vlan_m->tci); + tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_vid, tci_m); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, tci_v); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_cfi, tci_m >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_cfi, tci_v >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_prio, tci_m >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, tci_v >> 13); + /* + * HW is optimized for IPv4/IPv6. In such cases, avoid setting + * ethertype, and use ip_version field instead. + */ + if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV4) && + vlan_m->inner_type == 0xFFFF) { + flow_dv_set_match_ip_version(group, headers_v, headers_m, 4); + } else if (vlan_v->inner_type == RTE_BE16(RTE_ETHER_TYPE_IPV6) && + vlan_m->inner_type == 0xFFFF) { + flow_dv_set_match_ip_version(group, headers_v, headers_m, 6); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, + rte_be_to_cpu_16(vlan_m->inner_type)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + rte_be_to_cpu_16(vlan_m->inner_type & + vlan_v->inner_type)); + } +} + +/** + * Add IPV4 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group + * The group to insert the rule. 
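+ *
+ * Note that the value written to the key is always spec AND mask, so e.g. a
+ * spec dst_addr of 192.168.0.9 with mask 255.255.255.0 is programmed as
+ * 192.168.0.0, which keeps the value within the range of the mask.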
+ */ +static void +flow_dv_translate_item_ipv4(void *matcher, void *key, + const struct rte_flow_item *item, + const uint64_t item_flags, + int inner, uint32_t group) +{ + const struct rte_flow_item_ipv4 *ipv4_m = item->mask; + const struct rte_flow_item_ipv4 *ipv4_v = item->spec; + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + .time_to_live = 0xff, + }, + }; + void *headers_m; + void *headers_v; + char *l24_m; + char *l24_v; + uint8_t tos; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + flow_dv_set_match_ip_version(group, headers_v, headers_m, 4); + /* + * On outer header (which must contains L2), or inner header with L2, + * set cvlan_tag mask bit to mark this packet as untagged. + * This should be done even if item->spec is empty. + */ + if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv4_v) + return; + if (!ipv4_m) + ipv4_m = &nic_mask; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.dst_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.dst_addr & ipv4_v->hdr.dst_addr; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.src_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.src_addr & ipv4_v->hdr.src_addr; + tos = ipv4_m->hdr.type_of_service & ipv4_v->hdr.type_of_service; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, + ipv4_m->hdr.type_of_service); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, tos); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, + ipv4_m->hdr.type_of_service >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv4_m->hdr.next_proto_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv4_v->hdr.next_proto_id & ipv4_m->hdr.next_proto_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit, + ipv4_m->hdr.time_to_live); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit, + ipv4_v->hdr.time_to_live & ipv4_m->hdr.time_to_live); +} + +/** + * Add IPV6 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group + * The group to insert the rule. 
+ */ +static void +flow_dv_translate_item_ipv6(void *matcher, void *key, + const struct rte_flow_item *item, + const uint64_t item_flags, + int inner, uint32_t group) +{ + const struct rte_flow_item_ipv6 *ipv6_m = item->mask; + const struct rte_flow_item_ipv6 *ipv6_v = item->spec; + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *l24_m; + char *l24_v; + uint32_t vtc_m; + uint32_t vtc_v; + int i; + int size; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + flow_dv_set_match_ip_version(group, headers_v, headers_m, 6); + /* + * On outer header (which must contains L2), or inner header with L2, + * set cvlan_tag mask bit to mark this packet as untagged. + * This should be done even if item->spec is empty. + */ + if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv6_v) + return; + if (!ipv6_m) + ipv6_m = &nic_mask; + size = sizeof(ipv6_m->hdr.dst_addr); + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.dst_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.dst_addr[i]; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.src_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.src_addr[i]; + /* TOS. */ + vtc_m = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow); + vtc_v = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow & ipv6_v->hdr.vtc_flow); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, vtc_m >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, vtc_v >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, vtc_m >> 22); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, vtc_v >> 22); + /* Label. */ + if (inner) { + MLX5_SET(fte_match_set_misc, misc_m, inner_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, inner_ipv6_flow_label, + vtc_v); + } else { + MLX5_SET(fte_match_set_misc, misc_m, outer_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, outer_ipv6_flow_label, + vtc_v); + } + /* Protocol. */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv6_m->hdr.proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv6_v->hdr.proto & ipv6_m->hdr.proto); + /* Hop limit. */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ttl_hoplimit, + ipv6_m->hdr.hop_limits); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ttl_hoplimit, + ipv6_v->hdr.hop_limits & ipv6_m->hdr.hop_limits); +} + +/** + * Add TCP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. 
+ * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_tcp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_tcp *tcp_m = item->mask; + const struct rte_flow_item_tcp *tcp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_TCP); + if (!tcp_v) + return; + if (!tcp_m) + tcp_m = &rte_flow_item_tcp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_sport, + rte_be_to_cpu_16(tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + rte_be_to_cpu_16(tcp_v->hdr.src_port & tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_dport, + rte_be_to_cpu_16(tcp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + rte_be_to_cpu_16(tcp_v->hdr.dst_port & tcp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_flags, + tcp_m->hdr.tcp_flags); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags, + (tcp_v->hdr.tcp_flags & tcp_m->hdr.tcp_flags)); +} + +/** + * Add UDP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_udp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_udp *udp_m = item->mask; + const struct rte_flow_item_udp *udp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP); + if (!udp_v) + return; + if (!udp_m) + udp_m = &rte_flow_item_udp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_sport, + rte_be_to_cpu_16(udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + rte_be_to_cpu_16(udp_v->hdr.src_port & udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, + rte_be_to_cpu_16(udp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + rte_be_to_cpu_16(udp_v->hdr.dst_port & udp_m->hdr.dst_port)); +} + +/** + * Add GRE optional Key item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
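+ *
+ * The 32-bit GRE key is split according to the device layout: e.g. a fully
+ * masked key of 0x12345678 is programmed as gre_key_h = 0x123456 and
+ * gre_key_l = 0x78 (value and mask are combined with a bitwise AND first).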
+ */ +static void +flow_dv_translate_item_gre_key(void *matcher, void *key, + const struct rte_flow_item *item) +{ + const rte_be32_t *key_m = item->mask; + const rte_be32_t *key_v = item->spec; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX); + + /* GRE K bit must be on and should already be validated */ + MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, 1); + MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, 1); + if (!key_v) + return; + if (!key_m) + key_m = &gre_key_default_mask; + MLX5_SET(fte_match_set_misc, misc_m, gre_key_h, + rte_be_to_cpu_32(*key_m) >> 8); + MLX5_SET(fte_match_set_misc, misc_v, gre_key_h, + rte_be_to_cpu_32((*key_v) & (*key_m)) >> 8); + MLX5_SET(fte_match_set_misc, misc_m, gre_key_l, + rte_be_to_cpu_32(*key_m) & 0xFF); + MLX5_SET(fte_match_set_misc, misc_v, gre_key_l, + rte_be_to_cpu_32((*key_v) & (*key_m)) & 0xFF); +} + +/** + * Add GRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_gre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_gre *gre_m = item->mask; + const struct rte_flow_item_gre *gre_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + struct { + union { + __extension__ + struct { + uint16_t version:3; + uint16_t rsvd0:9; + uint16_t s_present:1; + uint16_t k_present:1; + uint16_t rsvd_bit1:1; + uint16_t c_present:1; + }; + uint16_t value; + }; + } gre_crks_rsvd0_ver_m, gre_crks_rsvd0_ver_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE); + if (!gre_v) + return; + if (!gre_m) + gre_m = &rte_flow_item_gre_mask; + MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, + rte_be_to_cpu_16(gre_m->protocol)); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, + rte_be_to_cpu_16(gre_v->protocol & gre_m->protocol)); + gre_crks_rsvd0_ver_m.value = rte_be_to_cpu_16(gre_m->c_rsvd0_ver); + gre_crks_rsvd0_ver_v.value = rte_be_to_cpu_16(gre_v->c_rsvd0_ver); + MLX5_SET(fte_match_set_misc, misc_m, gre_c_present, + gre_crks_rsvd0_ver_m.c_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_c_present, + gre_crks_rsvd0_ver_v.c_present & + gre_crks_rsvd0_ver_m.c_present); + MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, + gre_crks_rsvd0_ver_m.k_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, + gre_crks_rsvd0_ver_v.k_present & + gre_crks_rsvd0_ver_m.k_present); + MLX5_SET(fte_match_set_misc, misc_m, gre_s_present, + gre_crks_rsvd0_ver_m.s_present); + MLX5_SET(fte_match_set_misc, misc_v, gre_s_present, + gre_crks_rsvd0_ver_v.s_present & + gre_crks_rsvd0_ver_m.s_present); +} + +/** + * Add NVGRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. 
+ * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_nvgre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_nvgre *nvgre_m = item->mask; + const struct rte_flow_item_nvgre *nvgre_v = item->spec; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + const char *tni_flow_id_m = (const char *)nvgre_m->tni; + const char *tni_flow_id_v = (const char *)nvgre_v->tni; + char *gre_key_m; + char *gre_key_v; + int size; + int i; + + /* For NVGRE, GRE header fields must be set with defined values. */ + const struct rte_flow_item_gre gre_spec = { + .c_rsvd0_ver = RTE_BE16(0x2000), + .protocol = RTE_BE16(RTE_ETHER_TYPE_TEB) + }; + const struct rte_flow_item_gre gre_mask = { + .c_rsvd0_ver = RTE_BE16(0xB000), + .protocol = RTE_BE16(UINT16_MAX), + }; + const struct rte_flow_item gre_item = { + .spec = &gre_spec, + .mask = &gre_mask, + .last = NULL, + }; + flow_dv_translate_item_gre(matcher, key, &gre_item, inner); + if (!nvgre_v) + return; + if (!nvgre_m) + nvgre_m = &rte_flow_item_nvgre_mask; + size = sizeof(nvgre_m->tni) + sizeof(nvgre_m->flow_id); + gre_key_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, gre_key_h); + gre_key_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, gre_key_h); + memcpy(gre_key_m, tni_flow_id_m, size); + for (i = 0; i < size; ++i) + gre_key_v[i] = gre_key_m[i] & tni_flow_id_v[i]; +} + +/** + * Add VXLAN item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_vxlan(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_vxlan *vxlan_m = item->mask; + const struct rte_flow_item_vxlan *vxlan_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *vni_m; + char *vni_v; + uint16_t dport; + int size; + int i; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? + MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!vxlan_v) + return; + if (!vxlan_m) + vxlan_m = &rte_flow_item_vxlan_mask; + size = sizeof(vxlan_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, vxlan_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, vxlan_vni); + memcpy(vni_m, vxlan_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; +} + +/** + * Add VXLAN-GPE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
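+ *
+ * If the preceding UDP item left the destination port unspecified, the
+ * translation below adds a full match on the default VXLAN-GPE UDP port so
+ * the tunnel header is only interpreted for that port.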
+ */ + +static void +flow_dv_translate_item_vxlan_gpe(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_vxlan_gpe *vxlan_m = item->mask; + const struct rte_flow_item_vxlan_gpe *vxlan_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_3); + void *misc_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + char *vni_m; + char *vni_v; + uint16_t dport; + int size; + int i; + uint8_t flags_m = 0xff; + uint8_t flags_v = 0xc; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? + MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!vxlan_v) + return; + if (!vxlan_m) + vxlan_m = &rte_flow_item_vxlan_gpe_mask; + size = sizeof(vxlan_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc3, misc_m, outer_vxlan_gpe_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc3, misc_v, outer_vxlan_gpe_vni); + memcpy(vni_m, vxlan_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; + if (vxlan_m->flags) { + flags_m = vxlan_m->flags; + flags_v = vxlan_v->flags; + } + MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_flags, flags_m); + MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_flags, flags_v); + MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_next_protocol, + vxlan_m->protocol); + MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_next_protocol, + vxlan_v->protocol); +} + +/** + * Add Geneve item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
+ */ + +static void +flow_dv_translate_item_geneve(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_geneve *geneve_m = item->mask; + const struct rte_flow_item_geneve *geneve_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + uint16_t dport; + uint16_t gbhdr_m; + uint16_t gbhdr_v; + char *vni_m; + char *vni_v; + size_t size, i; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = MLX5_UDP_PORT_GENEVE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!geneve_v) + return; + if (!geneve_m) + geneve_m = &rte_flow_item_geneve_mask; + size = sizeof(geneve_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, geneve_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, geneve_vni); + memcpy(vni_m, geneve_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & geneve_v->vni[i]; + MLX5_SET(fte_match_set_misc, misc_m, geneve_protocol_type, + rte_be_to_cpu_16(geneve_m->protocol)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_protocol_type, + rte_be_to_cpu_16(geneve_v->protocol & geneve_m->protocol)); + gbhdr_m = rte_be_to_cpu_16(geneve_m->ver_opt_len_o_c_rsvd0); + gbhdr_v = rte_be_to_cpu_16(geneve_v->ver_opt_len_o_c_rsvd0); + MLX5_SET(fte_match_set_misc, misc_m, geneve_oam, + MLX5_GENEVE_OAMF_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_oam, + MLX5_GENEVE_OAMF_VAL(gbhdr_v) & MLX5_GENEVE_OAMF_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_m, geneve_opt_len, + MLX5_GENEVE_OPTLEN_VAL(gbhdr_m)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_opt_len, + MLX5_GENEVE_OPTLEN_VAL(gbhdr_v) & + MLX5_GENEVE_OPTLEN_VAL(gbhdr_m)); +} + +/** + * Add MPLS item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] prev_layer + * The protocol layer indicated in previous item. + * @param[in] inner + * Item is inner pattern. 
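+ *
+ * Depending on @p prev_layer the MPLS label is matched over UDP (fixed
+ * MPLS-over-UDP destination port), over GRE (MPLS ethertype in the GRE
+ * protocol field) or directly over IP (MPLS IP protocol number).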
+ */ +static void +flow_dv_translate_item_mpls(void *matcher, void *key, + const struct rte_flow_item *item, + uint64_t prev_layer, + int inner) +{ + const uint32_t *in_mpls_m = item->mask; + const uint32_t *in_mpls_v = item->spec; + uint32_t *out_mpls_m = 0; + uint32_t *out_mpls_v = 0; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + void *misc2_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_2); + void *misc2_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); + void *headers_m = MLX5_ADDR_OF(fte_match_param, matcher, outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + + switch (prev_layer) { + case MLX5_FLOW_LAYER_OUTER_L4_UDP: + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + MLX5_UDP_PORT_MPLS); + break; + case MLX5_FLOW_LAYER_GRE: + MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, 0xffff); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, + RTE_ETHER_TYPE_MPLS); + break; + default: + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_MPLS); + break; + } + if (!in_mpls_v) + return; + if (!in_mpls_m) + in_mpls_m = (const uint32_t *)&rte_flow_item_mpls_mask; + switch (prev_layer) { + case MLX5_FLOW_LAYER_OUTER_L4_UDP: + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m, + outer_first_mpls_over_udp); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp); + break; + case MLX5_FLOW_LAYER_GRE: + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_m, + outer_first_mpls_over_gre); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_gre); + break; + default: + /* Inner MPLS not over GRE is not supported. */ + if (!inner) { + out_mpls_m = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, + misc2_m, + outer_first_mpls); + out_mpls_v = + (uint32_t *)MLX5_ADDR_OF(fte_match_set_misc2, + misc2_v, + outer_first_mpls); + } + break; + } + if (out_mpls_m && out_mpls_v) { + *out_mpls_m = *in_mpls_m; + *out_mpls_v = *in_mpls_v & *in_mpls_m; + } +} + +/** + * Add metadata register item to matcher + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] reg_type + * Type of device metadata register + * @param[in] value + * Register value + * @param[in] mask + * Register mask + */ +static void +flow_dv_match_meta_reg(void *matcher, void *key, + enum modify_reg reg_type, + uint32_t data, uint32_t mask) +{ + void *misc2_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2); + void *misc2_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); + uint32_t temp; + + data &= mask; + switch (reg_type) { + case REG_A: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a, data); + break; + case REG_B: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_b, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_b, data); + break; + case REG_C_0: + /* + * The metadata register C0 field might be divided into + * source vport index and META item value, we should set + * this field according to specified mask, not as whole one. 
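+ * For example, if the item mask covers only 0x0000ff00 of the register,
+ * just those bits are merged into the stored mask and value words by the
+ * read-modify-write below, and the bits holding the source vport index are
+ * preserved.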
+ */ + temp = MLX5_GET(fte_match_set_misc2, misc2_m, metadata_reg_c_0); + temp |= mask; + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_0, temp); + temp = MLX5_GET(fte_match_set_misc2, misc2_v, metadata_reg_c_0); + temp &= ~mask; + temp |= data; + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_0, temp); + break; + case REG_C_1: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_1, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_1, data); + break; + case REG_C_2: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_2, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_2, data); + break; + case REG_C_3: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_3, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_3, data); + break; + case REG_C_4: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_4, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_4, data); + break; + case REG_C_5: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_5, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_5, data); + break; + case REG_C_6: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_6, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_6, data); + break; + case REG_C_7: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_7, mask); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_7, data); + break; + default: + MLX5_ASSERT(false); + break; + } +} + +/** + * Add MARK item to matcher + * + * @param[in] dev + * The device to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_mark(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_item_mark *mark; + uint32_t value; + uint32_t mask; + + mark = item->mask ? (const void *)item->mask : + &rte_flow_item_mark_mask; + mask = mark->id & priv->sh->dv_mark_mask; + mark = (const void *)item->spec; + MLX5_ASSERT(mark); + value = mark->id & priv->sh->dv_mark_mask & mask; + if (mask) { + enum modify_reg reg; + + /* Get the metadata register index for the mark. */ + reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, NULL); + MLX5_ASSERT(reg > 0); + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + mask &= msk_c0; + mask <<= shl_c0; + value <<= shl_c0; + } + flow_dv_match_meta_reg(matcher, key, reg, value, mask); + } +} + +/** + * Add META item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[in] item + * Flow pattern to translate. 
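+ *
+ * The register actually matched on is resolved at translation time via
+ * flow_dv_get_metadata_reg(); when it resolves to REG_C_0, the value and
+ * mask are additionally shifted into the bits reserved for META data.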
+ */ +static void +flow_dv_translate_item_meta(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_attr *attr, + const struct rte_flow_item *item) +{ + const struct rte_flow_item_meta *meta_m; + const struct rte_flow_item_meta *meta_v; + + meta_m = (const void *)item->mask; + if (!meta_m) + meta_m = &rte_flow_item_meta_mask; + meta_v = (const void *)item->spec; + if (meta_v) { + int reg; + uint32_t value = meta_v->data; + uint32_t mask = meta_m->data; + + reg = flow_dv_get_metadata_reg(dev, attr, NULL); + if (reg < 0) + return; + /* + * In datapath code there is no endianness + * coversions for perfromance reasons, all + * pattern conversions are done in rte_flow. + */ + value = rte_cpu_to_be_32(value); + mask = rte_cpu_to_be_32(mask); + if (reg == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + uint32_t shr_c0 = __builtin_clz(priv->sh->dv_meta_mask); + + value >>= shr_c0; + mask >>= shr_c0; +#endif + value <<= shl_c0; + mask <<= shl_c0; + MLX5_ASSERT(msk_c0); + MLX5_ASSERT(!(~msk_c0 & mask)); + } + flow_dv_match_meta_reg(matcher, key, reg, value, mask); + } +} + +/** + * Add vport metadata Reg C0 item to matcher + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] reg + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_meta_vport(void *matcher, void *key, + uint32_t value, uint32_t mask) +{ + flow_dv_match_meta_reg(matcher, key, REG_C_0, value, mask); +} + +/** + * Add tag item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_mlx5_item_tag(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct mlx5_rte_flow_item_tag *tag_v = item->spec; + const struct mlx5_rte_flow_item_tag *tag_m = item->mask; + uint32_t mask, value; + + MLX5_ASSERT(tag_v); + value = tag_v->data; + mask = tag_m ? tag_m->data : UINT32_MAX; + if (tag_v->id == REG_C_0) { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); + + mask &= msk_c0; + mask <<= shl_c0; + value <<= shl_c0; + } + flow_dv_match_meta_reg(matcher, key, tag_v->id, value, mask); +} + +/** + * Add TAG item to matcher + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + */ +static void +flow_dv_translate_item_tag(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct rte_flow_item_tag *tag_v = item->spec; + const struct rte_flow_item_tag *tag_m = item->mask; + enum modify_reg reg; + + MLX5_ASSERT(tag_v); + tag_m = tag_m ? tag_m : &rte_flow_item_tag_mask; + /* Get the metadata register index for the tag. */ + reg = mlx5_flow_get_reg_id(dev, MLX5_APP_TAG, tag_v->index, NULL); + MLX5_ASSERT(reg > 0); + flow_dv_match_meta_reg(matcher, key, reg, tag_v->data, tag_m->data); +} + +/** + * Add source vport match to the specified matcher. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. 
+ * @param[in] port + * Source vport value to match + * @param[in] mask + * Mask + */ +static void +flow_dv_translate_item_source_vport(void *matcher, void *key, + int16_t port, uint16_t mask) +{ + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + + MLX5_SET(fte_match_set_misc, misc_m, source_port, mask); + MLX5_SET(fte_match_set_misc, misc_v, source_port, port); +} + +/** + * Translate port-id item to eswitch match on port-id. + * + * @param[in] dev + * The devich to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +flow_dv_translate_item_port_id(struct rte_eth_dev *dev, void *matcher, + void *key, const struct rte_flow_item *item) +{ + const struct rte_flow_item_port_id *pid_m = item ? item->mask : NULL; + const struct rte_flow_item_port_id *pid_v = item ? item->spec : NULL; + struct mlx5_priv *priv; + uint16_t mask, id; + + mask = pid_m ? pid_m->id : 0xffff; + id = pid_v ? pid_v->id : dev->data->port_id; + priv = mlx5_port_to_eswitch_info(id, item == NULL); + if (!priv) + return -rte_errno; + /* Translate to vport field or to metadata, depending on mode. */ + if (priv->vport_meta_mask) + flow_dv_translate_item_meta_vport(matcher, key, + priv->vport_meta_tag, + priv->vport_meta_mask); + else + flow_dv_translate_item_source_vport(matcher, key, + priv->vport_id, mask); + return 0; +} + +/** + * Add ICMP6 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_icmp6(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_icmp6 *icmp6_m = item->mask; + const struct rte_flow_item_icmp6 *icmp6_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMPV6); + if (!icmp6_v) + return; + if (!icmp6_m) + icmp6_m = &rte_flow_item_icmp6_mask; + /* + * Force flow only to match the non-fragmented IPv6 ICMPv6 packets. + * If only the protocol is specified, no need to match the frag. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_type, icmp6_m->type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_type, + icmp6_v->type & icmp6_m->type); + MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_code, icmp6_m->code); + MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_code, + icmp6_v->code & icmp6_m->code); +} + +/** + * Add ICMP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. 
+ * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_icmp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_icmp *icmp_m = item->mask; + const struct rte_flow_item_icmp *icmp_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_ICMP); + if (!icmp_v) + return; + if (!icmp_m) + icmp_m = &rte_flow_item_icmp_mask; + /* + * Force flow only to match the non-fragmented IPv4 ICMP packets. + * If only the protocol is specified, no need to match the frag. + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmp_type, + icmp_m->hdr.icmp_type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmp_type, + icmp_v->hdr.icmp_type & icmp_m->hdr.icmp_type); + MLX5_SET(fte_match_set_misc3, misc3_m, icmp_code, + icmp_m->hdr.icmp_code); + MLX5_SET(fte_match_set_misc3, misc3_v, icmp_code, + icmp_v->hdr.icmp_code & icmp_m->hdr.icmp_code); +} + +/** + * Add GTP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
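+ *
+ * As with the other UDP based tunnels, if the preceding UDP item did not
+ * specify a destination port, a match on the default GTP-U UDP port is
+ * added.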
+ */ +static void +flow_dv_translate_item_gtp(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_gtp *gtp_m = item->mask; + const struct rte_flow_item_gtp *gtp_v = item->spec; + void *headers_m; + void *headers_v; + void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher, + misc_parameters_3); + void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); + uint16_t dport = RTE_GTPU_UDP_PORT; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!gtp_v) + return; + if (!gtp_m) + gtp_m = &rte_flow_item_gtp_mask; + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_flags, + gtp_m->v_pt_rsv_flags); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_flags, + gtp_v->v_pt_rsv_flags & gtp_m->v_pt_rsv_flags); + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_msg_type, gtp_m->msg_type); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_msg_type, + gtp_v->msg_type & gtp_m->msg_type); + MLX5_SET(fte_match_set_misc3, misc3_m, gtpu_teid, + rte_be_to_cpu_32(gtp_m->teid)); + MLX5_SET(fte_match_set_misc3, misc3_v, gtpu_teid, + rte_be_to_cpu_32(gtp_v->teid & gtp_m->teid)); +} + +static uint32_t matcher_zero[MLX5_ST_SZ_DW(fte_match_param)] = { 0 }; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memcmp(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + matcher_zero, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +/** + * Calculate flow matcher enable bitmap. + * + * @param match_criteria + * Pointer to flow matcher criteria. + * + * @return + * Bitmap of enabled fields. + */ +static uint8_t +flow_dv_matcher_enable(uint32_t *match_criteria) +{ + uint8_t match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_3)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC3_BIT; + return match_criteria_enable; +} + + +/** + * Get a flow table. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in] table_id + * Table id to use. + * @param[in] egress + * Direction of the table. + * @param[in] transfer + * E-Switch or NIC flow. + * @param[out] error + * pointer to error structure. + * + * @return + * Returns tables resource based on the index, NULL in case of failed. 
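+ *
+ * The returned table is reference counted; a sketch of the expected pairing
+ * (variable names are illustrative):
+ *
+ *   tbl = flow_dv_tbl_resource_get(dev, table_id, egress, transfer, error);
+ *   if (!tbl)
+ *       return -rte_errno;
+ *   ... use tbl->obj ...
+ *   flow_dv_tbl_resource_release(dev, tbl);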
+ */ +static struct mlx5_flow_tbl_resource * +flow_dv_tbl_resource_get(struct rte_eth_dev *dev, + uint32_t table_id, uint8_t egress, + uint8_t transfer, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_resource *tbl; + union mlx5_flow_tbl_key table_key = { + { + .table_id = table_id, + .reserved = 0, + .domain = !!transfer, + .direction = !!egress, + } + }; + struct mlx5_hlist_entry *pos = mlx5_hlist_lookup(sh->flow_tbls, + table_key.v64); + struct mlx5_flow_tbl_data_entry *tbl_data; + uint32_t idx = 0; + int ret; + void *domain; + + if (pos) { + tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry, + entry); + tbl = &tbl_data->tbl; + rte_atomic32_inc(&tbl->refcnt); + return tbl; + } + tbl_data = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_JUMP], &idx); + if (!tbl_data) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot allocate flow table data entry"); + return NULL; + } + tbl_data->idx = idx; + tbl = &tbl_data->tbl; + pos = &tbl_data->entry; + if (transfer) + domain = sh->fdb_domain; + else if (egress) + domain = sh->tx_domain; + else + domain = sh->rx_domain; + tbl->obj = mlx5_glue->dr_create_flow_tbl(domain, table_id); + if (!tbl->obj) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create flow table object"); + mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx); + return NULL; + } + /* + * No multi-threads now, but still better to initialize the reference + * count before insert it into the hash list. + */ + rte_atomic32_init(&tbl->refcnt); + /* Jump action reference count is initialized here. */ + rte_atomic32_init(&tbl_data->jump.refcnt); + pos->key = table_key.v64; + ret = mlx5_hlist_insert(sh->flow_tbls, pos); + if (ret < 0) { + rte_flow_error_set(error, -ret, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot insert flow table data entry"); + mlx5_glue->dr_destroy_flow_tbl(tbl->obj); + mlx5_ipool_free(sh->ipool[MLX5_IPOOL_JUMP], idx); + } + rte_atomic32_inc(&tbl->refcnt); + return tbl; +} + +/** + * Release a flow table. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] tbl + * Table resource to be released. + * + * @return + * Returns 0 if table was released, else return 1; + */ +static int +flow_dv_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_tbl_resource *tbl) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_tbl_data_entry *tbl_data = + container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + + if (!tbl) + return 0; + if (rte_atomic32_dec_and_test(&tbl->refcnt)) { + struct mlx5_hlist_entry *pos = &tbl_data->entry; + + mlx5_glue->dr_destroy_flow_tbl(tbl->obj); + tbl->obj = NULL; + /* remove the entry from the hash list and free memory. */ + mlx5_hlist_remove(sh->flow_tbls, pos); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_JUMP], + tbl_data->idx); + return 0; + } + return 1; +} + +/** + * Register the flow matcher. + * + * @param[in, out] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] matcher + * Pointer to flow matcher. + * @param[in, out] key + * Pointer to flow table key. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. 
+ */ +static int +flow_dv_matcher_register(struct rte_eth_dev *dev, + struct mlx5_flow_dv_matcher *matcher, + union mlx5_flow_tbl_key *key, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_matcher *cache_matcher; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .match_mask = (void *)&matcher->mask, + }; + struct mlx5_flow_tbl_resource *tbl; + struct mlx5_flow_tbl_data_entry *tbl_data; + + tbl = flow_dv_tbl_resource_get(dev, key->table_id, key->direction, + key->domain, error); + if (!tbl) + return -rte_errno; /* No need to refill the error info */ + tbl_data = container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl); + /* Lookup from cache. */ + LIST_FOREACH(cache_matcher, &tbl_data->matchers, next) { + if (matcher->crc == cache_matcher->crc && + matcher->priority == cache_matcher->priority && + !memcmp((const void *)matcher->mask.buf, + (const void *)cache_matcher->mask.buf, + cache_matcher->mask.size)) { + DRV_LOG(DEBUG, + "%s group %u priority %hd use %s " + "matcher %p: refcnt %d++", + key->domain ? "FDB" : "NIC", key->table_id, + cache_matcher->priority, + key->direction ? "tx" : "rx", + (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + rte_atomic32_inc(&cache_matcher->refcnt); + dev_flow->handle->dvh.matcher = cache_matcher; + /* old matcher should not make the table ref++. */ + flow_dv_tbl_resource_release(dev, tbl); + return 0; + } + } + /* Register new matcher. */ + cache_matcher = rte_calloc(__func__, 1, sizeof(*cache_matcher), 0); + if (!cache_matcher) { + flow_dv_tbl_resource_release(dev, tbl); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate matcher memory"); + } + *cache_matcher = *matcher; + dv_attr.match_criteria_enable = + flow_dv_matcher_enable(cache_matcher->mask.buf); + dv_attr.priority = matcher->priority; + if (key->direction) + dv_attr.flags |= IBV_FLOW_ATTR_FLAGS_EGRESS; + cache_matcher->matcher_object = + mlx5_glue->dv_create_flow_matcher(sh->ctx, &dv_attr, tbl->obj); + if (!cache_matcher->matcher_object) { + rte_free(cache_matcher); +#ifdef HAVE_MLX5DV_DR + flow_dv_tbl_resource_release(dev, tbl); +#endif + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create matcher"); + } + /* Save the table information */ + cache_matcher->tbl = tbl; + rte_atomic32_init(&cache_matcher->refcnt); + /* only matcher ref++, table ref++ already done above in get API. */ + rte_atomic32_inc(&cache_matcher->refcnt); + LIST_INSERT_HEAD(&tbl_data->matchers, cache_matcher, next); + dev_flow->handle->dvh.matcher = cache_matcher; + DRV_LOG(DEBUG, "%s group %u priority %hd new %s matcher %p: refcnt %d", + key->domain ? "FDB" : "NIC", key->table_id, + cache_matcher->priority, + key->direction ? "tx" : "rx", (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + return 0; +} + +/** + * Find existing tag resource or create and register a new one. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] tag_be24 + * Tag value in big endian then R-shift 8. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. 
+ */ +static int +flow_dv_tag_resource_register + (struct rte_eth_dev *dev, + uint32_t tag_be24, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_tag_resource *cache_resource; + struct mlx5_hlist_entry *entry; + + /* Lookup a matching resource from cache. */ + entry = mlx5_hlist_lookup(sh->tag_table, (uint64_t)tag_be24); + if (entry) { + cache_resource = container_of + (entry, struct mlx5_flow_dv_tag_resource, entry); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->handle->dvh.rix_tag = cache_resource->idx; + dev_flow->dv.tag_resource = cache_resource; + DRV_LOG(DEBUG, "cached tag resource %p: refcnt now %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; + } + /* Register new resource. */ + cache_resource = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_TAG], + &dev_flow->handle->dvh.rix_tag); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + cache_resource->entry.key = (uint64_t)tag_be24; + cache_resource->action = mlx5_glue->dv_create_flow_action_tag(tag_be24); + if (!cache_resource->action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + if (mlx5_hlist_insert(sh->tag_table, &cache_resource->entry)) { + mlx5_glue->destroy_flow_action(cache_resource->action); + rte_free(cache_resource); + return rte_flow_error_set(error, EEXIST, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot insert tag"); + } + dev_flow->dv.tag_resource = cache_resource; + DRV_LOG(DEBUG, "new tag resource %p: refcnt now %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Release the tag. + * + * @param dev + * Pointer to Ethernet device. + * @param tag_idx + * Tag index. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_tag_release(struct rte_eth_dev *dev, + uint32_t tag_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_tag_resource *tag; + + tag = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx); + if (!tag) + return 0; + DRV_LOG(DEBUG, "port %u tag %p: refcnt %d--", + dev->data->port_id, (void *)tag, + rte_atomic32_read(&tag->refcnt)); + if (rte_atomic32_dec_and_test(&tag->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action(tag->action)); + mlx5_hlist_remove(sh->tag_table, &tag->entry); + DRV_LOG(DEBUG, "port %u tag %p: removed", + dev->data->port_id, (void *)tag); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_TAG], tag_idx); + return 0; + } + return 1; +} + +/** + * Translate port ID action to vport. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to the port ID action. + * @param[out] dst_port_id + * The target port ID. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */
+static int
+flow_dv_translate_action_port_id(struct rte_eth_dev *dev,
+				 const struct rte_flow_action *action,
+				 uint32_t *dst_port_id,
+				 struct rte_flow_error *error)
+{
+	uint32_t port;
+	struct mlx5_priv *priv;
+	const struct rte_flow_action_port_id *conf =
+			(const struct rte_flow_action_port_id *)action->conf;
+
+	port = conf->original ? dev->data->port_id : conf->id;
+	priv = mlx5_port_to_eswitch_info(port, false);
+	if (!priv)
+		return rte_flow_error_set(error, -rte_errno,
+					  RTE_FLOW_ERROR_TYPE_ACTION,
+					  NULL,
+					  "No eswitch info was found for port");
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+	/*
+	 * This parameter is transferred to
+	 * mlx5dv_dr_action_create_dest_ib_port().
+	 */
+	*dst_port_id = priv->ibv_port;
+#else
+	/*
+	 * Legacy mode, no LAG configuration is supported.
+	 * This parameter is transferred to
+	 * mlx5dv_dr_action_create_dest_vport().
+	 */
+	*dst_port_id = priv->vport_id;
+#endif
+	return 0;
+}
+
+/**
+ * Create a counter with aging configuration.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in] dev_flow
+ *   Pointer to the mlx5_flow.
+ * @param[in] count
+ *   Pointer to the counter action configuration.
+ * @param[in] age
+ *   Pointer to the aging action configuration.
+ *
+ * @return
+ *   Index to flow counter on success, 0 otherwise.
+ */
+static uint32_t
+flow_dv_translate_create_counter(struct rte_eth_dev *dev,
+				struct mlx5_flow *dev_flow,
+				const struct rte_flow_action_count *count,
+				const struct rte_flow_action_age *age)
+{
+	uint32_t counter;
+	struct mlx5_age_param *age_param;
+
+	counter = flow_dv_counter_alloc(dev,
+				count ? count->shared : 0,
+				count ? count->id : 0,
+				dev_flow->dv.group, !!age);
+	if (!counter || age == NULL)
+		return counter;
+	age_param = flow_dv_counter_idx_get_age(dev, counter);
+	age_param->context = age->context ? age->context :
+		(void *)(uintptr_t)(dev_flow->flow_idx);
+	/*
+	 * The counter age accuracy may have a bit delay. Have 3/4
+	 * second bias on the timeout in order to let it age in time.
+	 */
+	age_param->timeout = age->timeout * 10 - MLX5_AGING_TIME_DELAY;
+	age_param->port_id = dev->data->port_id;
+	/* Set expire time in unit of 0.1 sec. */
+	age_param->expire = age_param->timeout +
+			rte_rdtsc() / (rte_get_tsc_hz() / 10);
+	rte_atomic16_set(&age_param->state, AGE_CANDIDATE);
+	return counter;
+}
+
+/**
+ * Add Tx queue matcher.
+ *
+ * @param[in] dev
+ *   Pointer to the dev struct.
+ * @param[in, out] matcher
+ *   Flow matcher.
+ * @param[in, out] key
+ *   Flow matcher value.
+ * @param[in] item
+ *   Flow pattern to translate.
+ */ +static void +flow_dv_translate_item_tx_queue(struct rte_eth_dev *dev, + void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct mlx5_rte_flow_item_tx_queue *queue_m; + const struct mlx5_rte_flow_item_tx_queue *queue_v; + void *misc_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + struct mlx5_txq_ctrl *txq; + uint32_t queue; + + + queue_m = (const void *)item->mask; + if (!queue_m) + return; + queue_v = (const void *)item->spec; + if (!queue_v) + return; + txq = mlx5_txq_get(dev, queue_v->queue); + if (!txq) + return; + queue = txq->obj->sq->id; + MLX5_SET(fte_match_set_misc, misc_m, source_sqn, queue_m->queue); + MLX5_SET(fte_match_set_misc, misc_v, source_sqn, + queue & queue_m->queue); + mlx5_txq_release(dev, queue_v->queue); +} + +/** + * Set the hash fields according to the @p flow information. + * + * @param[in] dev_flow + * Pointer to the mlx5_flow. + * @param[in] rss_desc + * Pointer to the mlx5_flow_rss_desc. + */ +static void +flow_dv_hashfields_set(struct mlx5_flow *dev_flow, + struct mlx5_flow_rss_desc *rss_desc) +{ + uint64_t items = dev_flow->handle->layers; + int rss_inner = 0; + uint64_t rss_types = rte_eth_rss_hf_refine(rss_desc->types); + + dev_flow->hash_fields = 0; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (rss_desc->level >= 2) { + dev_flow->hash_fields |= IBV_RX_HASH_INNER; + rss_inner = 1; + } +#endif + if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV4)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV4))) { + if (rss_types & MLX5_IPV4_LAYER_TYPES) { + if (rss_types & ETH_RSS_L3_SRC_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV4; + else if (rss_types & ETH_RSS_L3_DST_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV4; + else + dev_flow->hash_fields |= MLX5_IPV4_IBV_RX_HASH; + } + } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L3_IPV6)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L3_IPV6))) { + if (rss_types & MLX5_IPV6_LAYER_TYPES) { + if (rss_types & ETH_RSS_L3_SRC_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_SRC_IPV6; + else if (rss_types & ETH_RSS_L3_DST_ONLY) + dev_flow->hash_fields |= IBV_RX_HASH_DST_IPV6; + else + dev_flow->hash_fields |= MLX5_IPV6_IBV_RX_HASH; + } + } + if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_UDP)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_UDP))) { + if (rss_types & ETH_RSS_UDP) { + if (rss_types & ETH_RSS_L4_SRC_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_SRC_PORT_UDP; + else if (rss_types & ETH_RSS_L4_DST_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_DST_PORT_UDP; + else + dev_flow->hash_fields |= MLX5_UDP_IBV_RX_HASH; + } + } else if ((rss_inner && (items & MLX5_FLOW_LAYER_INNER_L4_TCP)) || + (!rss_inner && (items & MLX5_FLOW_LAYER_OUTER_L4_TCP))) { + if (rss_types & ETH_RSS_TCP) { + if (rss_types & ETH_RSS_L4_SRC_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_SRC_PORT_TCP; + else if (rss_types & ETH_RSS_L4_DST_ONLY) + dev_flow->hash_fields |= + IBV_RX_HASH_DST_PORT_TCP; + else + dev_flow->hash_fields |= MLX5_TCP_IBV_RX_HASH; + } + } +} + +/** + * Fill the flow with DV spec, lock free + * (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the sub flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. 
+ * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +__flow_dv_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *dev_conf = &priv->config; + struct rte_flow *flow = dev_flow->flow; + struct mlx5_flow_handle *handle = dev_flow->handle; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc) + [!!priv->flow_nested_idx]; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint64_t action_flags = 0; + uint64_t priority = attr->priority; + struct mlx5_flow_dv_matcher matcher = { + .mask = { + .size = sizeof(matcher.mask.buf), + }, + }; + int actions_n = 0; + bool actions_end = false; + union { + struct mlx5_flow_dv_modify_hdr_resource res; + uint8_t len[sizeof(struct mlx5_flow_dv_modify_hdr_resource) + + sizeof(struct mlx5_modification_cmd) * + (MLX5_MAX_MODIFY_NUM + 1)]; + } mhdr_dummy; + struct mlx5_flow_dv_modify_hdr_resource *mhdr_res = &mhdr_dummy.res; + const struct rte_flow_action_count *count = NULL; + const struct rte_flow_action_age *age = NULL; + union flow_dv_attr flow_attr = { .attr = 0 }; + uint32_t tag_be; + union mlx5_flow_tbl_key tbl_key; + uint32_t modify_action_position = UINT32_MAX; + void *match_mask = matcher.mask.buf; + void *match_value = dev_flow->dv.value.buf; + uint8_t next_protocol = 0xff; + struct rte_vlan_hdr vlan = { 0 }; + uint32_t table; + int ret = 0; + + mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group, + !!priv->fdb_def_rule, &table, error); + if (ret) + return ret; + dev_flow->dv.group = table; + if (attr->transfer) + mhdr_res->ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = dev_conf->flow_prio - 1; + /* number of actions must be set to 0 in case of dirty stack. 
*/ + mhdr_res->actions_num = 0; + for (; !actions_end ; actions++) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_flow_action *action = actions; + const uint8_t *rss_key; + const struct rte_flow_action_jump *jump_data; + const struct rte_flow_action_meter *mtr; + struct mlx5_flow_tbl_resource *tbl; + uint32_t port_id = 0; + struct mlx5_flow_dv_port_id_action_resource port_id_resource; + int action_type = actions->type; + const struct rte_flow_action *found_action = NULL; + struct mlx5_flow_meter *fm = NULL; + + switch (action_type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + if (flow_dv_translate_action_port_id(dev, action, + &port_id, error)) + return -rte_errno; + port_id_resource.port_id = port_id; + MLX5_ASSERT(!handle->rix_port_id_action); + if (flow_dv_port_id_action_resource_register + (dev, &port_id_resource, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.port_id_action->action; + action_flags |= MLX5_FLOW_ACTION_PORT_ID; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_PORT_ID; + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + action_flags |= MLX5_FLOW_ACTION_FLAG; + dev_flow->handle->mark = 1; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + struct rte_flow_action_mark mark = { + .id = MLX5_FLOW_MARK_DEFAULT, + }; + + if (flow_dv_convert_action_mark(dev, &mark, + mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; + break; + } + tag_be = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT); + /* + * Only one FLAG or MARK is supported per device flow + * right now. So the pointer to the tag resource must be + * zero before the register process. + */ + MLX5_ASSERT(!handle->dvh.rix_tag); + if (flow_dv_tag_resource_register(dev, tag_be, + dev_flow, error)) + return -rte_errno; + MLX5_ASSERT(dev_flow->dv.tag_resource); + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.tag_resource->action; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + action_flags |= MLX5_FLOW_ACTION_MARK; + dev_flow->handle->mark = 1; + if (dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + const struct rte_flow_action_mark *mark = + (const struct rte_flow_action_mark *) + actions->conf; + + if (flow_dv_convert_action_mark(dev, mark, + mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; + break; + } + /* Fall-through */ + case MLX5_RTE_FLOW_ACTION_TYPE_MARK: + /* Legacy (non-extensive) MARK action. 
*/ + tag_be = mlx5_flow_mark_set + (((const struct rte_flow_action_mark *) + (actions->conf))->id); + MLX5_ASSERT(!handle->dvh.rix_tag); + if (flow_dv_tag_resource_register(dev, tag_be, + dev_flow, error)) + return -rte_errno; + MLX5_ASSERT(dev_flow->dv.tag_resource); + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.tag_resource->action; + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + if (flow_dv_convert_action_set_meta + (dev, mhdr_res, attr, + (const struct rte_flow_action_set_meta *) + actions->conf, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_META; + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + if (flow_dv_convert_action_set_tag + (dev, mhdr_res, + (const struct rte_flow_action_set_tag *) + actions->conf, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + action_flags |= MLX5_FLOW_ACTION_DROP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = actions->conf; + rss_desc->queue_num = 1; + rss_desc->queue[0] = queue->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + memcpy(rss_desc->queue, rss->queue, + rss->queue_num * sizeof(uint16_t)); + rss_desc->queue_num = rss->queue_num; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* + * rss->level and rss.types should be set in advance + * when expanding items for RSS. + */ + action_flags |= MLX5_FLOW_ACTION_RSS; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_AGE: + case RTE_FLOW_ACTION_TYPE_COUNT: + if (!dev_conf->devx) { + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "count action not supported"); + } + /* Save information first, will apply later. 
*/ + if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT) + count = action->conf; + else + age = action->conf; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + dev_flow->dv.actions[actions_n++] = + priv->sh->pop_vlan_action; + action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + if (!(action_flags & + MLX5_FLOW_ACTION_OF_SET_VLAN_VID)) + flow_dev_get_vlan_info_from_items(items, &vlan); + vlan.eth_proto = rte_be_to_cpu_16 + ((((const struct rte_flow_action_of_push_vlan *) + actions->conf)->ethertype)); + found_action = mlx5_flow_find_action + (actions + 1, + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID); + if (found_action) + mlx5_update_vlan_vid_pcp(found_action, &vlan); + found_action = mlx5_flow_find_action + (actions + 1, + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP); + if (found_action) + mlx5_update_vlan_vid_pcp(found_action, &vlan); + if (flow_dv_create_action_push_vlan + (dev, attr, &vlan, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.push_vlan_res->action; + action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + /* of_vlan_push action handled this action */ + MLX5_ASSERT(action_flags & + MLX5_FLOW_ACTION_OF_PUSH_VLAN); + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + if (action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) + break; + flow_dev_get_vlan_info_from_items(items, &vlan); + mlx5_update_vlan_vid_pcp(actions, &vlan); + /* If no VLAN push - this is a modify header action */ + if (flow_dv_convert_action_modify_vlan_vid + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + if (flow_dv_create_action_l2_encap(dev, actions, + dev_flow, + attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + if (flow_dv_create_action_l2_decap(dev, dev_flow, + attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + /* Handle encap with preceding decap. */ + if (action_flags & MLX5_FLOW_ACTION_DECAP) { + if (flow_dv_create_action_raw_encap + (dev, actions, dev_flow, attr, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } else { + /* Handle encap without preceding decap. */ + if (flow_dv_create_action_l2_encap + (dev, actions, dev_flow, attr->transfer, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } + action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + while ((++action)->type == RTE_FLOW_ACTION_TYPE_VOID) + ; + if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + if (flow_dv_create_action_l2_decap + (dev, dev_flow, attr->transfer, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } + /* If decap is followed by encap, handle it at encap. 
*/ + action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + jump_data = action->conf; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, + jump_data->group, + !!priv->fdb_def_rule, + &table, error); + if (ret) + return ret; + tbl = flow_dv_tbl_resource_get(dev, table, + attr->egress, + attr->transfer, error); + if (!tbl) + return rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create jump action."); + if (flow_dv_jump_tbl_resource_register + (dev, tbl, dev_flow, error)) { + flow_dv_tbl_resource_release(dev, tbl); + return rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create jump action."); + } + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.jump->action; + action_flags |= MLX5_FLOW_ACTION_JUMP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_JUMP; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + if (flow_dv_convert_action_modify_mac + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + MLX5_FLOW_ACTION_SET_MAC_SRC : + MLX5_FLOW_ACTION_SET_MAC_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + if (flow_dv_convert_action_modify_ipv4 + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + MLX5_FLOW_ACTION_SET_IPV4_SRC : + MLX5_FLOW_ACTION_SET_IPV4_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + if (flow_dv_convert_action_modify_ipv6 + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + MLX5_FLOW_ACTION_SET_IPV6_SRC : + MLX5_FLOW_ACTION_SET_IPV6_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + if (flow_dv_convert_action_modify_tp + (mhdr_res, actions, items, + &flow_attr, dev_flow, !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + MLX5_FLOW_ACTION_SET_TP_SRC : + MLX5_FLOW_ACTION_SET_TP_DST; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + if (flow_dv_convert_action_modify_dec_ttl + (mhdr_res, items, &flow_attr, dev_flow, + !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + if (flow_dv_convert_action_modify_ttl + (mhdr_res, actions, items, &flow_attr, + dev_flow, !!(action_flags & + MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: + if (flow_dv_convert_action_modify_tcp_seq + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? + MLX5_FLOW_ACTION_INC_TCP_SEQ : + MLX5_FLOW_ACTION_DEC_TCP_SEQ; + break; + + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: + if (flow_dv_convert_action_modify_tcp_ack + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? 
+ MLX5_FLOW_ACTION_INC_TCP_ACK : + MLX5_FLOW_ACTION_DEC_TCP_ACK; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + if (flow_dv_convert_action_set_reg + (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: + if (flow_dv_convert_action_copy_mreg + (dev, mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_METER: + mtr = actions->conf; + if (!flow->meter) { + fm = mlx5_flow_meter_attach(priv, mtr->mtr_id, + attr, error); + if (!fm) + return rte_flow_error_set(error, + rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "meter not found " + "or invalid parameters"); + flow->meter = fm->idx; + } + /* Set the meter action. */ + if (!fm) { + fm = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_MTR], flow->meter); + if (!fm) + return rte_flow_error_set(error, + rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "meter not found " + "or invalid parameters"); + } + dev_flow->dv.actions[actions_n++] = + fm->mfts->meter_action; + action_flags |= MLX5_FLOW_ACTION_METER; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP: + if (flow_dv_convert_action_modify_ipv4_dscp(mhdr_res, + actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_IPV4_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: + if (flow_dv_convert_action_modify_ipv6_dscp(mhdr_res, + actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP; + break; + case RTE_FLOW_ACTION_TYPE_END: + actions_end = true; + if (mhdr_res->actions_num) { + /* create modify action if needed. */ + if (flow_dv_modify_hdr_resource_register + (dev, mhdr_res, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[modify_action_position] = + handle->dvh.modify_hdr->verbs_action; + } + if (action_flags & MLX5_FLOW_ACTION_COUNT) { + flow->counter = + flow_dv_translate_create_counter(dev, + dev_flow, count, age); + + if (!flow->counter) + return rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot create counter" + " object."); + dev_flow->dv.actions[actions_n++] = + (flow_dv_counter_get_by_idx(dev, + flow->counter, NULL))->action; + } + break; + default: + break; + } + if (mhdr_res->actions_num && + modify_action_position == UINT32_MAX) + modify_action_position = actions_n++; + } + dev_flow->dv.actions_n = actions_n; + dev_flow->act_flags = action_flags; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int item_type = items->type; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_PORT_ID: + flow_dv_translate_item_port_id(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_PORT_ID; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + flow_dv_translate_item_eth(match_mask, match_value, + items, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L2; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_dv_translate_item_vlan(dev_flow, + match_mask, match_value, + items, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L2; + last_item = tunnel ? 
(MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv4(match_mask, match_value, + items, item_flags, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv6(match_mask, match_value, + items, item_flags, tunnel, + dev_flow->dv.group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_dv_translate_item_tcp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_dv_translate_item_udp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_dv_translate_item_gre(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_GRE_KEY: + flow_dv_translate_item_gre_key(match_mask, + match_value, items); + last_item = MLX5_FLOW_LAYER_GRE_KEY; + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + flow_dv_translate_item_nvgre(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_dv_translate_item_vxlan(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_dv_translate_item_vxlan_gpe(match_mask, + match_value, items, + tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GENEVE: + flow_dv_translate_item_geneve(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? 
+ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_dv_translate_item_mpls(match_mask, match_value, + items, last_item, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + case RTE_FLOW_ITEM_TYPE_MARK: + flow_dv_translate_item_mark(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_MARK; + break; + case RTE_FLOW_ITEM_TYPE_META: + flow_dv_translate_item_meta(dev, match_mask, + match_value, attr, items); + last_item = MLX5_FLOW_ITEM_METADATA; + break; + case RTE_FLOW_ITEM_TYPE_ICMP: + flow_dv_translate_item_icmp(match_mask, match_value, + items, tunnel); + last_item = MLX5_FLOW_LAYER_ICMP; + break; + case RTE_FLOW_ITEM_TYPE_ICMP6: + flow_dv_translate_item_icmp6(match_mask, match_value, + items, tunnel); + last_item = MLX5_FLOW_LAYER_ICMP6; + break; + case RTE_FLOW_ITEM_TYPE_TAG: + flow_dv_translate_item_tag(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TAG: + flow_dv_translate_mlx5_item_tag(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE: + flow_dv_translate_item_tx_queue(dev, match_mask, + match_value, + items); + last_item = MLX5_FLOW_ITEM_TX_QUEUE; + break; + case RTE_FLOW_ITEM_TYPE_GTP: + flow_dv_translate_item_gtp(match_mask, match_value, + items, tunnel); + matcher.priority = rss_desc->level >= 2 ? + MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GTP; + break; + default: + break; + } + item_flags |= last_item; + } + /* + * When E-Switch mode is enabled, we have two cases where we need to + * set the source port manually. + * The first one, is in case of Nic steering rule, and the second is + * E-Switch rule where no port_id item was found. In both cases + * the source port is set according the current port in use. + */ + if (!(item_flags & MLX5_FLOW_ITEM_PORT_ID) && + (priv->representor || priv->master)) { + if (flow_dv_translate_item_port_id(dev, match_mask, + match_value, NULL)) + return -rte_errno; + } +#ifdef RTE_LIBRTE_MLX5_DEBUG + MLX5_ASSERT(!flow_dv_check_valid_spec(matcher.mask.buf, + dev_flow->dv.value.buf)); +#endif + /* + * Layers may be already initialized from prefix flow if this dev_flow + * is the suffix flow. + */ + handle->layers |= item_flags; + if (action_flags & MLX5_FLOW_ACTION_RSS) + flow_dv_hashfields_set(dev_flow, rss_desc); + /* Register matcher. */ + matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf, + matcher.mask.size); + matcher.priority = mlx5_flow_adjust_priority(dev, priority, + matcher.priority); + /* reserved field no needs to be set to 0 here. */ + tbl_key.domain = attr->transfer; + tbl_key.direction = attr->egress; + tbl_key.table_id = dev_flow->dv.group; + if (flow_dv_matcher_register(dev, &matcher, &tbl_key, dev_flow, error)) + return -rte_errno; + return 0; +} + +/** + * Apply the flow to the NIC, lock free, + * (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +__flow_dv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_workspace *dv; + struct mlx5_flow_handle *dh; + struct mlx5_flow_handle_dv *dv_h; + struct mlx5_flow *dev_flow; + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t handle_idx; + int n; + int err; + int idx; + + for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) { + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx]; + dv = &dev_flow->dv; + dh = dev_flow->handle; + dv_h = &dh->dvh; + n = dv->actions_n; + if (dh->fate_action == MLX5_FLOW_FATE_DROP) { + if (dv->transfer) { + dv->actions[n++] = priv->sh->esw_drop_action; + } else { + struct mlx5_hrxq *drop_hrxq; + drop_hrxq = mlx5_hrxq_drop_new(dev); + if (!drop_hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot get drop hash queue"); + goto error; + } + /* + * Drop queues will be released by the specify + * mlx5_hrxq_drop_release() function. Assign + * the special index to hrxq to mark the queue + * has been allocated. + */ + dh->rix_hrxq = UINT32_MAX; + dv->actions[n++] = drop_hrxq->action; + } + } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) { + struct mlx5_hrxq *hrxq; + uint32_t hrxq_idx; + struct mlx5_flow_rss_desc *rss_desc = + &((struct mlx5_flow_rss_desc *)priv->rss_desc) + [!!priv->flow_nested_idx]; + + MLX5_ASSERT(rss_desc->queue_num); + hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num); + if (!hrxq_idx) { + hrxq_idx = mlx5_hrxq_new + (dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num, + !!(dh->layers & + MLX5_FLOW_LAYER_TUNNEL)); + } + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + hrxq_idx); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + dh->rix_hrxq = hrxq_idx; + dv->actions[n++] = hrxq->action; + } + dh->ib_flow = + mlx5_glue->dv_create_flow(dv_h->matcher->matcher_object, + (void *)&dv->value, n, + dv->actions); + if (!dh->ib_flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + if (priv->vmwa_context && + dh->vf_vlan.tag && !dh->vf_vlan.created) { + /* + * The rule contains the VLAN pattern. + * For VF we are going to create VLAN + * interface to make hypervisor set correct + * e-Switch vport context. + */ + mlx5_vlan_vmwa_acquire(dev, &dh->vf_vlan); + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, dh, next) { + /* hrxq is union, don't clear it if the flag is not set. */ + if (dh->rix_hrxq) { + if (dh->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + dh->rix_hrxq = 0; + } else if (dh->fate_action == MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, dh->rix_hrxq); + dh->rix_hrxq = 0; + } + } + if (dh->vf_vlan.tag && dh->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &dh->vf_vlan); + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Release the flow matcher. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
+ */ +static int +flow_dv_matcher_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_flow_dv_matcher *matcher = handle->dvh.matcher; + + MLX5_ASSERT(matcher->matcher_object); + DRV_LOG(DEBUG, "port %u matcher %p: refcnt %d--", + dev->data->port_id, (void *)matcher, + rte_atomic32_read(&matcher->refcnt)); + if (rte_atomic32_dec_and_test(&matcher->refcnt)) { + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (matcher->matcher_object)); + LIST_REMOVE(matcher, next); + /* table ref-- in release interface. */ + flow_dv_tbl_resource_release(dev, matcher->tbl); + rte_free(matcher); + DRV_LOG(DEBUG, "port %u matcher %p: removed", + dev->data->port_id, (void *)matcher); + return 0; + } + return 1; +} + +/** + * Release an encap/decap resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_encap_decap_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t idx = handle->dvh.rix_encap_decap; + struct mlx5_flow_dv_encap_decap_resource *cache_resource; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->verbs_action); + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->verbs_action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], + &priv->sh->encaps_decaps, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_DECAP_ENCAP], idx); + DRV_LOG(DEBUG, "encap/decap resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release an jump to table action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_jump_tbl_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_jump_tbl_resource *cache_resource; + struct mlx5_flow_tbl_data_entry *tbl_data; + + tbl_data = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_JUMP], + handle->rix_jump); + if (!tbl_data) + return 0; + cache_resource = &tbl_data->jump; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "jump table resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + /* jump action memory free is inside the table release. */ + flow_dv_tbl_resource_release(dev, &tbl_data->tbl); + DRV_LOG(DEBUG, "jump table resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release a modify-header resource. + * + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
+ */ +static int +flow_dv_modify_hdr_resource_release(struct mlx5_flow_handle *handle) +{ + struct mlx5_flow_dv_modify_hdr_resource *cache_resource = + handle->dvh.modify_hdr; + + MLX5_ASSERT(cache_resource->verbs_action); + DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->verbs_action)); + LIST_REMOVE(cache_resource, next); + rte_free(cache_resource); + DRV_LOG(DEBUG, "modify-header resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release port ID action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_port_id_action_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_port_id_action_resource *cache_resource; + uint32_t idx = handle->rix_port_id_action; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PORT_ID], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "port ID action resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PORT_ID], + &priv->sh->port_id_action_list, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PORT_ID], idx); + DRV_LOG(DEBUG, "port id action resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release push vlan action resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_push_vlan_action_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t idx = handle->dvh.rix_push_vlan; + struct mlx5_flow_dv_push_vlan_action_resource *cache_resource; + + cache_resource = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], + idx); + if (!cache_resource) + return 0; + MLX5_ASSERT(cache_resource->action); + DRV_LOG(DEBUG, "push VLAN action resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->action)); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], + &priv->sh->push_vlan_action_list, idx, + cache_resource, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_PUSH_VLAN], idx); + DRV_LOG(DEBUG, "push vlan action resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** + * Release the fate resource. + * + * @param dev + * Pointer to Ethernet device. + * @param handle + * Pointer to mlx5_flow_handle. 
+ */ +static void +flow_dv_fate_resource_release(struct rte_eth_dev *dev, + struct mlx5_flow_handle *handle) +{ + if (!handle->rix_fate) + return; + if (handle->fate_action == MLX5_FLOW_FATE_DROP) + mlx5_hrxq_drop_release(dev); + else if (handle->fate_action == MLX5_FLOW_FATE_QUEUE) + mlx5_hrxq_release(dev, handle->rix_hrxq); + else if (handle->fate_action == MLX5_FLOW_FATE_JUMP) + flow_dv_jump_tbl_resource_release(dev, handle); + else if (handle->fate_action == MLX5_FLOW_FATE_PORT_ID) + flow_dv_port_id_action_resource_release(dev, handle); + else + DRV_LOG(DEBUG, "Incorrect fate action:%d", handle->fate_action); + handle->rix_fate = 0; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +__flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_handle *dh; + uint32_t handle_idx; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow) + return; + handle_idx = flow->dev_handles; + while (handle_idx) { + dh = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + handle_idx); + if (!dh) + return; + if (dh->ib_flow) { + claim_zero(mlx5_glue->dv_destroy_flow(dh->ib_flow)); + dh->ib_flow = NULL; + } + if (dh->fate_action == MLX5_FLOW_FATE_DROP || + dh->fate_action == MLX5_FLOW_FATE_QUEUE) + flow_dv_fate_resource_release(dev, dh); + if (dh->vf_vlan.tag && dh->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &dh->vf_vlan); + handle_idx = dh->next.next; + } +} + +/** + * Remove the flow from the NIC and the memory. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +__flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + if (!flow) + return; + __flow_dv_remove(dev, flow); + if (flow->counter) { + flow_dv_counter_release(dev, flow->counter); + flow->counter = 0; + } + if (flow->meter) { + struct mlx5_flow_meter *fm; + + fm = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MTR], + flow->meter); + if (fm) + mlx5_flow_meter_detach(fm); + flow->meter = 0; + } + while (flow->dev_handles) { + uint32_t tmp_idx = flow->dev_handles; + + dev_handle = mlx5_ipool_get(priv->sh->ipool + [MLX5_IPOOL_MLX5_FLOW], tmp_idx); + if (!dev_handle) + return; + flow->dev_handles = dev_handle->next.next; + if (dev_handle->dvh.matcher) + flow_dv_matcher_release(dev, dev_handle); + if (dev_handle->dvh.rix_encap_decap) + flow_dv_encap_decap_resource_release(dev, dev_handle); + if (dev_handle->dvh.modify_hdr) + flow_dv_modify_hdr_resource_release(dev_handle); + if (dev_handle->dvh.rix_push_vlan) + flow_dv_push_vlan_action_resource_release(dev, + dev_handle); + if (dev_handle->dvh.rix_tag) + flow_dv_tag_release(dev, + dev_handle->dvh.rix_tag); + flow_dv_fate_resource_release(dev, dev_handle); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + } +} + +/** + * Query a dv flow rule for its statistics via devx. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the sub flow. + * @param[out] data + * data retrieved by the query. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_dv_query_count(struct rte_eth_dev *dev, struct rte_flow *flow, + void *data, struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_query_count *qc = data; + + if (!priv->config.devx) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not supported"); + if (flow->counter) { + uint64_t pkts, bytes; + struct mlx5_flow_counter *cnt; + + cnt = flow_dv_counter_get_by_idx(dev, flow->counter, + NULL); + int err = _flow_dv_query_count(dev, flow->counter, &pkts, + &bytes); + + if (err) + return rte_flow_error_set(error, -err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot read counters"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = pkts - cnt->hits; + qc->bytes = bytes - cnt->bytes; + if (qc->reset) { + cnt->hits = pkts; + cnt->bytes = bytes; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_dv_query(struct rte_eth_dev *dev, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error __rte_unused) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_dv_query_count(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +/** + * Destroy the meter table set. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] tbl + * Pointer to the meter table set. + * + * @return + * Always 0. 
+ */ +static int +flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *tbl) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtd = + (struct mlx5_meter_domains_infos *)tbl; + + if (!mtd || !priv->config.dv_flow_en) + return 0; + if (mtd->ingress.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->ingress.policer_rules[RTE_MTR_DROPPED])); + if (mtd->egress.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->egress.policer_rules[RTE_MTR_DROPPED])); + if (mtd->transfer.policer_rules[RTE_MTR_DROPPED]) + claim_zero(mlx5_glue->dv_destroy_flow + (mtd->transfer.policer_rules[RTE_MTR_DROPPED])); + if (mtd->egress.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->egress.color_matcher)); + if (mtd->egress.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->egress.any_matcher)); + if (mtd->egress.tbl) + flow_dv_tbl_resource_release(dev, mtd->egress.tbl); + if (mtd->egress.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->egress.sfx_tbl); + if (mtd->ingress.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.color_matcher)); + if (mtd->ingress.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.any_matcher)); + if (mtd->ingress.tbl) + flow_dv_tbl_resource_release(dev, mtd->ingress.tbl); + if (mtd->ingress.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->ingress.sfx_tbl); + if (mtd->transfer.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.color_matcher)); + if (mtd->transfer.any_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.any_matcher)); + if (mtd->transfer.tbl) + flow_dv_tbl_resource_release(dev, mtd->transfer.tbl); + if (mtd->transfer.sfx_tbl) + flow_dv_tbl_resource_release(dev, mtd->transfer.sfx_tbl); + if (mtd->drop_actn) + claim_zero(mlx5_glue->destroy_flow_action(mtd->drop_actn)); + rte_free(mtd); + return 0; +} + +/* Number of meter flow actions, count and jump or count and drop. */ +#define METER_ACTIONS 2 + +/** + * Create specify domain meter table and suffix table. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in,out] mtb + * Pointer to DV meter table set. + * @param[in] egress + * Table attribute. + * @param[in] transfer + * Table attribute. + * @param[in] color_reg_c_idx + * Reg C index for color match. + * + * @return + * 0 on success, -1 otherwise and rte_errno is set. + */ +static int +flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev, + struct mlx5_meter_domains_infos *mtb, + uint8_t egress, uint8_t transfer, + uint32_t color_reg_c_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_match_params mask = { + .size = sizeof(mask.buf), + }; + struct mlx5_flow_dv_match_params value = { + .size = sizeof(value.buf), + }; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .priority = 0, + .match_criteria_enable = 0, + .match_mask = (void *)&mask, + }; + void *actions[METER_ACTIONS]; + struct mlx5_meter_domain_info *dtb; + struct rte_flow_error error; + int i = 0; + + if (transfer) + dtb = &mtb->transfer; + else if (egress) + dtb = &mtb->egress; + else + dtb = &mtb->ingress; + /* Create the meter table with METER level. 
*/ + dtb->tbl = flow_dv_tbl_resource_get(dev, MLX5_FLOW_TABLE_LEVEL_METER, + egress, transfer, &error); + if (!dtb->tbl) { + DRV_LOG(ERR, "Failed to create meter policer table."); + return -1; + } + /* Create the meter suffix table with SUFFIX level. */ + dtb->sfx_tbl = flow_dv_tbl_resource_get(dev, + MLX5_FLOW_TABLE_LEVEL_SUFFIX, + egress, transfer, &error); + if (!dtb->sfx_tbl) { + DRV_LOG(ERR, "Failed to create meter suffix table."); + return -1; + } + /* Create matchers, Any and Color. */ + dv_attr.priority = 3; + dv_attr.match_criteria_enable = 0; + dtb->any_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx, + &dv_attr, + dtb->tbl->obj); + if (!dtb->any_matcher) { + DRV_LOG(ERR, "Failed to create meter" + " policer default matcher."); + goto error_exit; + } + dv_attr.priority = 0; + dv_attr.match_criteria_enable = + 1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + flow_dv_match_meta_reg(mask.buf, value.buf, color_reg_c_idx, + rte_col_2_mlx5_col(RTE_COLORS), UINT8_MAX); + dtb->color_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx, + &dv_attr, + dtb->tbl->obj); + if (!dtb->color_matcher) { + DRV_LOG(ERR, "Failed to create meter policer color matcher."); + goto error_exit; + } + if (mtb->count_actns[RTE_MTR_DROPPED]) + actions[i++] = mtb->count_actns[RTE_MTR_DROPPED]; + actions[i++] = mtb->drop_actn; + /* Default rule: lowest priority, match any, actions: drop. */ + dtb->policer_rules[RTE_MTR_DROPPED] = + mlx5_glue->dv_create_flow(dtb->any_matcher, + (void *)&value, i, actions); + if (!dtb->policer_rules[RTE_MTR_DROPPED]) { + DRV_LOG(ERR, "Failed to create meter policer drop rule."); + goto error_exit; + } + return 0; +error_exit: + return -1; +} + +/** + * Create the needed meter and suffix tables. + * Lock free, (mutex should be acquired by caller). + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to the flow meter. + * + * @return + * Pointer to table set on success, NULL otherwise and rte_errno is set. + */ +static struct mlx5_meter_domains_infos * +flow_dv_create_mtr_tbl(struct rte_eth_dev *dev, + const struct mlx5_flow_meter *fm) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtb; + int ret; + int i; + + if (!priv->mtr_en) { + rte_errno = ENOTSUP; + return NULL; + } + mtb = rte_calloc(__func__, 1, sizeof(*mtb), 0); + if (!mtb) { + DRV_LOG(ERR, "Failed to allocate memory for meter."); + return NULL; + } + /* Create meter count actions */ + for (i = 0; i <= RTE_MTR_DROPPED; i++) { + struct mlx5_flow_counter *cnt; + if (!fm->policer_stats.cnt[i]) + continue; + cnt = flow_dv_counter_get_by_idx(dev, + fm->policer_stats.cnt[i], NULL); + mtb->count_actns[i] = cnt->action; + } + /* Create drop action. */ + mtb->drop_actn = mlx5_glue->dr_create_flow_action_drop(); + if (!mtb->drop_actn) { + DRV_LOG(ERR, "Failed to create drop action."); + goto error_exit; + } + /* Egress meter table. */ + ret = flow_dv_prepare_mtr_tables(dev, mtb, 1, 0, priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare egress meter table."); + goto error_exit; + } + /* Ingress meter table. */ + ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 0, priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare ingress meter table."); + goto error_exit; + } + /* FDB meter table. 
*/ + if (priv->config.dv_esw_en) { + ret = flow_dv_prepare_mtr_tables(dev, mtb, 0, 1, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to prepare fdb meter table."); + goto error_exit; + } + } + return mtb; +error_exit: + flow_dv_destroy_mtr_tbl(dev, mtb); + return NULL; +} + +/** + * Destroy domain policer rule. + * + * @param[in] dt + * Pointer to domain table. + */ +static void +flow_dv_destroy_domain_policer_rule(struct mlx5_meter_domain_info *dt) +{ + int i; + + for (i = 0; i < RTE_MTR_DROPPED; i++) { + if (dt->policer_rules[i]) { + claim_zero(mlx5_glue->dv_destroy_flow + (dt->policer_rules[i])); + dt->policer_rules[i] = NULL; + } + } + if (dt->jump_actn) { + claim_zero(mlx5_glue->destroy_flow_action(dt->jump_actn)); + dt->jump_actn = NULL; + } +} + +/** + * Destroy policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * Always 0. + */ +static int +flow_dv_destroy_policer_rules(struct rte_eth_dev *dev __rte_unused, + const struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + struct mlx5_meter_domains_infos *mtb = fm ? fm->mfts : NULL; + + if (!mtb) + return 0; + if (attr->egress) + flow_dv_destroy_domain_policer_rule(&mtb->egress); + if (attr->ingress) + flow_dv_destroy_domain_policer_rule(&mtb->ingress); + if (attr->transfer) + flow_dv_destroy_domain_policer_rule(&mtb->transfer); + return 0; +} + +/** + * Create specify domain meter policer rule. + * + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] mtb + * Pointer to DV meter table set. + * @param[in] mtr_reg_c + * Color match REG_C. + * + * @return + * 0 on success, -1 otherwise. + */ +static int +flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm, + struct mlx5_meter_domain_info *dtb, + uint8_t mtr_reg_c) +{ + struct mlx5_flow_dv_match_params matcher = { + .size = sizeof(matcher.buf), + }; + struct mlx5_flow_dv_match_params value = { + .size = sizeof(value.buf), + }; + struct mlx5_meter_domains_infos *mtb = fm->mfts; + void *actions[METER_ACTIONS]; + int i; + + /* Create jump action. */ + if (!dtb->jump_actn) + dtb->jump_actn = + mlx5_glue->dr_create_flow_action_dest_flow_tbl + (dtb->sfx_tbl->obj); + if (!dtb->jump_actn) { + DRV_LOG(ERR, "Failed to create policer jump action."); + goto error; + } + for (i = 0; i < RTE_MTR_DROPPED; i++) { + int j = 0; + + flow_dv_match_meta_reg(matcher.buf, value.buf, mtr_reg_c, + rte_col_2_mlx5_col(i), UINT8_MAX); + if (mtb->count_actns[i]) + actions[j++] = mtb->count_actns[i]; + if (fm->action[i] == MTR_POLICER_ACTION_DROP) + actions[j++] = mtb->drop_actn; + else + actions[j++] = dtb->jump_actn; + dtb->policer_rules[i] = + mlx5_glue->dv_create_flow(dtb->color_matcher, + (void *)&value, + j, actions); + if (!dtb->policer_rules[i]) { + DRV_LOG(ERR, "Failed to create policer rule."); + goto error; + } + } + return 0; +error: + rte_errno = errno; + return -1; +} + +/** + * Create policer rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] fm + * Pointer to flow meter structure. + * @param[in] attr + * Pointer to flow attributes. + * + * @return + * 0 on success, -1 otherwise. 
+ */ +static int +flow_dv_create_policer_rules(struct rte_eth_dev *dev, + struct mlx5_flow_meter *fm, + const struct rte_flow_attr *attr) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_meter_domains_infos *mtb = fm->mfts; + int ret; + + if (attr->egress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->egress, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create egress policer."); + goto error; + } + } + if (attr->ingress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->ingress, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create ingress policer."); + goto error; + } + } + if (attr->transfer) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->transfer, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create transfer policer."); + goto error; + } + } + return 0; +error: + flow_dv_destroy_policer_rules(dev, fm, attr); + return -1; +} + +/** + * Query a devx counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] cnt + * Index to the flow counter. + * @param[in] clear + * Set to clear the counter statistics. + * @param[out] pkts + * The statistics value of packets. + * @param[out] bytes + * The statistics value of bytes. + * + * @return + * 0 on success, otherwise return -1. + */ +static int +flow_dv_counter_query(struct rte_eth_dev *dev, uint32_t counter, bool clear, + uint64_t *pkts, uint64_t *bytes) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_counter *cnt; + uint64_t inn_pkts, inn_bytes; + int ret; + + if (!priv->config.devx) + return -1; + + ret = _flow_dv_query_count(dev, counter, &inn_pkts, &inn_bytes); + if (ret) + return -1; + cnt = flow_dv_counter_get_by_idx(dev, counter, NULL); + *pkts = inn_pkts - cnt->hits; + *bytes = inn_bytes - cnt->bytes; + if (clear) { + cnt->hits = inn_pkts; + cnt->bytes = inn_bytes; + } + return 0; +} + +/** + * Get aged-out flows. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] context + * The address of an array of pointers to the aged-out flows contexts. + * @param[in] nb_contexts + * The length of context array pointers. + * @param[out] error + * Perform verbose error reporting if not NULL. Initialized in case of + * error only. + * + * @return + * how many contexts get in success, otherwise negative errno value. + * if nb_contexts is 0, return the amount of all aged contexts. + * if nb_contexts is not 0 , return the amount of aged flows reported + * in the context array. 
+ * @note: only stub for now + */ +static int +flow_get_aged_flows(struct rte_eth_dev *dev, + void **context, + uint32_t nb_contexts, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_age_info *age_info; + struct mlx5_age_param *age_param; + struct mlx5_flow_counter *counter; + int nb_flows = 0; + + if (nb_contexts && !context) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "Should assign at least one flow or" + " context to get if nb_contexts != 0"); + age_info = GET_PORT_AGE_INFO(priv); + rte_spinlock_lock(&age_info->aged_sl); + TAILQ_FOREACH(counter, &age_info->aged_counters, next) { + nb_flows++; + if (nb_contexts) { + age_param = MLX5_CNT_TO_AGE(counter); + context[nb_flows - 1] = age_param->context; + if (!(--nb_contexts)) + break; + } + } + rte_spinlock_unlock(&age_info->aged_sl); + MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER); + return nb_flows; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_translate(). + */ +static int +flow_dv_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + int ret; + + flow_dv_shared_lock(dev); + ret = __flow_dv_translate(dev, dev_flow, attr, items, actions, error); + flow_dv_shared_unlock(dev); + return ret; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_apply(). + */ +static int +flow_dv_apply(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + int ret; + + flow_dv_shared_lock(dev); + ret = __flow_dv_apply(dev, flow, error); + flow_dv_shared_unlock(dev); + return ret; +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_remove(). + */ +static void +flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + flow_dv_shared_lock(dev); + __flow_dv_remove(dev, flow); + flow_dv_shared_unlock(dev); +} + +/* + * Mutex-protected thunk to lock-free __flow_dv_destroy(). + */ +static void +flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + flow_dv_shared_lock(dev); + __flow_dv_destroy(dev, flow); + flow_dv_shared_unlock(dev); +} + +/* + * Mutex-protected thunk to lock-free flow_dv_counter_alloc(). + */ +static uint32_t +flow_dv_counter_allocate(struct rte_eth_dev *dev) +{ + uint32_t cnt; + + flow_dv_shared_lock(dev); + cnt = flow_dv_counter_alloc(dev, 0, 0, 1, 0); + flow_dv_shared_unlock(dev); + return cnt; +} + +/* + * Mutex-protected thunk to lock-free flow_dv_counter_release(). 
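+ * Like the translate/apply/remove/destroy thunks above, this only wraps the
+ * lock-free implementation with flow_dv_shared_lock()/flow_dv_shared_unlock()
+ * so that concurrent users of the shared device context do not modify DV
+ * resources at the same time.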
+ */ +static void +flow_dv_counter_free(struct rte_eth_dev *dev, uint32_t cnt) +{ + flow_dv_shared_lock(dev); + flow_dv_counter_release(dev, cnt); + flow_dv_shared_unlock(dev); +} + +const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops = { + .validate = flow_dv_validate, + .prepare = flow_dv_prepare, + .translate = flow_dv_translate, + .apply = flow_dv_apply, + .remove = flow_dv_remove, + .destroy = flow_dv_destroy, + .query = flow_dv_query, + .create_mtr_tbls = flow_dv_create_mtr_tbl, + .destroy_mtr_tbls = flow_dv_destroy_mtr_tbl, + .create_policer_rules = flow_dv_create_policer_rules, + .destroy_policer_rules = flow_dv_destroy_policer_rules, + .counter_alloc = flow_dv_counter_allocate, + .counter_free = flow_dv_counter_free, + .counter_query = flow_dv_counter_query, + .get_aged_flows = flow_get_aged_flows, +}; + +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c new file mode 100644 index 000000000..08f7dc8d1 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_meter.c @@ -0,0 +1,1292 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Copyright 2018 Mellanox Technologies, Ltd + */ +#include <math.h> + +#include <rte_tailq.h> +#include <rte_malloc.h> +#include <rte_mtr.h> +#include <rte_mtr_driver.h> + +#include <mlx5_devx_cmds.h> + +#include "mlx5.h" +#include "mlx5_flow.h" + +/** + * Create the meter action. + * + * @param priv + * Pointer to mlx5_priv. + * @param[in] fm + * Pointer to flow meter to be converted. + * + * @return + * Pointer to the meter action on success, NULL otherwise. + */ +static void * +mlx5_flow_meter_action_create(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm) +{ +#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER + struct mlx5dv_dr_flow_meter_attr mtr_init; + void *attr = fm->mfts->fmp; + struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm = + &fm->profile->srtcm_prm; + + fm->mfts->fmp_size = MLX5_ST_SZ_BYTES(flow_meter_parameters); + memset(attr, 0, fm->mfts->fmp_size); + MLX5_SET(flow_meter_parameters, attr, valid, 1); + MLX5_SET(flow_meter_parameters, attr, bucket_overflow, 1); + MLX5_SET(flow_meter_parameters, attr, + start_color, MLX5_FLOW_COLOR_GREEN); + MLX5_SET(flow_meter_parameters, attr, both_buckets_on_green, 0); + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + mtr_init.next_table = + fm->transfer ? fm->mfts->transfer.tbl->obj : + fm->egress ? fm->mfts->egress.tbl->obj : + fm->mfts->ingress.tbl->obj; + mtr_init.reg_c_index = priv->mtr_color_reg - REG_C_0; + mtr_init.flow_meter_parameter = fm->mfts->fmp; + mtr_init.flow_meter_parameter_sz = fm->mfts->fmp_size; + mtr_init.active = fm->active_state; + return mlx5_glue->dv_create_flow_action_meter(&mtr_init); +#else + (void)priv; + (void)fm; + return NULL; +#endif +} + +/** + * Find meter profile by id. + * + * @param priv + * Pointer to mlx5_priv. + * @param meter_profile_id + * Meter profile id. + * + * @return + * Pointer to the profile found on success, NULL otherwise. 
+ */ +static struct mlx5_flow_meter_profile * +mlx5_flow_meter_profile_find(struct mlx5_priv *priv, uint32_t meter_profile_id) +{ + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + + TAILQ_FOREACH(fmp, fmps, next) + if (meter_profile_id == fmp->meter_profile_id) + return fmp; + return NULL; +} + +/** + * Validate the MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_profile_id + * Meter profile id. + * @param[in] profile + * Pointer to meter profile detail. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_validate(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + + /* Profile must not be NULL. */ + if (profile == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Meter profile is null."); + /* Meter profile ID must be valid. */ + if (meter_profile_id == UINT32_MAX) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile id not valid."); + /* Meter profile must not exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp) + return -rte_mtr_error_set(error, EEXIST, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, + "Meter profile already exists."); + if (profile->alg == RTE_MTR_SRTCM_RFC2697) { + if (priv->config.hca_attr.qos.srtcm_sup) { + /* Verify support for flow meter parameters. */ + if (profile->srtcm_rfc2697.cir > 0 && + profile->srtcm_rfc2697.cir <= MLX5_SRTCM_CIR_MAX && + profile->srtcm_rfc2697.cbs > 0 && + profile->srtcm_rfc2697.cbs <= MLX5_SRTCM_CBS_MAX && + profile->srtcm_rfc2697.ebs <= MLX5_SRTCM_EBS_MAX) + return 0; + else + return -rte_mtr_error_set + (error, ENOTSUP, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + profile->srtcm_rfc2697.ebs ? + "Metering value ebs must be 0." : + "Invalid metering parameters."); + } + } + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Metering algorithm not supported."); +} + +/** + * Calculate mantissa and exponent for cir. + * + * @param[in] cir + * Value to be calculated. + * @param[out] man + * Pointer to the mantissa. + * @param[out] exp + * Pointer to the exp. + */ +static void +mlx5_flow_meter_cir_man_exp_calc(int64_t cir, uint8_t *man, uint8_t *exp) +{ + int64_t _cir; + int64_t delta = INT64_MAX; + uint8_t _man = 0; + uint8_t _exp = 0; + uint64_t m, e; + + for (m = 0; m <= 0xFF; m++) { /* man width 8 bit */ + for (e = 0; e <= 0x1F; e++) { /* exp width 5bit */ + _cir = (1000000000ULL * m) >> e; + if (llabs(cir - _cir) <= delta) { + delta = llabs(cir - _cir); + _man = m; + _exp = e; + } + } + } + *man = _man; + *exp = _exp; +} + +/** + * Calculate mantissa and exponent for xbs. + * + * @param[in] xbs + * Value to be calculated. + * @param[out] man + * Pointer to the mantissa. + * @param[out] exp + * Pointer to the exp. + */ +static void +mlx5_flow_meter_xbs_man_exp_calc(uint64_t xbs, uint8_t *man, uint8_t *exp) +{ + int _exp; + double _man; + + /* Special case xbs == 0 ? both exp and matissa are 0. 
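+	 * Otherwise xbs is decomposed with frexp() and rescaled to an integer
+	 * mantissa. As a worked example (assuming MLX5_MAN_WIDTH == 8, in line
+	 * with the 8-bit mantissa used elsewhere in this file): xbs = 49152
+	 * gives frexp() = 0.75 * 2^16, hence man = 0.75 * 256 = 192 and
+	 * exp = 16 - 8 = 8, and indeed 192 * 2^8 = 49152. ceil() rounds the
+	 * mantissa up, so the programmed burst size is never below the
+	 * requested one.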
*/ + if (xbs == 0) { + *man = 0; + *exp = 0; + return; + } + /* xbs = xbs_mantissa * 2^xbs_exponent */ + _man = frexp(xbs, &_exp); + _man = _man * pow(2, MLX5_MAN_WIDTH); + _exp = _exp - MLX5_MAN_WIDTH; + *man = (uint8_t)ceil(_man); + *exp = _exp; +} + +/** + * Fill the prm meter parameter. + * + * @param[in,out] fmp + * Pointer to meter profie to be converted. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_param_fill(struct mlx5_flow_meter_profile *fmp, + struct rte_mtr_error *error) +{ + struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm = &fmp->srtcm_prm; + uint8_t man, exp; + + if (fmp->profile.alg != RTE_MTR_SRTCM_RFC2697) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + NULL, "Metering algorithm not supported."); + /* cbs = cbs_mantissa * 2^cbs_exponent */ + mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.cbs, + &man, &exp); + srtcm->cbs_mantissa = man; + srtcm->cbs_exponent = exp; + /* Check if cbs mantissa is too large. */ + if (srtcm->cbs_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter cbs is" + " invalid."); + /* ebs = ebs_mantissa * 2^ebs_exponent */ + mlx5_flow_meter_xbs_man_exp_calc(fmp->profile.srtcm_rfc2697.ebs, + &man, &exp); + srtcm->ebs_mantissa = man; + srtcm->ebs_exponent = exp; + /* Check if ebs mantissa is too large. */ + if (srtcm->ebs_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter ebs is" + " invalid."); + /* cir = 8G * cir_mantissa * 1/(2^cir_exponent)) Bytes/Sec */ + mlx5_flow_meter_cir_man_exp_calc(fmp->profile.srtcm_rfc2697.cir, + &man, &exp); + srtcm->cir_mantissa = man; + srtcm->cir_exponent = exp; + /* Check if cir mantissa is too large. */ + if (srtcm->cir_exponent != exp) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, NULL, + "Metering profile parameter cir is" + " invalid."); + return 0; +} + +/** + * Callback to get MTR capabilities. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] cap + * Pointer to save MTR capabilities. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_mtr_cap_get(struct rte_eth_dev *dev, + struct rte_mtr_capabilities *cap, + struct rte_mtr_error *error __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hca_qos_attr *qattr = &priv->config.hca_attr.qos; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + memset(cap, 0, sizeof(*cap)); + cap->n_max = 1 << qattr->log_max_flow_meter; + cap->n_shared_max = cap->n_max; + cap->identical = 1; + cap->shared_identical = 1; + cap->shared_n_flows_per_mtr_max = 4 << 20; + /* 2M flows can share the same meter. */ + cap->chaining_n_mtrs_per_flow_max = 1; /* Chaining is not supported. */ + cap->meter_srtcm_rfc2697_n_max = qattr->srtcm_sup ? cap->n_max : 0; + cap->meter_rate_max = 1ULL << 40; /* 1 Tera tokens per sec. */ + cap->policer_action_drop_supported = 1; + cap->stats_mask = RTE_MTR_STATS_N_BYTES_DROPPED | + RTE_MTR_STATS_N_PKTS_DROPPED; + return 0; +} + +/** + * Callback to add MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. 
+ * @param[in] meter_profile_id + * Meter profile id. + * @param[in] profile + * Pointer to meter profile detail. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_add(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Check input params. */ + ret = mlx5_flow_meter_profile_validate(dev, meter_profile_id, + profile, error); + if (ret) + return ret; + /* Meter profile memory allocation. */ + fmp = rte_calloc(__func__, 1, sizeof(struct mlx5_flow_meter_profile), + RTE_CACHE_LINE_SIZE); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOMEM, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Meter profile memory " + "alloc failed."); + /* Fill profile info. */ + fmp->meter_profile_id = meter_profile_id; + fmp->profile = *profile; + /* Fill the flow meter parameters for the PRM. */ + ret = mlx5_flow_meter_param_fill(fmp, error); + if (ret) + goto error; + /* Add to list. */ + TAILQ_INSERT_TAIL(fmps, fmp, next); + return 0; +error: + rte_free(fmp); + return ret; +} + +/** + * Callback to delete MTR profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_profile_id + * Meter profile id. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_delete(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + &meter_profile_id, + "Meter profile id invalid."); + /* Check profile is unused. */ + if (fmp->ref_cnt) + return -rte_mtr_error_set(error, EBUSY, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile in use."); + /* Remove from list. */ + TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next); + rte_free(fmp); + return 0; +} + +/** + * Convert wrong color setting action to verbose error. + * + * @param[in] action + * Policy color action. + * + * @return + * Verbose meter color error type. + */ +static inline enum rte_mtr_error_type +action2error(enum rte_mtr_policer_action action) +{ + switch (action) { + case MTR_POLICER_ACTION_COLOR_GREEN: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_GREEN; + case MTR_POLICER_ACTION_COLOR_YELLOW: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_YELLOW; + case MTR_POLICER_ACTION_COLOR_RED: + return RTE_MTR_ERROR_TYPE_POLICER_ACTION_RED; + default: + break; + } + return RTE_MTR_ERROR_TYPE_UNSPECIFIED; +} + +/** + * Check meter validation. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] meter_id + * Meter id. + * @param[in] params + * Pointer to rte meter parameters. 
+ * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_validate(struct mlx5_priv *priv, uint32_t meter_id, + struct rte_mtr_params *params, + struct rte_mtr_error *error) +{ + static enum rte_mtr_policer_action + valid_recol_action[RTE_COLORS] = { + MTR_POLICER_ACTION_COLOR_GREEN, + MTR_POLICER_ACTION_COLOR_YELLOW, + MTR_POLICER_ACTION_COLOR_RED }; + int i; + + /* Meter params must not be NULL. */ + if (params == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, "Meter object params null."); + /* Previous meter color is not supported. */ + if (params->use_prev_mtr_color) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + "Previous meter color " + "not supported."); + /* Validate policer settings. */ + for (i = 0; i < RTE_COLORS; i++) + if (params->action[i] != valid_recol_action[i] && + params->action[i] != MTR_POLICER_ACTION_DROP) + return -rte_mtr_error_set + (error, ENOTSUP, + action2error(params->action[i]), NULL, + "Recolor action not supported."); + /* Validate meter id. */ + if (mlx5_flow_meter_find(priv, meter_id)) + return -rte_mtr_error_set(error, EEXIST, + RTE_MTR_ERROR_TYPE_MTR_ID, NULL, + "Meter object already exists."); + return 0; +} + +/** + * Modify the flow meter action. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] fm + * Pointer to flow meter to be modified. + * @param[in] srtcm + * Pointer to meter srtcm description parameter. + * @param[in] modify_bits + * The bit in srtcm to be updated. + * @param[in] active_state + * The state to be updated. + * @return + * 0 on success, o negative value otherwise. + */ +static int +mlx5_flow_meter_action_modify(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm, + const struct mlx5_flow_meter_srtcm_rfc2697_prm *srtcm, + uint64_t modify_bits, uint32_t active_state) +{ +#ifdef HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER + uint32_t in[MLX5_ST_SZ_DW(flow_meter_parameters)] = { 0 }; + uint32_t *attr; + struct mlx5dv_dr_flow_meter_attr mod_attr = { 0 }; + int ret; + + /* Fill command parameters. */ + mod_attr.reg_c_index = priv->mtr_color_reg - REG_C_0; + mod_attr.flow_meter_parameter = in; + mod_attr.flow_meter_parameter_sz = fm->mfts->fmp_size; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE) + mod_attr.active = !!active_state; + else + mod_attr.active = 0; + attr = in; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) { + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) { + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) { + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + } + /* Apply modifications to meter only if it was created. */ + if (fm->mfts->meter_action) { + ret = mlx5_glue->dv_modify_flow_action_meter + (fm->mfts->meter_action, &mod_attr, + rte_cpu_to_be_64(modify_bits)); + if (ret) + return ret; + } + /* Update succeedded modify meter parameters. 
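+	 * The same fields are mirrored into the cached parameter block
+	 * fm->mfts->fmp below, so the software copy used when the meter
+	 * action is (re)created stays consistent with what was pushed to the
+	 * hardware object.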
*/ + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_ACTIVE) + fm->active_state = !!active_state; + attr = fm->mfts->fmp; + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS) { + MLX5_SET(flow_meter_parameters, + attr, cbs_exponent, srtcm->cbs_exponent); + MLX5_SET(flow_meter_parameters, + attr, cbs_mantissa, srtcm->cbs_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR) { + MLX5_SET(flow_meter_parameters, + attr, cir_exponent, srtcm->cir_exponent); + MLX5_SET(flow_meter_parameters, + attr, cir_mantissa, srtcm->cir_mantissa); + } + if (modify_bits & MLX5_FLOW_METER_OBJ_MODIFY_FIELD_EBS) { + MLX5_SET(flow_meter_parameters, + attr, ebs_exponent, srtcm->ebs_exponent); + MLX5_SET(flow_meter_parameters, + attr, ebs_mantissa, srtcm->ebs_mantissa); + } + + return 0; +#else + (void)priv; + (void)fm; + (void)srtcm; + (void)modify_bits; + (void)active_state; + return -ENOTSUP; +#endif +} + +/** + * Create meter rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] params + * Pointer to rte meter parameters. + * @param[in] shared + * Meter shared with other flow or not. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_create(struct rte_eth_dev *dev, uint32_t meter_id, + struct rte_mtr_params *params, int shared, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + int ret; + unsigned int i; + uint32_t idx = 0; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Validate the parameters. */ + ret = mlx5_flow_meter_validate(priv, meter_id, params, error); + if (ret) + return ret; + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, params->meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile id not valid."); + /* Allocate the flow meter memory. */ + fm = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MTR], &idx); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOMEM, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Memory alloc failed for meter."); + fm->idx = idx; + /* Fill the flow meter parameters. */ + fm->meter_id = meter_id; + fm->profile = fmp; + memcpy(fm->action, params->action, sizeof(params->action)); + fm->stats_mask = params->stats_mask; + + /* Alloc policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) { + fm->policer_stats.cnt[i] = mlx5_counter_alloc(dev); + if (!fm->policer_stats.cnt[i]) + goto error; + } + fm->mfts = mlx5_flow_create_mtr_tbls(dev, fm); + if (!fm->mfts) + goto error; + ret = mlx5_flow_create_policer_rules(dev, fm, &attr); + if (ret) + goto error; + /* Add to the flow meter list. */ + TAILQ_INSERT_TAIL(fms, fm, next); + fm->active_state = 1; /* Config meter starts as active. */ + fm->shared = !!shared; + fm->policer_stats.stats_mask = params->stats_mask; + fm->profile->ref_cnt++; + return 0; +error: + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + /* Free policer counters. 
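+	 * Together with the destroy calls just above, this unwinds everything
+	 * allocated so far (rules, tables, counters) before the ipool entry is
+	 * released and the error is reported.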
*/ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, fm->policer_stats.cnt[i]); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], idx); + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Failed to create devx meter."); +} + +/** + * Destroy meter rules. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_destroy(struct rte_eth_dev *dev, uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + unsigned int i; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + /* Meter object must not have any owner. */ + if (fm->ref_cnt > 0) + return -rte_mtr_error_set(error, EBUSY, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, "Meter object is being used."); + /* Get the meter profile. */ + fmp = fm->profile; + MLX5_ASSERT(fmp); + /* Update dependencies. */ + fmp->ref_cnt--; + /* Remove from the flow meter list. */ + TAILQ_REMOVE(fms, fm, next); + /* Free policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, fm->policer_stats.cnt[i]); + /* Free meter flow table */ + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx); + return 0; +} + +/** + * Modify meter state. + * + * @param[in] priv + * Pointer to mlx5 private data structure. + * @param[in] fm + * Pointer to flow meter. + * @param[in] new_state + * New state to update. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_modify_state(struct mlx5_priv *priv, + struct mlx5_flow_meter *fm, + uint32_t new_state, + struct rte_mtr_error *error) +{ + static const struct mlx5_flow_meter_srtcm_rfc2697_prm srtcm = { + .cbs_exponent = 20, + .cbs_mantissa = 191, + .cir_exponent = 0, + .cir_mantissa = 200, + .ebs_exponent = 0, + .ebs_mantissa = 0, + }; + uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS | + MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR; + int ret; + + if (new_state == MLX5_FLOW_METER_DISABLE) + ret = mlx5_flow_meter_action_modify(priv, fm, &srtcm, + modify_bits, 0); + else + ret = mlx5_flow_meter_action_modify(priv, fm, + &fm->profile->srtcm_prm, + modify_bits, 0); + if (ret) + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, + new_state ? + "Failed to enable meter." : + "Failed to disable meter."); + return 0; +} + +/** + * Callback to enable flow meter. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. 
+ * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_enable(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + if (fm->active_state == MLX5_FLOW_METER_ENABLE) + return 0; + ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_ENABLE, + error); + if (!ret) + fm->active_state = MLX5_FLOW_METER_ENABLE; + return ret; +} + +/** + * Callback to disable flow meter. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_disable(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + if (fm->active_state == MLX5_FLOW_METER_DISABLE) + return 0; + ret = mlx5_flow_meter_modify_state(priv, fm, MLX5_FLOW_METER_DISABLE, + error); + if (!ret) + fm->active_state = MLX5_FLOW_METER_DISABLE; + return ret; +} + +/** + * Callback to update meter profile. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] meter_profile_id + * To be updated meter profile id. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_profile_update(struct rte_eth_dev *dev, + uint32_t meter_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter_profile *old_fmp; + struct mlx5_flow_meter *fm; + uint64_t modify_bits = MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CBS | + MLX5_FLOW_METER_OBJ_MODIFY_FIELD_CIR; + int ret; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter profile must exist. */ + fmp = mlx5_flow_meter_profile_find(priv, meter_profile_id); + if (fmp == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "Meter profile not found."); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter not found."); + /* MTR object already set to meter profile id. */ + old_fmp = fm->profile; + if (fmp == old_fmp) + return 0; + /* Update the profile. */ + fm->profile = fmp; + /* Update meter params in HW (if not disabled). 
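+	 * Only the CBS/CIR fields selected by modify_bits are pushed. On
+	 * failure fm->profile is rolled back to the old profile; the profile
+	 * reference counters are only moved once the hardware update has
+	 * succeeded.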
*/ + if (fm->active_state == MLX5_FLOW_METER_DISABLE) + return 0; + ret = mlx5_flow_meter_action_modify(priv, fm, &fm->profile->srtcm_prm, + modify_bits, fm->active_state); + if (ret) { + fm->profile = old_fmp; + return -rte_mtr_error_set(error, -ret, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + NULL, "Failed to update meter" + " parmeters in hardware."); + } + old_fmp->ref_cnt--; + fmp->ref_cnt++; + return 0; +} + +/** + * Callback to update meter stats mask. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[in] stats_mask + * To be updated stats_mask. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_stats_update(struct rte_eth_dev *dev, + uint32_t meter_id, + uint64_t stats_mask, + struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + fm->policer_stats.stats_mask = stats_mask; + return 0; +} + +/** + * Callback to read meter statistics. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] meter_id + * Meter id. + * @param[out] stats + * Pointer to store the statistics. + * @param[out] stats_mask + * Pointer to store the stats_mask. + * @param[in] clear + * Statistic to be cleared after read or not. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_meter_stats_read(struct rte_eth_dev *dev, + uint32_t meter_id, + struct rte_mtr_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_mtr_error *error) +{ + static uint64_t meter2mask[RTE_MTR_DROPPED + 1] = { + RTE_MTR_STATS_N_PKTS_GREEN | RTE_MTR_STATS_N_BYTES_GREEN, + RTE_MTR_STATS_N_PKTS_YELLOW | RTE_MTR_STATS_N_BYTES_YELLOW, + RTE_MTR_STATS_N_PKTS_RED | RTE_MTR_STATS_N_BYTES_RED, + RTE_MTR_STATS_N_PKTS_DROPPED | RTE_MTR_STATS_N_BYTES_DROPPED + }; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meter *fm; + struct mlx5_flow_policer_stats *ps; + uint64_t pkts_dropped = 0; + uint64_t bytes_dropped = 0; + uint64_t pkts; + uint64_t bytes; + int i; + int ret = 0; + + if (!priv->mtr_en) + return -rte_mtr_error_set(error, ENOTSUP, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter is not support"); + /* Meter object must exist. */ + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) + return -rte_mtr_error_set(error, ENOENT, + RTE_MTR_ERROR_TYPE_MTR_ID, + NULL, "Meter object id not valid."); + ps = &fm->policer_stats; + *stats_mask = ps->stats_mask; + for (i = 0; i < RTE_MTR_DROPPED; i++) { + if (*stats_mask & meter2mask[i]) { + ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts, + &bytes); + if (ret) + goto error; + if (fm->action[i] == MTR_POLICER_ACTION_DROP) { + pkts_dropped += pkts; + bytes_dropped += bytes; + } + /* If need to read the packets, set it. */ + if ((1 << i) & (*stats_mask & meter2mask[i])) + stats->n_pkts[i] = pkts; + /* If need to read the bytes, set it. 
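+			 * Assuming the standard rte_mtr stats mask layout,
+			 * where the per-color packet counters occupy bits
+			 * 0..RTE_MTR_DROPPED and the byte counters follow
+			 * them, (1 << i) above selects the packet statistic
+			 * and (1 << (RTE_MTR_DROPPED + 1 + i)) here selects
+			 * the matching byte statistic for color i.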
*/ + if ((1 << (RTE_MTR_DROPPED + 1 + i)) & + (*stats_mask & meter2mask[i])) + stats->n_bytes[i] = bytes; + } + } + /* Dropped packets/bytes are treated differently. */ + if (*stats_mask & meter2mask[i]) { + ret = mlx5_counter_query(dev, ps->cnt[i], clear, &pkts, + &bytes); + if (ret) + goto error; + pkts += pkts_dropped; + bytes += bytes_dropped; + /* If need to read the packets, set it. */ + if ((*stats_mask & meter2mask[i]) & + RTE_MTR_STATS_N_PKTS_DROPPED) + stats->n_pkts_dropped = pkts; + /* If need to read the bytes, set it. */ + if ((*stats_mask & meter2mask[i]) & + RTE_MTR_STATS_N_BYTES_DROPPED) + stats->n_bytes_dropped = bytes; + } + return 0; +error: + return -rte_mtr_error_set(error, ret, RTE_MTR_ERROR_TYPE_STATS, NULL, + "Failed to read policer counters."); +} + +static const struct rte_mtr_ops mlx5_flow_mtr_ops = { + .capabilities_get = mlx5_flow_mtr_cap_get, + .meter_profile_add = mlx5_flow_meter_profile_add, + .meter_profile_delete = mlx5_flow_meter_profile_delete, + .create = mlx5_flow_meter_create, + .destroy = mlx5_flow_meter_destroy, + .meter_enable = mlx5_flow_meter_enable, + .meter_disable = mlx5_flow_meter_disable, + .meter_profile_update = mlx5_flow_meter_profile_update, + .meter_dscp_table_update = NULL, + .policer_actions_update = NULL, + .stats_update = mlx5_flow_meter_stats_update, + .stats_read = mlx5_flow_meter_stats_read, +}; + +/** + * Get meter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param arg + * Pointer to set the mtr operations. + * + * @return + * Always 0. + */ +int +mlx5_flow_meter_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg) +{ + *(const struct rte_mtr_ops **)arg = &mlx5_flow_mtr_ops; + return 0; +} + +/** + * Find meter by id. + * + * @param priv + * Pointer to mlx5_priv. + * @param meter_id + * Meter id. + * + * @return + * Pointer to the profile found on success, NULL otherwise. + */ +struct mlx5_flow_meter * +mlx5_flow_meter_find(struct mlx5_priv *priv, uint32_t meter_id) +{ + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_flow_meter *fm; + + TAILQ_FOREACH(fm, fms, next) + if (meter_id == fm->meter_id) + return fm; + return NULL; +} + +/** + * Attach meter to flow. + * Unidirectional Meter creation can only be done + * when flow direction is known, i.e. when calling meter_attach. + * + * @param [in] priv + * Pointer to mlx5 private data. + * @param [in] meter_id + * Flow meter id. + * @param [in] attr + * Pointer to flow attributes. + * @param [out] error + * Pointer to error structure. + * + * @return the flow meter pointer, NULL otherwise. + */ +struct mlx5_flow_meter * +mlx5_flow_meter_attach(struct mlx5_priv *priv, uint32_t meter_id, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct mlx5_flow_meter *fm; + + fm = mlx5_flow_meter_find(priv, meter_id); + if (fm == NULL) { + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter object id not valid"); + goto error; + } + if (!fm->shared && fm->ref_cnt) { + DRV_LOG(ERR, "Cannot share a non-shared meter."); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Meter can't be shared"); + goto error; + } + if (!fm->ref_cnt++) { + MLX5_ASSERT(!fm->mfts->meter_action); + fm->ingress = attr->ingress; + fm->egress = attr->egress; + fm->transfer = attr->transfer; + /* This also creates the meter object. 
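+		 * The meter action is created lazily on the first attach
+		 * because its destination table depends on the flow direction
+		 * recorded just above; subsequent attaches must use the same
+		 * ingress/egress/transfer attributes, which the else branch
+		 * below enforces.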
*/ + fm->mfts->meter_action = mlx5_flow_meter_action_create(priv, + fm); + if (!fm->mfts->meter_action) + goto error_detach; + } else { + MLX5_ASSERT(fm->mfts->meter_action); + if (attr->transfer != fm->transfer || + attr->ingress != fm->ingress || + attr->egress != fm->egress) { + DRV_LOG(ERR, "meter I/O attributes do not " + "match flow I/O attributes."); + goto error_detach; + } + } + return fm; +error_detach: + mlx5_flow_meter_detach(fm); + rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + fm->mfts->meter_action ? "Meter attr not match" : + "Meter action create failed"); +error: + return NULL; +} + +/** + * Detach meter from flow. + * + * @param [in] fm + * Pointer to flow meter. + */ +void +mlx5_flow_meter_detach(struct mlx5_flow_meter *fm) +{ + MLX5_ASSERT(fm->ref_cnt); + if (--fm->ref_cnt) + return; + if (fm->mfts->meter_action) + mlx5_glue->destroy_flow_action(fm->mfts->meter_action); + fm->mfts->meter_action = NULL; + fm->ingress = 0; + fm->egress = 0; + fm->transfer = 0; +} + +/** + * Flush meter configuration. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] error + * Pointer to rte meter error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_meter_flush(struct rte_eth_dev *dev, struct rte_mtr_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_meters *fms = &priv->flow_meters; + struct mlx5_mtr_profiles *fmps = &priv->flow_meter_profiles; + struct mlx5_flow_meter_profile *fmp; + struct mlx5_flow_meter *fm; + const struct rte_flow_attr attr = { + .ingress = 1, + .egress = 1, + .transfer = priv->config.dv_esw_en ? 1 : 0, + }; + void *tmp; + uint32_t i; + + TAILQ_FOREACH_SAFE(fm, fms, next, tmp) { + /* Meter object must not have any owner. */ + MLX5_ASSERT(!fm->ref_cnt); + /* Get meter profile. */ + fmp = fm->profile; + if (fmp == NULL) + return -rte_mtr_error_set(error, EINVAL, + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + NULL, "MTR object meter profile invalid."); + /* Update dependencies. */ + fmp->ref_cnt--; + /* Remove from list. */ + TAILQ_REMOVE(fms, fm, next); + /* Free policer counters. */ + for (i = 0; i < RTE_DIM(fm->policer_stats.cnt); i++) + if (fm->policer_stats.cnt[i]) + mlx5_counter_free(dev, + fm->policer_stats.cnt[i]); + /* Free meter flow table. */ + mlx5_flow_destroy_policer_rules(dev, fm, &attr); + mlx5_flow_destroy_mtr_tbls(dev, fm->mfts); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MTR], fm->idx); + } + TAILQ_FOREACH_SAFE(fmp, fmps, next, tmp) { + /* Check unused. */ + MLX5_ASSERT(!fmp->ref_cnt); + /* Remove from list. */ + TAILQ_REMOVE(&priv->flow_meter_profiles, fmp, next); + rte_free(fmp); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c new file mode 100644 index 000000000..c266e5683 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c @@ -0,0 +1,1987 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include <mlx5_glue.h> +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_rxtx.h" + +#define VERBS_SPEC_INNER(item_flags) \ + (!!((item_flags) & MLX5_FLOW_LAYER_TUNNEL) ? IBV_FLOW_SPEC_INNER : 0) + +/** + * Get Verbs flow counter by index. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] idx + * mlx5 flow counter index in the container. + * @param[out] ppool + * mlx5 flow counter pool in the container, + * + * @return + * A pointer to the counter, NULL otherwise. + */ +static struct mlx5_flow_counter * +flow_verbs_counter_get_by_idx(struct rte_eth_dev *dev, + uint32_t idx, + struct mlx5_flow_counter_pool **ppool) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0); + struct mlx5_flow_counter_pool *pool; + + idx--; + pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL]; + MLX5_ASSERT(pool); + if (ppool) + *ppool = pool; + return MLX5_POOL_GET_CNT(pool, idx % MLX5_COUNTERS_PER_POOL); +} + +/** + * Create Verbs flow counter with Verbs library. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] counter + * mlx5 flow counter object, contains the counter id, + * handle of created Verbs flow counter is returned + * in cs field (if counters are supported). + * + * @return + * 0 On success else a negative errno value is returned + * and rte_errno is set. + */ +static int +flow_verbs_counter_create(struct rte_eth_dev *dev, + struct mlx5_flow_counter_ext *counter) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_counter_set_init_attr init = { + .counter_set_id = counter->id}; + + counter->cs = mlx5_glue->create_counter_set(ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + return 0; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_counters_init_attr init = {0}; + struct ibv_counter_attach_attr attach; + int ret; + + memset(&attach, 0, sizeof(attach)); + counter->cs = mlx5_glue->create_counters(ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + attach.counter_desc = IBV_COUNTER_PACKETS; + attach.index = 0; + ret = mlx5_glue->attach_counters(counter->cs, &attach, NULL); + if (!ret) { + attach.counter_desc = IBV_COUNTER_BYTES; + attach.index = 1; + ret = mlx5_glue->attach_counters + (counter->cs, &attach, NULL); + } + if (ret) { + claim_zero(mlx5_glue->destroy_counters(counter->cs)); + counter->cs = NULL; + rte_errno = ret; + return -ret; + } + return 0; +#else + (void)dev; + (void)counter; + rte_errno = ENOTSUP; + return -ENOTSUP; +#endif +} + +/** + * Get a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. + * + * @return + * Index to the counter, 0 otherwise and rte_errno is set. 
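+ *
+ * The returned index is 1-based: 0 is used as the failure/"no counter" value,
+ * and a valid value encodes the pool number plus the offset within that pool
+ * (see MLX5_MAKE_CNT_IDX() and flow_verbs_counter_get_by_idx() above, which
+ * decrements the index before splitting it).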
+ */ +static uint32_t +flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0); + struct mlx5_flow_counter_pool *pool = NULL; + struct mlx5_flow_counter_ext *cnt_ext = NULL; + struct mlx5_flow_counter *cnt = NULL; + uint32_t n_valid = rte_atomic16_read(&cont->n_valid); + uint32_t pool_idx; + uint32_t i; + int ret; + + if (shared) { + for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) { + pool = cont->pools[pool_idx]; + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i); + if (cnt_ext->shared && cnt_ext->id == id) { + cnt_ext->ref_cnt++; + return MLX5_MAKE_CNT_IDX(pool_idx, i); + } + } + } + } + for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) { + pool = cont->pools[pool_idx]; + if (!pool) + continue; + cnt = TAILQ_FIRST(&pool->counters); + if (cnt) + break; + } + if (!cnt) { + struct mlx5_flow_counter_pool **pools; + uint32_t size; + + if (n_valid == cont->n) { + /* Resize the container pool array. */ + size = sizeof(struct mlx5_flow_counter_pool *) * + (n_valid + MLX5_CNT_CONTAINER_RESIZE); + pools = rte_zmalloc(__func__, size, 0); + if (!pools) + return 0; + if (n_valid) { + memcpy(pools, cont->pools, + sizeof(struct mlx5_flow_counter_pool *) * + n_valid); + rte_free(cont->pools); + } + cont->pools = pools; + cont->n += MLX5_CNT_CONTAINER_RESIZE; + } + /* Allocate memory for new pool*/ + size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) * + MLX5_COUNTERS_PER_POOL; + pool = rte_calloc(__func__, 1, size, 0); + if (!pool) + return 0; + pool->type |= CNT_POOL_TYPE_EXT; + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = MLX5_POOL_GET_CNT(pool, i); + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } + cnt = MLX5_POOL_GET_CNT(pool, 0); + cont->pools[n_valid] = pool; + pool_idx = n_valid; + rte_atomic16_add(&cont->n_valid, 1); + TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + } + i = MLX5_CNT_ARRAY_IDX(pool, cnt); + cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i); + cnt_ext->id = id; + cnt_ext->shared = shared; + cnt_ext->ref_cnt = 1; + cnt->hits = 0; + cnt->bytes = 0; + /* Create counter with Verbs. */ + ret = flow_verbs_counter_create(dev, cnt_ext); + if (!ret) { + TAILQ_REMOVE(&pool->counters, cnt, next); + return MLX5_MAKE_CNT_IDX(pool_idx, i); + } + /* Some error occurred in Verbs library. */ + rte_errno = -ret; + return 0; +} + +/** + * Release a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] counter + * Index to the counter handler. + */ +static void +flow_verbs_counter_release(struct rte_eth_dev *dev, uint32_t counter) +{ + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt; + struct mlx5_flow_counter_ext *cnt_ext; + + cnt = flow_verbs_counter_get_by_idx(dev, counter, + &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + if (--cnt_ext->ref_cnt == 0) { +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs)); + cnt_ext->cs = NULL; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs)); + cnt_ext->cs = NULL; +#endif + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } +} + +/** + * Query a flow counter via Verbs library call. 
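+ *
+ * The Verbs counters are treated as cumulative by this driver; the values
+ * reported back in struct rte_flow_query_count are relative to the hits/bytes
+ * cached in the mlx5 counter at the last reset, and a query with the reset
+ * flag set refreshes that baseline.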
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_counter_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow, void *data, + struct rte_flow_error *error) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + if (flow->counter) { + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt = flow_verbs_counter_get_by_idx + (dev, flow->counter, &pool); + struct mlx5_flow_counter_ext *cnt_ext = MLX5_CNT_TO_CNT_EXT + (pool, cnt); + struct rte_flow_query_count *qc = data; + uint64_t counters[2] = {0, 0}; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_query_counter_set_attr query_cs_attr = { + .cs = cnt_ext->cs, + .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, + }; + struct ibv_counter_set_data query_out = { + .out = counters, + .outlen = 2 * sizeof(uint64_t), + }; + int err = mlx5_glue->query_counter_set(&query_cs_attr, + &query_out); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + int err = mlx5_glue->query_counters + (cnt_ext->cs, counters, + RTE_DIM(counters), + IBV_READ_COUNTERS_ATTR_PREFER_CACHED); +#endif + if (err) + return rte_flow_error_set + (error, err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot read counter"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = counters[0] - cnt->hits; + qc->bytes = counters[1] - cnt->bytes; + if (qc->reset) { + cnt->hits = counters[0]; + cnt->bytes = counters[1]; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "flow does not have counter"); +#else + (void)flow; + (void)data; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +#endif +} + +/** + * Add a verbs item specification into @p verbs. + * + * @param[out] verbs + * Pointer to verbs structure. + * @param[in] src + * Create specification. + * @param[in] size + * Size in bytes of the specification to copy. + */ +static void +flow_verbs_spec_add(struct mlx5_flow_verbs_workspace *verbs, + void *src, unsigned int size) +{ + void *dst; + + if (!verbs) + return; + MLX5_ASSERT(verbs->specs); + dst = (void *)(verbs->specs + verbs->size); + memcpy(dst, src, size); + ++verbs->attr.num_of_specs; + verbs->size += size; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_eth(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + const unsigned int size = sizeof(struct ibv_flow_spec_eth); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_eth_mask; + if (spec) { + unsigned int i; + + memcpy(ð.val.dst_mac, spec->dst.addr_bytes, + RTE_ETHER_ADDR_LEN); + memcpy(ð.val.src_mac, spec->src.addr_bytes, + RTE_ETHER_ADDR_LEN); + eth.val.ether_type = spec->type; + memcpy(ð.mask.dst_mac, mask->dst.addr_bytes, + RTE_ETHER_ADDR_LEN); + memcpy(ð.mask.src_mac, mask->src.addr_bytes, + RTE_ETHER_ADDR_LEN); + eth.mask.ether_type = mask->type; + /* Remove unwanted bits from values. 
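+		 * i.e. the spec values are ANDed with the mask so that bits
+		 * the application did not ask to match cannot leak into the
+		 * Verbs specification; the same normalization is done in the
+		 * other flow_verbs_translate_item_*() helpers.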
*/ + for (i = 0; i < RTE_ETHER_ADDR_LEN; ++i) { + eth.val.dst_mac[i] &= eth.mask.dst_mac[i]; + eth.val.src_mac[i] &= eth.mask.src_mac[i]; + } + eth.val.ether_type &= eth.mask.ether_type; + } + flow_verbs_spec_add(&dev_flow->verbs, ð, size); +} + +/** + * Update the VLAN tag in the Verbs Ethernet specification. + * This function assumes that the input is valid and there is space to add + * the requested item. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] eth + * Verbs structure containing the VLAN information to copy. + */ +static void +flow_verbs_item_vlan_update(struct ibv_flow_attr *attr, + struct ibv_flow_spec_eth *eth) +{ + unsigned int i; + const enum ibv_flow_spec_type search = eth->type; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + struct ibv_flow_spec_eth *e = + (struct ibv_flow_spec_eth *)hdr; + + e->val.vlan_tag = eth->val.vlan_tag; + e->mask.vlan_tag = eth->mask.vlan_tag; + e->val.ether_type = eth->val.ether_type; + e->mask.ether_type = eth->mask.ether_type; + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_vlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_eth); + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (!mask) + mask = &rte_flow_item_vlan_mask; + if (spec) { + eth.val.vlan_tag = spec->tci; + eth.mask.vlan_tag = mask->tci; + eth.val.vlan_tag &= eth.mask.vlan_tag; + eth.val.ether_type = spec->inner_type; + eth.mask.ether_type = mask->inner_type; + eth.val.ether_type &= eth.mask.ether_type; + } + if (!(item_flags & l2m)) + flow_verbs_spec_add(&dev_flow->verbs, ð, size); + else + flow_verbs_item_vlan_update(&dev_flow->verbs.attr, ð); + if (!tunnel) + dev_flow->handle->vf_vlan.tag = + rte_be_to_cpu_16(spec->tci) & 0x0fff; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. 
+ */ +static void +flow_verbs_translate_item_ipv4(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); + struct ibv_flow_spec_ipv4_ext ipv4 = { + .type = IBV_FLOW_SPEC_IPV4_EXT | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv4_mask; + if (spec) { + ipv4.val = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = spec->hdr.src_addr, + .dst_ip = spec->hdr.dst_addr, + .proto = spec->hdr.next_proto_id, + .tos = spec->hdr.type_of_service, + }; + ipv4.mask = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = mask->hdr.src_addr, + .dst_ip = mask->hdr.dst_addr, + .proto = mask->hdr.next_proto_id, + .tos = mask->hdr.type_of_service, + }; + /* Remove unwanted bits from values. */ + ipv4.val.src_ip &= ipv4.mask.src_ip; + ipv4.val.dst_ip &= ipv4.mask.dst_ip; + ipv4.val.proto &= ipv4.mask.proto; + ipv4.val.tos &= ipv4.mask.tos; + } + flow_verbs_spec_add(&dev_flow->verbs, &ipv4, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) +{ + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_ipv6); + struct ibv_flow_spec_ipv6 ipv6 = { + .type = IBV_FLOW_SPEC_IPV6 | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv6_mask; + if (spec) { + unsigned int i; + uint32_t vtc_flow_val; + uint32_t vtc_flow_mask; + + memcpy(&ipv6.val.src_ip, spec->hdr.src_addr, + RTE_DIM(ipv6.val.src_ip)); + memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr, + RTE_DIM(ipv6.val.dst_ip)); + memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr, + RTE_DIM(ipv6.mask.src_ip)); + memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr, + RTE_DIM(ipv6.mask.dst_ip)); + vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow); + vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow); + ipv6.val.flow_label = + rte_cpu_to_be_32((vtc_flow_val & RTE_IPV6_HDR_FL_MASK) >> + RTE_IPV6_HDR_FL_SHIFT); + ipv6.val.traffic_class = (vtc_flow_val & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.val.next_hdr = spec->hdr.proto; + ipv6.mask.flow_label = + rte_cpu_to_be_32((vtc_flow_mask & RTE_IPV6_HDR_FL_MASK) >> + RTE_IPV6_HDR_FL_SHIFT); + ipv6.mask.traffic_class = (vtc_flow_mask & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.mask.next_hdr = mask->hdr.proto; + /* Remove unwanted bits from values. */ + for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) { + ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i]; + ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i]; + } + ipv6.val.flow_label &= ipv6.mask.flow_label; + ipv6.val.traffic_class &= ipv6.mask.traffic_class; + ipv6.val.next_hdr &= ipv6.mask.next_hdr; + } + flow_verbs_spec_add(&dev_flow->verbs, &ipv6, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. 
+ * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_tcp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp tcp = { + .type = IBV_FLOW_SPEC_TCP | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_tcp_mask; + if (spec) { + tcp.val.dst_port = spec->hdr.dst_port; + tcp.val.src_port = spec->hdr.src_port; + tcp.mask.dst_port = mask->hdr.dst_port; + tcp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + tcp.val.src_port &= tcp.mask.src_port; + tcp.val.dst_port &= tcp.mask.dst_port; + } + flow_verbs_spec_add(&dev_flow->verbs, &tcp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_udp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp udp = { + .type = IBV_FLOW_SPEC_UDP | VERBS_SPEC_INNER(item_flags), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_udp_mask; + if (spec) { + udp.val.dst_port = spec->hdr.dst_port; + udp.val.src_port = spec->hdr.src_port; + udp.mask.dst_port = mask->hdr.dst_port; + udp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + udp.val.src_port &= udp.mask.src_port; + udp.val.dst_port &= udp.mask.dst_port; + } + item++; + while (item->type == RTE_FLOW_ITEM_TYPE_VOID) + item++; + if (!(udp.val.dst_port & udp.mask.dst_port)) { + switch ((item)->type) { + case RTE_FLOW_ITEM_TYPE_VXLAN: + udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN); + udp.mask.dst_port = 0xffff; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN_GPE); + udp.mask.dst_port = 0xffff; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + udp.val.dst_port = htons(MLX5_UDP_PORT_MPLS); + udp.mask.dst_port = 0xffff; + break; + default: + break; + } + } + + flow_verbs_spec_add(&dev_flow->verbs, &udp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. 
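+ *
+ * The 24-bit VNI is copied into bytes 1-3 of the 32-bit Verbs tunnel_id
+ * through the local vni/vlan_id union, for the value and the mask alike,
+ * and the value is then ANDed with the mask as for the other items.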
+ */ +static void +flow_verbs_translate_item_vxlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; + } + flow_verbs_spec_add(&dev_flow->verbs, &vxlan, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_vxlan_gpe(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) +{ + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan_gpe = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan_gpe.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan_gpe.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; + } + flow_verbs_spec_add(&dev_flow->verbs, &vxlan_gpe, size); +} + +/** + * Update the protocol in Verbs IPv4/IPv6 spec. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] search + * Specification type to search in order to update the IP protocol. + * @param[in] protocol + * Protocol value to set if none is present in the specification. + */ +static void +flow_verbs_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, + enum ibv_flow_spec_type search, + uint8_t protocol) +{ + unsigned int i; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + if (!attr) + return; + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + union { + struct ibv_flow_spec_ipv4_ext *ipv4; + struct ibv_flow_spec_ipv6 *ipv6; + } ip; + + switch (search) { + case IBV_FLOW_SPEC_IPV4_EXT: + ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr; + if (!ip.ipv4->val.proto) { + ip.ipv4->val.proto = protocol; + ip.ipv4->mask.proto = 0xff; + } + break; + case IBV_FLOW_SPEC_IPV6: + ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr; + if (!ip.ipv6->val.next_hdr) { + ip.ipv6->val.next_hdr = protocol; + ip.ipv6->mask.next_hdr = 0xff; + } + break; + default: + break; + } + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. 
This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_gre(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags) +{ + struct mlx5_flow_verbs_workspace *verbs = &dev_flow->verbs; +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel tunnel = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; +#else + const struct rte_flow_item_gre *spec = item->spec; + const struct rte_flow_item_gre *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_gre); + struct ibv_flow_spec_gre tunnel = { + .type = IBV_FLOW_SPEC_GRE, + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_gre_mask; + if (spec) { + tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver; + tunnel.val.protocol = spec->protocol; + tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver; + tunnel.mask.protocol = mask->protocol; + /* Remove unwanted bits from values. */ + tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver; + tunnel.val.protocol &= tunnel.mask.protocol; + tunnel.val.key &= tunnel.mask.key; + } +#endif + if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + flow_verbs_item_gre_ip_protocol_update(&verbs->attr, + IBV_FLOW_SPEC_IPV4_EXT, + IPPROTO_GRE); + else + flow_verbs_item_gre_ip_protocol_update(&verbs->attr, + IBV_FLOW_SPEC_IPV6, + IPPROTO_GRE); + flow_verbs_spec_add(verbs, &tunnel, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. + */ +static void +flow_verbs_translate_item_mpls(struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused) +{ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + const struct rte_flow_item_mpls *spec = item->spec; + const struct rte_flow_item_mpls *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_mpls); + struct ibv_flow_spec_mpls mpls = { + .type = IBV_FLOW_SPEC_MPLS, + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_mpls_mask; + if (spec) { + memcpy(&mpls.val.label, spec, sizeof(mpls.val.label)); + memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label)); + /* Remove unwanted bits from values. */ + mpls.val.label &= mpls.mask.label; + } + flow_verbs_spec_add(&dev_flow->verbs, &mpls, size); +#endif +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. 
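+ *
+ * Drop is a fate action: flow_verbs_validate() rejects combining it with
+ * any other action except the count action.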
+ */ +static void +flow_verbs_translate_action_drop + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_drop); + struct ibv_flow_spec_action_drop drop = { + .type = IBV_FLOW_SPEC_ACTION_DROP, + .size = size, + }; + + flow_verbs_spec_add(&dev_flow->verbs, &drop, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] rss_desc + * Pointer to mlx5_flow_rss_desc. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_queue(struct mlx5_flow_rss_desc *rss_desc, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_queue *queue = action->conf; + + rss_desc->queue[0] = queue->index; + rss_desc->queue_num = 1; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] rss_desc + * Pointer to mlx5_flow_rss_desc. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_rss(struct mlx5_flow_rss_desc *rss_desc, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_rss *rss = action->conf; + const uint8_t *rss_key; + + memcpy(rss_desc->queue, rss->queue, rss->queue_num * sizeof(uint16_t)); + rss_desc->queue_num = rss->queue_num; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(rss_desc->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* + * rss->level and rss.types should be set in advance when expanding + * items for RSS. + */ +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_flag + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), + }; + + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. + */ +static void +flow_verbs_translate_action_mark(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action) +{ + const struct rte_flow_action_mark *mark = action->conf; + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + .tag_id = mlx5_flow_mark_set(mark->id), + }; + + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] action + * Action configuration. 
+ * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 On success else a negative errno value is returned and rte_errno is set. + */ +static int +flow_verbs_translate_action_count(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action, + struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + const struct rte_flow_action_count *count = action->conf; + struct rte_flow *flow = dev_flow->flow; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct mlx5_flow_counter_pool *pool; + struct mlx5_flow_counter *cnt = NULL; + struct mlx5_flow_counter_ext *cnt_ext; + unsigned int size = sizeof(struct ibv_flow_spec_counter_action); + struct ibv_flow_spec_counter_action counter = { + .type = IBV_FLOW_SPEC_ACTION_COUNT, + .size = size, + }; +#endif + + if (!flow->counter) { + flow->counter = flow_verbs_counter_new(dev, count->shared, + count->id); + if (!flow->counter) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "cannot get counter" + " context."); + } +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + counter.counter_set_handle = cnt_ext->cs->handle; + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool); + cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt); + counter.counters = cnt_ext->cs; + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); +#endif + return 0; +} + +/** + * Internal validation function. For validating both actions and items. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. + * @param[in] hairpin + * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external __rte_unused, + int hairpin __rte_unused, + struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + uint64_t last_item = 0; + uint8_t next_protocol = 0xff; + uint16_t ether_type = 0; + + if (items == NULL) + return -1; + ret = mlx5_flow_validate_attributes(dev, attr, error); + if (ret < 0) + return ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int ret = 0; + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_eth *) + items->spec)->type; + ether_type &= + ((const struct rte_flow_item_eth *) + items->mask)->type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_validate_item_vlan(items, item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + if (items->mask != NULL && items->spec != NULL) { + ether_type = + ((const struct rte_flow_item_vlan *) + items->spec)->inner_type; + ether_type &= + ((const struct rte_flow_item_vlan *) + items->mask)->inner_type; + ether_type = rte_be_to_cpu_16(ether_type); + } else { + ether_type = 0; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + last_item, + ether_type, NULL, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) { + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + next_protocol &= + ((const struct rte_flow_item_ipv4 *) + (items->mask))->hdr.next_proto_id; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + last_item, + ether_type, NULL, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + next_protocol &= + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto; + } else { + /* Reset for inner layer. */ + next_protocol = 0xff; + } + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &rte_flow_item_tcp_mask, + error); + if (ret < 0) + return ret; + last_item = tunnel ? 
MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, + dev, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(dev, items, + item_flags, + last_item, error); + if (ret < 0) + return ret; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + item_flags |= last_item; + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = mlx5_flow_validate_action_flag(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_FLAG; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = mlx5_flow_validate_action_mark(actions, + action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_MARK; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, item_flags, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RSS; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_validate_action_count(dev, attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + /* + * Validate the drop action mutual exclusion with other actions. + * Drop action is mutually-exclusive with any other action, except for + * Count action. + */ + if ((action_flags & MLX5_FLOW_ACTION_DROP) && + (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "Drop action is mutually-exclusive " + "with any other action, except for " + "Count action"); + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "no fate action is found"); + return 0; +} + +/** + * Calculate the required bytes that are needed for the action part of the verbs + * flow. + * + * @param[in] actions + * Pointer to the list of actions. + * + * @return + * The size of the memory needed for all actions. 
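+ *
+ * For example, a (hypothetical) action list of MARK / QUEUE / COUNT / END
+ * accounts for one struct ibv_flow_spec_action_tag plus, when counter
+ * support is compiled in, one struct ibv_flow_spec_counter_action; QUEUE
+ * and RSS need no Verbs spec space because their fate is applied through
+ * the hash Rx queue instead.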
+ */ +static int +flow_verbs_get_actions_size(const struct rte_flow_action actions[]) +{ + int size = 0; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + size += sizeof(struct ibv_flow_spec_action_tag); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + size += sizeof(struct ibv_flow_spec_action_tag); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + size += sizeof(struct ibv_flow_spec_action_drop); + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + break; + case RTE_FLOW_ACTION_TYPE_RSS: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + size += sizeof(struct ibv_flow_spec_counter_action); +#endif + break; + default: + break; + } + } + return size; +} + +/** + * Calculate the required bytes that are needed for the item part of the verbs + * flow. + * + * @param[in] items + * Pointer to the list of items. + * + * @return + * The size of the memory needed for all items. + */ +static int +flow_verbs_get_items_size(const struct rte_flow_item items[]) +{ + int size = 0; + + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + size += sizeof(struct ibv_flow_spec_eth); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + size += sizeof(struct ibv_flow_spec_eth); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + size += sizeof(struct ibv_flow_spec_ipv4_ext); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + size += sizeof(struct ibv_flow_spec_ipv6); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + size += sizeof(struct ibv_flow_spec_tcp_udp); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + size += sizeof(struct ibv_flow_spec_tcp_udp); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + size += sizeof(struct ibv_flow_spec_tunnel); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + size += sizeof(struct ibv_flow_spec_tunnel); + break; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_gre); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + size += sizeof(struct ibv_flow_spec_mpls); + break; +#else + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_tunnel); + break; +#endif + default: + break; + } + } + return size; +} + +/** + * Internal preparation function. Allocate mlx5_flow with the required size. + * The required size is calculate based on the actions and items. This function + * also returns the detected actions and items for later use. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, otherwise NULL and rte_errno + * is set. 
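+ *
+ * The combined result of flow_verbs_get_actions_size() and
+ * flow_verbs_get_items_size() must fit in MLX5_VERBS_MAX_SPEC_ACT_SIZE,
+ * otherwise the function fails with E2BIG; the flow handle itself is
+ * taken from the MLX5_IPOOL_MLX5_FLOW indexed pool.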
+ */ +static struct mlx5_flow * +flow_verbs_prepare(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + size_t size = 0; + uint32_t handle_idx = 0; + struct mlx5_flow *dev_flow; + struct mlx5_flow_handle *dev_handle; + struct mlx5_priv *priv = dev->data->dev_private; + + size += flow_verbs_get_actions_size(actions); + size += flow_verbs_get_items_size(items); + if (size > MLX5_VERBS_MAX_SPEC_ACT_SIZE) { + rte_flow_error_set(error, E2BIG, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Verbs spec/action size too large"); + return NULL; + } + /* In case of corrupting the memory. */ + if (priv->flow_idx >= MLX5_NUM_MAX_DEV_FLOWS) { + rte_flow_error_set(error, ENOSPC, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not free temporary device flow"); + return NULL; + } + dev_handle = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + &handle_idx); + if (!dev_handle) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow handle"); + return NULL; + } + /* No multi-thread supporting. */ + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[priv->flow_idx++]; + dev_flow->handle = dev_handle; + dev_flow->handle_idx = handle_idx; + /* Memcpy is used, only size needs to be cleared to 0. */ + dev_flow->verbs.size = 0; + dev_flow->verbs.attr.num_of_specs = 0; + dev_flow->ingress = attr->ingress; + dev_flow->hash_fields = 0; + /* Need to set transfer attribute: not supported in Verbs mode. */ + return dev_flow; +} + +/** + * Fill the flow with verb spec. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, else a negative errno value otherwise and rte_errno is set. 
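+ *
+ * Translation runs in two passes: actions first, then items; the deepest
+ * protocol layer matched (L2/L3/L4) selects the subpriority that
+ * mlx5_flow_adjust_priority() combines with the attribute priority at the
+ * end.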
+ */ +static int +flow_verbs_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + uint64_t item_flags = 0; + uint64_t action_flags = 0; + uint64_t priority = attr->priority; + uint32_t subpriority = 0; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_rss_desc *rss_desc = &((struct mlx5_flow_rss_desc *) + priv->rss_desc) + [!!priv->flow_nested_idx]; + + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = priv->config.flow_prio - 1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + int ret; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + flow_verbs_translate_action_flag(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_FLAG; + dev_flow->handle->mark = 1; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + flow_verbs_translate_action_mark(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_MARK; + dev_flow->handle->mark = 1; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + flow_verbs_translate_action_drop(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_DROP; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + flow_verbs_translate_action_queue(rss_desc, actions); + action_flags |= MLX5_FLOW_ACTION_QUEUE; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + flow_verbs_translate_action_rss(rss_desc, actions); + action_flags |= MLX5_FLOW_ACTION_RSS; + dev_flow->handle->fate_action = MLX5_FLOW_FATE_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_translate_action_count(dev_flow, + actions, + dev, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + dev_flow->act_flags = action_flags; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + flow_verbs_translate_item_eth(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_verbs_translate_item_vlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + flow_verbs_translate_item_ipv4(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + flow_verbs_translate_item_ipv6(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + item_flags |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_verbs_translate_item_tcp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_verbs_translate_item_udp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->hash_fields |= + mlx5_flow_hashfields_adjust + (rss_desc, tunnel, ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_verbs_translate_item_vxlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_verbs_translate_item_vxlan_gpe(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_verbs_translate_item_gre(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_verbs_translate_item_mpls(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_MPLS; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "item not supported"); + } + } + dev_flow->handle->layers = item_flags; + /* Other members of attr will be ignored. */ + dev_flow->verbs.attr.priority = + mlx5_flow_adjust_priority(dev, priority, subpriority); + dev_flow->verbs.attr.port = (uint8_t)priv->ibv_port; + return 0; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +flow_verbs_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + uint32_t handle_idx; + + if (!flow) + return; + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + handle_idx, handle, next) { + if (handle->ib_flow) { + claim_zero(mlx5_glue->destroy_flow(handle->ib_flow)); + handle->ib_flow = NULL; + } + /* hrxq is union, don't touch it only the flag is set. */ + if (handle->rix_hrxq) { + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + handle->rix_hrxq = 0; + } else if (handle->fate_action == + MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, handle->rix_hrxq); + handle->rix_hrxq = 0; + } + } + if (handle->vf_vlan.tag && handle->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &handle->vf_vlan); + } +} + +/** + * Remove the flow from the NIC and the memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. 
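+ *
+ * Unlike flow_verbs_remove(), which only detaches the rule from the NIC
+ * and releases the hash Rx queues, this also returns every device flow
+ * handle to its indexed pool and releases the flow counter, if any.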
+ */ +static void +flow_verbs_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + + if (!flow) + return; + flow_verbs_remove(dev, flow); + while (flow->dev_handles) { + uint32_t tmp_idx = flow->dev_handles; + + handle = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + if (!handle) + return; + flow->dev_handles = handle->next.next; + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], + tmp_idx); + } + if (flow->counter) { + flow_verbs_counter_release(dev, flow->counter); + flow->counter = 0; + } +} + +/** + * Apply the flow to the NIC. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_handle *handle; + struct mlx5_flow *dev_flow; + struct mlx5_hrxq *hrxq; + uint32_t dev_handles; + int err; + int idx; + + for (idx = priv->flow_idx - 1; idx >= priv->flow_nested_idx; idx--) { + dev_flow = &((struct mlx5_flow *)priv->inter_flows)[idx]; + handle = dev_flow->handle; + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + hrxq = mlx5_hrxq_drop_new(dev); + if (!hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get drop hash queue"); + goto error; + } + } else { + uint32_t hrxq_idx; + struct mlx5_flow_rss_desc *rss_desc = + &((struct mlx5_flow_rss_desc *)priv->rss_desc) + [!!priv->flow_nested_idx]; + + MLX5_ASSERT(rss_desc->queue_num); + hrxq_idx = mlx5_hrxq_get(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num); + if (!hrxq_idx) + hrxq_idx = mlx5_hrxq_new(dev, rss_desc->key, + MLX5_RSS_HASH_KEY_LEN, + dev_flow->hash_fields, + rss_desc->queue, + rss_desc->queue_num, + !!(handle->layers & + MLX5_FLOW_LAYER_TUNNEL)); + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], + hrxq_idx); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + handle->rix_hrxq = hrxq_idx; + } + MLX5_ASSERT(hrxq); + handle->ib_flow = mlx5_glue->create_flow(hrxq->qp, + &dev_flow->verbs.attr); + if (!handle->ib_flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + if (priv->vmwa_context && + handle->vf_vlan.tag && !handle->vf_vlan.created) { + /* + * The rule contains the VLAN pattern. + * For VF we are going to create VLAN + * interface to make hypervisor set correct + * e-Switch vport context. + */ + mlx5_vlan_vmwa_acquire(dev, &handle->vf_vlan); + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles, + dev_handles, handle, next) { + /* hrxq is union, don't touch it only the flag is set. 
*/ + if (handle->rix_hrxq) { + if (handle->fate_action == MLX5_FLOW_FATE_DROP) { + mlx5_hrxq_drop_release(dev); + handle->rix_hrxq = 0; + } else if (handle->fate_action == + MLX5_FLOW_FATE_QUEUE) { + mlx5_hrxq_release(dev, handle->rix_hrxq); + handle->rix_hrxq = 0; + } + } + if (handle->vf_vlan.tag && handle->vf_vlan.created) + mlx5_vlan_vmwa_release(dev, &handle->vf_vlan); + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_counter_query(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops = { + .validate = flow_verbs_validate, + .prepare = flow_verbs_prepare, + .translate = flow_verbs_translate, + .apply = flow_verbs_apply, + .remove = flow_verbs_remove, + .destroy = flow_verbs_destroy, + .query = flow_verbs_query, +}; diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c new file mode 100644 index 000000000..291f7724c --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mac.c @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <inttypes.h> +#include <errno.h> +#include <netinet/in.h> +#include <sys/ioctl.h> +#include <arpa/inet.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" + +/** + * Get MAC address by querying netdevice. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] mac + * MAC address output buffer. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN]) +{ + struct ifreq request; + int ret; + + ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request); + if (ret) + return ret; + memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN); + return 0; +} + +/** + * Remove a MAC address from the internal array. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. 
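+ *
+ * On a VF the address is also removed from the kernel netdevice through
+ * the Netlink route socket (mlx5_nl_mac_addr_remove()); in any case the
+ * entry in dev->data->mac_addrs is cleared.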
+ */ +static void +mlx5_internal_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + + MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); + if (rte_is_zero_ether_addr(&dev->data->mac_addrs[index])) + return; + if (vf) + mlx5_nl_mac_addr_remove(priv->nl_socket_route, + mlx5_ifindex(dev), priv->mac_own, + &dev->data->mac_addrs[index], index); + memset(&dev->data->mac_addrs[index], 0, sizeof(struct rte_ether_addr)); +} + +/** + * Adds a MAC address to the internal array. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_internal_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index) +{ + struct mlx5_priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + unsigned int i; + + MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES); + if (rte_is_zero_ether_addr(mac)) { + rte_errno = EINVAL; + return -rte_errno; + } + /* First, make sure this address isn't already configured. */ + for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) { + /* Skip this index, it's going to be reconfigured. */ + if (i == index) + continue; + if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac))) + continue; + /* Address already configured elsewhere, return with error. */ + rte_errno = EADDRINUSE; + return -rte_errno; + } + if (vf) { + int ret = mlx5_nl_mac_addr_add(priv->nl_socket_route, + mlx5_ifindex(dev), priv->mac_own, + mac, index); + + if (ret) + return ret; + } + dev->data->mac_addrs[index] = *mac; + return 0; +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) + return; + mlx5_internal_mac_addr_remove(dev, index); + if (!dev->data->promiscuous) { + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot restart traffic: %s", + dev->data->port_id, strerror(rte_errno)); + } +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac, + uint32_t index, uint32_t vmdq __rte_unused) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_internal_mac_addr_add(dev, mac, index); + if (ret < 0) + return ret; + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; +} + +/** + * DPDK callback to set primary MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr) +{ + uint16_t port_id; + struct mlx5_priv *priv = dev->data->dev_private; + + /* Configuring the VF instead of its representor. 
*/ + if (priv->representor) { + DRV_LOG(DEBUG, "VF represented by port %u setting primary MAC address", + dev->data->port_id); + RTE_ETH_FOREACH_DEV_SIBLING(port_id, dev->data->port_id) { + priv = rte_eth_devices[port_id].data->dev_private; + if (priv->master == 1) { + priv = dev->data->dev_private; + return mlx5_nl_vf_mac_addr_modify + (priv->nl_socket_route, + mlx5_ifindex(&rte_eth_devices[port_id]), + mac_addr, priv->representor_id); + } + } + rte_errno = -ENOTSUP; + return rte_errno; + } + + DRV_LOG(DEBUG, "port %u setting primary MAC address", + dev->data->port_id); + return mlx5_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to set multicast addresses list. + * + * @see rte_eth_dev_set_mc_addr_list() + */ +int +mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct rte_ether_addr *mc_addr_set, uint32_t nb_mc_addr) +{ + uint32_t i; + int ret; + + if (nb_mc_addr >= MLX5_MAX_MC_MAC_ADDRESSES) { + rte_errno = ENOSPC; + return -rte_errno; + } + for (i = MLX5_MAX_UC_MAC_ADDRESSES; i != MLX5_MAX_MAC_ADDRESSES; ++i) + mlx5_internal_mac_addr_remove(dev, i); + i = MLX5_MAX_UC_MAC_ADDRESSES; + while (nb_mc_addr--) { + ret = mlx5_internal_mac_addr_add(dev, mc_addr_set++, i++); + if (ret) + return ret; + } + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c new file mode 100644 index 000000000..7ad322d47 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mp.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 6WIND S.A. + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#include <stdio.h> +#include <time.h> + +#include <rte_eal.h> +#include <rte_ethdev_driver.h> +#include <rte_string_fns.h> + +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +int +mlx5_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param; + const struct mlx5_mp_param *param = + (const struct mlx5_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + struct mr_cache_entry entry; + uint32_t lkey; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + DRV_LOG(ERR, "port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + switch (param->type) { + case MLX5_MP_REQ_CREATE_MR: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + lkey = mlx5_mr_create_primary(priv->sh->pd, + &priv->sh->share_cache, + &entry, param->args.addr, + priv->config.mr_ext_memseg_en); + if (lkey == UINT32_MAX) + res->result = -rte_errno; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_VERBS_CMD_FD: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + mp_res.num_fds = 1; + mp_res.fds[0] = priv->sh->ctx->cmd_fd; + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_QUEUE_STATE_MODIFY: + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = mlx5_queue_state_modify_primary + (dev, ¶m->args.state_modify); + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u invalid mp request type", + dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * IPC message handler of a secondary 
process. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] peer + * Pointer to the peer socket path. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_res; + struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param; + const struct mlx5_mp_param *param = + (const struct mlx5_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY); + if (!rte_eth_dev_is_valid_port(param->port_id)) { + rte_errno = ENODEV; + DRV_LOG(ERR, "port %u invalid port ID", param->port_id); + return -rte_errno; + } + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + switch (param->type) { + case MLX5_MP_REQ_START_RXTX: + DRV_LOG(INFO, "port %u starting datapath", dev->data->port_id); + rte_mb(); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + case MLX5_MP_REQ_STOP_RXTX: + DRV_LOG(INFO, "port %u stopping datapath", dev->data->port_id); + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_mb(); + mp_init_msg(&priv->mp_id, &mp_res, param->type); + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + default: + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u invalid mp request type", + dev->data->port_id); + return -rte_errno; + } + return ret; +} + +/** + * Broadcast request of stopping/starting data-path to secondary processes. + * + * @param[in] dev + * Pointer to Ethernet structure. + * @param[in] type + * Request type. + */ +static void +mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx5_mp_req_type type) +{ + struct rte_mp_msg mp_req; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mlx5_mp_param *res; + struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + int i; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (!mlx5_shared_data->secondary_cnt) + return; + if (type != MLX5_MP_REQ_START_RXTX && type != MLX5_MP_REQ_STOP_RXTX) { + DRV_LOG(ERR, "port %u unknown request (req_type %d)", + dev->data->port_id, type); + return; + } + mp_init_msg(&priv->mp_id, &mp_req, type); + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + if (rte_errno != ENOTSUP) + DRV_LOG(ERR, "port %u failed to request stop/start Rx/Tx (%d)", + dev->data->port_id, type); + goto exit; + } + if (mp_rep.nb_sent != mp_rep.nb_received) { + DRV_LOG(ERR, + "port %u not all secondaries responded (req_type %d)", + dev->data->port_id, type); + goto exit; + } + for (i = 0; i < mp_rep.nb_received; i++) { + mp_res = &mp_rep.msgs[i]; + res = (struct mlx5_mp_param *)mp_res->param; + if (res->result) { + DRV_LOG(ERR, "port %u request failed on secondary #%d", + dev->data->port_id, i); + goto exit; + } + } +exit: + free(mp_rep.msgs); +} + +/** + * Broadcast request of starting data-path to secondary processes. The request + * is synchronous. + * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX5_MP_REQ_START_RXTX); +} + +/** + * Broadcast request of stopping data-path to secondary processes. The request + * is synchronous. 
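+ * A caller reconfiguring the datapath would typically bracket the change
+ * with both requests, e.g.:
+ *
+ *   mlx5_mp_req_stop_rxtx(dev);
+ *   ... reconfigure Rx/Tx queues ...
+ *   mlx5_mp_req_start_rxtx(dev);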
+ * + * @param[in] dev + * Pointer to Ethernet structure. + */ +void +mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev) +{ + mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c new file mode 100644 index 000000000..2b4b3e289 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.c @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_eal_memconfig.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_rwlock.h> +#include <rte_bus_pci.h> + +#include <mlx5_glue.h> +#include <mlx5_common_mp.h> +#include <mlx5_common_mr.h> + +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_rxtx.h" + +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx5_mr_ctrl *mr_ctrl; + int ret; +}; + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx5_mr_garbage_collect(). Even if this callback is called from a + * secondary process, the garbage collector will be called in primary process + * as the secondary process can't call mlx5_mr_create(). + * + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. + * + * @param sh + * Pointer to the Ethernet device shared context. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx5_mr_mem_event_free_cb(struct mlx5_ibv_shared *sh, + const void *addr, size_t len) +{ + const struct rte_memseg_list *msl; + struct mlx5_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DEBUG("device %s free callback: addr=%p, len=%zu", + sh->ibdev_name, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + MLX5_ASSERT((uintptr_t)addr == + RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&sh->share_cache.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mr_cache_entry entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. */ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, start); + if (mr == NULL) + continue; + MLX5_ASSERT(mr->msl); /* Can't be external memory. 
*/ + ms = rte_mem_virt2memseg((void *)start, msl); + MLX5_ASSERT(ms != NULL); + MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); + MLX5_ASSERT(pos < mr->ms_bmp_n); + DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p", + sh->ibdev_name, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr); + DEBUG("device %s remove MR(%p) from list", + sh->ibdev_name, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; + } + if (rebuild) { + mlx5_mr_rebuild_cache(&sh->share_cache); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++sh->share_cache.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + sh->share_cache.dev_gen); + rte_smp_wmb(); + } + rte_rwlock_write_unlock(&sh->share_cache.rwlock); +} + +/** + * Callback for memory event. This can be called from both primary and secondary + * process. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. + */ +void +mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) +{ + struct mlx5_ibv_shared *sh; + struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list; + + /* Must be called from the primary process. */ + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + /* Iterate all the existing mlx5 devices. */ + LIST_FOREACH(sh, dev_list, mem_event_cb) + mlx5_mr_mem_event_free_cb(sh, addr, len); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; + } +} + +/** + * Bottom-half of LKey search on Rx. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct mlx5_priv *priv = rxq_ctrl->priv; + + return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, mr_ctrl, addr, + priv->config.mr_ext_memseg_en); +} + +/** + * Bottom-half of LKey search on Tx. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. 
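+ *
+ * This is the slow path taken when the per-queue cache in txq->mr_ctrl
+ * misses; the address is resolved against the shared device cache through
+ * mlx5_mr_addr2mr_bh(), which may register new memory for it.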
+ */ +static uint32_t +mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx5_priv *priv = txq_ctrl->priv; + + return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, mr_ctrl, addr, + priv->config.mr_ext_memseg_en); +} + +/** + * Bottom-half of LKey search on Tx. If it can't be searched in the memseg + * list, register the mempool of the mbuf as externally allocated memory. + * + * @param txq + * Pointer to Tx queue structure. + * @param mb + * Pointer to mbuf. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +{ + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + lkey = mlx5_tx_addr2mr_bh(txq, addr); + if (lkey == UINT32_MAX && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. */ + return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb)); + } + return lkey; +} + +/** + * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp(). + * + * Externally allocated chunk is registered and a MR is created for the chunk. + * The MR object is added to the global list. If memseg list of a MR object + * (mr->msl) is null, the MR object can be regarded as externally allocated + * memory. + * + * Once external memory is registered, it should be static. If the memory is + * freed and the virtual address range has different physical memory mapped + * again, it may cause crash on device due to the wrong translation entry. PMD + * can't track the free event of the external memory for now. + */ +static void +mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl; + struct mlx5_mr *mr = NULL; + uintptr_t addr = (uintptr_t)memhdr->addr; + size_t len = memhdr->len; + struct mr_cache_entry entry; + uint32_t lkey; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + /* If already registered, it should return. */ + rte_rwlock_read_lock(&sh->share_cache.rwlock); + lkey = mlx5_mr_lookup_cache(&sh->share_cache, &entry, addr); + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + if (lkey != UINT32_MAX) + return; + DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)", + dev->data->port_id, mem_idx, mp->name); + mr = mlx5_create_mr_ext(sh->pd, addr, len, mp->socket_id); + if (!mr) { + DRV_LOG(WARNING, + "port %u unable to allocate a new MR of" + " mempool (%s).", + dev->data->port_id, mp->name); + data->ret = -1; + return; + } + rte_rwlock_write_lock(&sh->share_cache.rwlock); + LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr); + /* Insert to the global cache table. */ + mlx5_mr_insert_cache(&sh->share_cache, mr); + rte_rwlock_write_unlock(&sh->share_cache.rwlock); + /* Insert to the local cache table */ + mlx5_mr_addr2mr_bh(sh->pd, &priv->mp_id, &sh->share_cache, + mr_ctrl, addr, priv->config.mr_ext_memseg_en); +} + +/** + * Finds the first ethdev that match the pci device. + * The existence of multiple ethdev per pci device is only with representors. + * On such case, it is enough to get only one of the ports as they all share + * the same ibv context. 
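+ *
+ * For reference, the DMA map/unmap callbacks below are reached through the
+ * generic EAL/ethdev API rather than called directly; an illustrative,
+ * application-side sketch (rte_extmem_register() and rte_dev_dma_map() are
+ * generic DPDK calls, IOVA handling and error checking omitted):
+ *
+ *	rte_extmem_register(addr, len, NULL, 0, pg_sz);
+ *	rte_dev_dma_map(dev->device, addr, iova, len);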
+ * + * @param pdev + * Pointer to the PCI device. + * + * @return + * Pointer to the ethdev if found, NULL otherwise. + */ +static struct rte_eth_dev * +pci_dev_to_eth_dev(struct rte_pci_device *pdev) +{ + uint16_t port_id; + + RTE_ETH_FOREACH_DEV_OF(port_id, &pdev->device) + return &rte_eth_devices[port_id]; + return NULL; +} + +/** + * DPDK callback to DMA map external memory to a PCI device. + * + * @param pdev + * Pointer to the PCI device. + * @param addr + * Starting virtual address of memory to be mapped. + * @param iova + * Starting IOVA address of memory to be mapped. + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 on success, negative value on error. + */ +int +mlx5_dma_map(struct rte_pci_device *pdev, void *addr, + uint64_t iova __rte_unused, size_t len) +{ + struct rte_eth_dev *dev; + struct mlx5_mr *mr; + struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; + + dev = pci_dev_to_eth_dev(pdev); + if (!dev) { + DRV_LOG(WARNING, "unable to find matching ethdev " + "to PCI device %p", (void *)pdev); + rte_errno = ENODEV; + return -1; + } + priv = dev->data->dev_private; + sh = priv->sh; + mr = mlx5_create_mr_ext(sh->pd, (uintptr_t)addr, len, SOCKET_ID_ANY); + if (!mr) { + DRV_LOG(WARNING, + "port %u unable to dma map", dev->data->port_id); + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&sh->share_cache.rwlock); + LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr); + /* Insert to the global cache table. */ + mlx5_mr_insert_cache(&sh->share_cache, mr); + rte_rwlock_write_unlock(&sh->share_cache.rwlock); + return 0; +} + +/** + * DPDK callback to DMA unmap external memory to a PCI device. + * + * @param pdev + * Pointer to the PCI device. + * @param addr + * Starting virtual address of memory to be unmapped. + * @param iova + * Starting IOVA address of memory to be unmapped. + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 on success, negative value on error. + */ +int +mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, + uint64_t iova __rte_unused, size_t len __rte_unused) +{ + struct rte_eth_dev *dev; + struct mlx5_priv *priv; + struct mlx5_ibv_shared *sh; + struct mlx5_mr *mr; + struct mr_cache_entry entry; + + dev = pci_dev_to_eth_dev(pdev); + if (!dev) { + DRV_LOG(WARNING, "unable to find matching ethdev " + "to PCI device %p", (void *)pdev); + rte_errno = ENODEV; + return -1; + } + priv = dev->data->dev_private; + sh = priv->sh; + rte_rwlock_read_lock(&sh->share_cache.rwlock); + mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, (uintptr_t)addr); + if (!mr) { + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered " + "to PCI device %p", (uintptr_t)addr, + (void *)pdev); + rte_errno = EINVAL; + return -1; + } + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr); + DEBUG("port %u remove MR(%p) from list", dev->data->port_id, + (void *)mr); + mlx5_mr_rebuild_cache(&sh->share_cache); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. 
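+ *
+ * In other words, dev_gen acts as a generation counter for the shared MR
+ * cache: the control path increments it after rebuilding the global cache,
+ * and each datapath queue drops its local cache when it notices the change.
+ * A rough reader-side sketch (the actual check lives in the common MR code,
+ * not in this file):
+ *
+ *	if (mr_ctrl->cur_gen != *mr_ctrl->dev_gen_ptr) {
+ *		mlx5_mr_flush_local_cache(mr_ctrl);
+ *		mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
+ *	}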
+ */ + ++sh->share_cache.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + sh->share_cache.dev_gen); + rte_smp_wmb(); + rte_rwlock_read_unlock(&sh->share_cache.rwlock); + return 0; +} + +/** + * Register MR for entire memory chunks in a Mempool having externally allocated + * memory and fill in local cache. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +static uint32_t +mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data); + return data.ret; +} + +/** + * Register MR entire memory chunks in a Mempool having externally allocated + * memory and search LKey of the address to return. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Search key. + * @param mp + * Pointer to registering Mempool where addr belongs. + * + * @return + * LKey for address on success, UINT32_MAX on failure. + */ +uint32_t +mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct mlx5_priv *priv = txq_ctrl->priv; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + DRV_LOG(WARNING, + "port %u using address (%p) from unregistered mempool" + " having externally allocated memory" + " in secondary process, please create mempool" + " prior to rte_eth_dev_start()", + PORT_ID(priv), (void *)addr); + return UINT32_MAX; + } + mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp); + return mlx5_tx_addr2mr_bh(txq, addr); +} + +/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */ +static void +mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct mlx5_priv *priv = dev->data->dev_private; + + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id, + &priv->sh->share_cache, data->mr_ctrl, + (uintptr_t)memhdr->addr, + priv->config.mr_ext_memseg_en); + if (lkey == UINT32_MAX) + data->ret = -1; +} + +/** + * Register entire memory chunks in a Mempool. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +int +mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data); + if (data.ret < 0 && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. 
*/ + return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp); + } + return data.ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h new file mode 100644 index 000000000..0c5877b3d --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_mr.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_MR_H_ +#define RTE_PMD_MLX5_MR_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev.h> +#include <rte_rwlock.h> +#include <rte_bitmap.h> +#include <rte_memory.h> + +#include <mlx5_common_mr.h> + +/* First entry must be NULL for comparison. */ +#define mlx5_mr_btree_len(bt) ((bt)->len - 1) + +void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); + +#endif /* RTE_PMD_MLX5_MR_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c new file mode 100644 index 000000000..653b06914 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rss.c @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <stdint.h> +#include <errno.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" + +/** + * DPDK callback to update the RSS hash configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] rss_conf + * RSS configuration data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + unsigned int idx; + + if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) { + rte_errno = EINVAL; + return -rte_errno; + } + if (rss_conf->rss_key && rss_conf->rss_key_len) { + if (rss_conf->rss_key_len != MLX5_RSS_HASH_KEY_LEN) { + DRV_LOG(ERR, + "port %u RSS key len must be %s Bytes long", + dev->data->port_id, + RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; + } + priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, + rss_conf->rss_key_len, 0); + if (!priv->rss_conf.rss_key) { + rte_errno = ENOMEM; + return -rte_errno; + } + memcpy(priv->rss_conf.rss_key, rss_conf->rss_key, + rss_conf->rss_key_len); + priv->rss_conf.rss_key_len = rss_conf->rss_key_len; + } + priv->rss_conf.rss_hf = rss_conf->rss_hf; + /* Enable the RSS hash in all Rx queues. 
*/ + for (i = 0, idx = 0; idx != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + (*priv->rxqs)[i]->rss_hash = !!rss_conf->rss_hf && + !!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS); + ++idx; + } + return 0; +} + +/** + * DPDK callback to get the RSS hash configuration. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in, out] rss_conf + * RSS configuration data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!rss_conf) { + rte_errno = EINVAL; + return -rte_errno; + } + if (rss_conf->rss_key && + (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) { + memcpy(rss_conf->rss_key, priv->rss_conf.rss_key, + priv->rss_conf.rss_key_len); + } + rss_conf->rss_key_len = priv->rss_conf.rss_key_len; + rss_conf->rss_hf = priv->rss_conf.rss_hf; + return 0; +} + +/** + * Allocate/reallocate RETA index table. + * + * @param dev + * Pointer to Ethernet device. + * @praram reta_size + * The size of the array to allocate. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + void *mem; + unsigned int old_size = priv->reta_idx_n; + + if (priv->reta_idx_n == reta_size) + return 0; + + mem = rte_realloc(priv->reta_idx, + reta_size * sizeof((*priv->reta_idx)[0]), 0); + if (!mem) { + rte_errno = ENOMEM; + return -rte_errno; + } + priv->reta_idx = mem; + priv->reta_idx_n = reta_size; + if (old_size < reta_size) + memset(&(*priv->reta_idx)[old_size], 0, + (reta_size - old_size) * + sizeof((*priv->reta_idx)[0])); + return 0; +} + +/** + * DPDK callback to get the RETA indirection table. + * + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. + * @param reta_size + * Size of the RETA table. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int idx; + unsigned int i; + + if (!reta_size || reta_size > priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } + /* Fill each entry of the table even if its bit is not set. */ + for (idx = 0, i = 0; (i != reta_size); ++i) { + idx = i / RTE_RETA_GROUP_SIZE; + reta_conf[idx].reta[i % RTE_RETA_GROUP_SIZE] = + (*priv->reta_idx)[i]; + } + return 0; +} + +/** + * DPDK callback to update the RETA indirection table. + * + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. + * @param reta_size + * Size of the RETA table. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
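+ *
+ * Note the new queue indices take effect immediately; if the device is
+ * already started the port is restarted with the updated table. An
+ * illustrative application-side call (generic ethdev API) steering a
+ * 64-entry table to queues 0 and 1 alternately:
+ *
+ *	struct rte_eth_rss_reta_entry64 reta_conf[1];
+ *	int i;
+ *
+ *	memset(reta_conf, 0, sizeof(reta_conf));
+ *	reta_conf[0].mask = UINT64_MAX;
+ *	for (i = 0; i < 64; i++)
+ *		reta_conf[0].reta[i] = i % 2;
+ *	rte_eth_dev_rss_reta_update(port_id, reta_conf, 64);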
+ */ +int +mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + int ret; + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int idx; + unsigned int i; + unsigned int pos; + + if (!reta_size) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_rss_reta_index_resize(dev, reta_size); + if (ret) + return ret; + for (idx = 0, i = 0; (i != reta_size); ++i) { + idx = i / RTE_RETA_GROUP_SIZE; + pos = i % RTE_RETA_GROUP_SIZE; + if (((reta_conf[idx].mask >> i) & 0x1) == 0) + continue; + MLX5_ASSERT(reta_conf[idx].reta[pos] < priv->rxqs_n); + (*priv->reta_idx)[i] = reta_conf[idx].reta[pos]; + } + if (dev->data->dev_started) { + mlx5_dev_stop(dev); + priv->skip_default_rss_reta = 1; + return mlx5_dev_start(dev); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c new file mode 100644 index 000000000..84c8b0526 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxmode.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> + +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/** + * DPDK callback to enable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_promiscuous_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->promiscuous = 1; + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable promiscuous mode" + " in flow isolation mode", + dev->data->port_id); + return 0; + } + if (priv->config.vf) { + ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev), + 1); + if (ret) + return ret; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); + + /* + * rte_eth_dev_promiscuous_enable() rollback + * dev->data->promiscuous in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to disable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_promiscuous_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->promiscuous = 0; + if (priv->config.vf) { + ret = mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev), + 0); + if (ret) + return ret; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); + + /* + * rte_eth_dev_promiscuous_disable() rollback + * dev->data->promiscuous in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to enable allmulti mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
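+ *
+ * Application-side sketch (generic ethdev API); the return value should be
+ * checked since the Netlink request issued for VF devices below can fail:
+ *
+ *	if (rte_eth_allmulticast_enable(port_id) != 0) {
+ *		// allmulticast mode could not be enabled on this port
+ *	}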
+ */ +int +mlx5_allmulticast_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->all_multicast = 1; + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable allmulticast mode" + " in flow isolation mode", + dev->data->port_id); + return 0; + } + if (priv->config.vf) { + ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev), + 1); + if (ret) + goto error; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); +error: + /* + * rte_eth_allmulticast_enable() rollback + * dev->data->all_multicast in the case of failure. + */ + return ret; +} + +/** + * DPDK callback to disable allmulti mode. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_allmulticast_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret; + + dev->data->all_multicast = 0; + if (priv->config.vf) { + ret = mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev), + 0); + if (ret) + goto error; + } + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); +error: + /* + * rte_eth_allmulticast_disable() rollback + * dev->data->all_multicast in the case of failure. + */ + return ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c new file mode 100644 index 000000000..7a50ec6f1 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxq.c @@ -0,0 +1,2976 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <fcntl.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_debug.h> +#include <rte_io.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" +#include "mlx5_autoconf.h" +#include "mlx5_flow.h" + + +/* Default RSS hash key also used for ConnectX-3. */ +uint8_t rss_hash_default_key[] = { + 0x2c, 0xc6, 0x81, 0xd1, + 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, + 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, + 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, + 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, + 0xfc, 0x1f, 0xdc, 0x2a, +}; + +/* Length of the default RSS hash key. */ +static_assert(MLX5_RSS_HASH_KEY_LEN == + (unsigned int)sizeof(rss_hash_default_key), + "wrong RSS default key size."); + +/** + * Check whether Multi-Packet RQ can be enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. 
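+ *
+ * MPRQ is an all-or-nothing property of the port: it is reported as
+ * supported only when it was requested through the device parameters
+ * (config.mprq.enabled) and the port is configured with at least
+ * config.mprq.min_rxqs_num Rx queues; otherwise every queue falls back to
+ * the regular Single-Packet RQ. A typical guard (illustrative):
+ *
+ *	if (mlx5_check_mprq_support(dev) < 0) {
+ *		// too few Rx queues or MPRQ not requested, use SPRQ
+ *	}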
+ */ +inline int +mlx5_check_mprq_support(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->config.mprq.enabled && + priv->rxqs_n >= priv->config.mprq.min_rxqs_num) + return 1; + return -ENOTSUP; +} + +/** + * Check whether Multi-Packet RQ is enabled for the Rx queue. + * + * @param rxq + * Pointer to receive queue structure. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq) +{ + return rxq->strd_num_n > 0; +} + +/** + * Check whether Multi-Packet RQ is enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_mprq_enabled(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t i; + uint16_t n = 0; + uint16_t n_ibv = 0; + + if (mlx5_check_mprq_support(dev) < 0) + return 0; + /* All the configured queues should be enabled. */ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + n_ibv++; + if (mlx5_rxq_mprq_enabled(rxq)) + ++n; + } + /* Multi-Packet RQ can't be partially configured. */ + MLX5_ASSERT(n == 0 || n == n_ibv); + return n == n_ibv; +} + +/** + * Allocate RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; i <= wqe_n; ++i) { + struct mlx5_mprq_buf *buf; + + if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) { + DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id); + rte_errno = ENOMEM; + goto error; + } + if (i < wqe_n) + (*rxq->mprq_bufs)[i] = buf; + else + rxq->mprq_repl = buf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments", + rxq->port_id, rxq->idx, wqe_n); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + wqe_n = i; + for (i = 0; (i != wqe_n); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + rte_mempool_put(rxq->mprq_mp, + (*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + rxq->port_id, rxq->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +static int +rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; + unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; (i != elts_n); ++i) { + struct rte_mbuf *buf; + + buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); + if (buf == NULL) { + DRV_LOG(ERR, "port %u empty mbuf pool", + PORT_ID(rxq_ctrl->priv)); + rte_errno = ENOMEM; + goto error; + } + /* Headroom is reserved by rte_pktmbuf_alloc(). */ + MLX5_ASSERT(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); + /* Buffer is supposed to be empty. 
*/ + MLX5_ASSERT(rte_pktmbuf_data_len(buf) == 0); + MLX5_ASSERT(rte_pktmbuf_pkt_len(buf) == 0); + MLX5_ASSERT(!buf->next); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + SET_DATA_OFF(buf, 0); + PORT(buf) = rxq_ctrl->rxq.port_id; + DATA_LEN(buf) = rte_pktmbuf_tailroom(buf); + PKT_LEN(buf) = DATA_LEN(buf); + NB_SEGS(buf) = 1; + (*rxq_ctrl->rxq.elts)[i] = buf; + } + /* If Rx vector is activated. */ + if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + struct rte_mbuf *mbuf_init = &rxq->fake_mbuf; + struct rte_pktmbuf_pool_private *priv = + (struct rte_pktmbuf_pool_private *) + rte_mempool_get_priv(rxq_ctrl->rxq.mp); + int j; + + /* Initialize default rearm_data for vPMD. */ + mbuf_init->data_off = RTE_PKTMBUF_HEADROOM; + rte_mbuf_refcnt_set(mbuf_init, 1); + mbuf_init->nb_segs = 1; + mbuf_init->port = rxq->port_id; + if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) + mbuf_init->ol_flags = EXT_ATTACHED_MBUF; + /* + * prevent compiler reordering: + * rearm_data covers previous fields. + */ + rte_compiler_barrier(); + rxq->mbuf_initializer = + *(rte_xmm_t *)&mbuf_init->rearm_data; + /* Padding with a fake mbuf for vectorized Rx. */ + for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j) + (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments" + " (max %u packets)", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n, + elts_n / (1 << rxq_ctrl->rxq.sges_n)); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + elts_n = i; + for (i = 0; (i != elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +int +rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); +} + +/** + * Free RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + uint16_t i; + + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs", + rxq->port_id, rxq->idx); + if (rxq->mprq_bufs == NULL) + return; + MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0); + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + if (rxq->mprq_repl != NULL) { + mlx5_mprq_buf_free(rxq->mprq_repl); + rxq->mprq_repl = NULL; + } +} + +/** + * Free RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + const uint16_t q_n = (1 << rxq->elts_n); + const uint16_t q_mask = q_n - 1; + uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); + uint16_t i; + + DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", + PORT_ID(rxq_ctrl->priv), rxq->idx); + if (rxq->elts == NULL) + return; + /** + * Some mbuf in the Ring belongs to the application. 
They cannot be + * freed. + */ + if (mlx5_rxq_check_vec_support(rxq) > 0) { + for (i = 0; i < used; ++i) + (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; + rxq->rq_pi = rxq->rq_ci; + } + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq->elts)[i]); + (*rxq->elts)[i] = NULL; + } +} + +/** + * Free RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + rxq_free_elts_mprq(rxq_ctrl); + else + rxq_free_elts_sprq(rxq_ctrl); +} + +/** + * Returns the per-queue supported offloads. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Supported Rx offloads. + */ +uint64_t +mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER | + DEV_RX_OFFLOAD_TIMESTAMP | + DEV_RX_OFFLOAD_JUMBO_FRAME | + DEV_RX_OFFLOAD_RSS_HASH); + + if (config->hw_fcs_strip) + offloads |= DEV_RX_OFFLOAD_KEEP_CRC; + + if (config->hw_csum) + offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM); + if (config->hw_vlan_strip) + offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; + if (MLX5_LRO_SUPPORTED(dev)) + offloads |= DEV_RX_OFFLOAD_TCP_LRO; + return offloads; +} + + +/** + * Returns the per-port supported offloads. + * + * @return + * Supported Rx offloads. + */ +uint64_t +mlx5_get_rx_port_offloads(void) +{ + uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER; + + return offloads; +} + +/** + * Verify if the queue can be released. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * 1 if the queue can be released + * 0 if the queue can not be released, there are references to it. + * Negative errno and rte_errno is set if queue doesn't exist. + */ +static int +mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + + if (!(*priv->rxqs)[idx]) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); + return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1); +} + +/** + * Rx queue presetup checks. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
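+ *
+ * Note that a descriptor count which is not a power of two is silently
+ * rounded up, so a request for e.g. 1000 descriptors actually configures
+ * 1024; the adjusted value is what the rest of the setup path sees:
+ *
+ *	if (!rte_is_power_of_2(desc))
+ *		desc = 1 << log2above(desc);	// e.g. 1000 -> 1024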
+ */ +static int +mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Rx queue %u" + " to the next power of two (%d)", + dev->data->port_id, idx, desc); + } + DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors", + dev->data->port_id, idx, desc); + if (idx >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->rxqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_rxq_releasable(dev, idx)) { + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + rte_errno = EBUSY; + return -rte_errno; + } + mlx5_rxq_release(dev, idx); + return 0; +} + +/** + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * @param mp + * Memory pool for buffer allocations. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + int res; + + res = mlx5_rx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp); + if (!rxq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + rte_errno = ENOMEM; + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Rx queue %u to list", + dev->data->port_id, idx); + (*priv->rxqs)[idx] = &rxq_ctrl->rxq; + return 0; +} + +/** + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * Hairpin configuration parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
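+ *
+ * Illustrative application-side sketch (generic ethdev API, not defined in
+ * this file) binding Rx hairpin queue idx to Tx queue txq_id of the same
+ * port, which is the only peer layout accepted by the checks below:
+ *
+ *	struct rte_eth_hairpin_conf conf = {
+ *		.peer_count = 1,
+ *		.peers[0] = { .port = port_id, .queue = txq_id },
+ *	};
+ *
+ *	rte_eth_rx_hairpin_queue_setup(port_id, idx, desc, &conf);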
+ */
+int
+mlx5_rx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			    uint16_t desc,
+			    const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	int res;
+
+	res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+	if (res)
+		return res;
+	if (hairpin_conf->peer_count != 1 ||
+	    hairpin_conf->peers[0].port != dev->data->port_id ||
+	    hairpin_conf->peers[0].queue >= priv->txqs_n) {
+		DRV_LOG(ERR, "port %u unable to set up hairpin queue index %u:"
+			" invalid hairpin configuration", dev->data->port_id,
+			idx);
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	rxq_ctrl = mlx5_rxq_hairpin_new(dev, idx, desc, hairpin_conf);
+	if (!rxq_ctrl) {
+		DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+			dev->data->port_id, idx);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
+		dev->data->port_id, idx);
+	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+	return 0;
+}
+
+/**
+ * DPDK callback to release an Rx queue.
+ *
+ * @param dpdk_rxq
+ *   Generic Rx queue pointer.
+ */
+void
+mlx5_rx_queue_release(void *dpdk_rxq)
+{
+	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+	struct mlx5_priv *priv;
+
+	if (rxq == NULL)
+		return;
+	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	priv = rxq_ctrl->priv;
+	if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.idx))
+		rte_panic("port %u Rx queue %u is still used by a flow and"
+			  " cannot be removed\n",
+			  PORT_ID(priv), rxq->idx);
+	mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.idx);
+}
+
+/**
+ * Get an Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ *
+ * @return
+ *   The Verbs/DevX object if it exists.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_get(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+
+	if (idx >= priv->rxqs_n)
+		return NULL;
+	if (!rxq_data)
+		return NULL;
+	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	if (rxq_ctrl->obj)
+		rte_atomic32_inc(&rxq_ctrl->obj->refcnt);
+	return rxq_ctrl->obj;
+}
+
+/**
+ * Release the resources allocated for an RQ DevX object.
+ *
+ * @param rxq_ctrl
+ *   DevX Rx queue object.
+ */
+static void
+rxq_release_rq_resources(struct mlx5_rxq_ctrl *rxq_ctrl)
+{
+	if (rxq_ctrl->rxq.wqes) {
+		rte_free((void *)(uintptr_t)rxq_ctrl->rxq.wqes);
+		rxq_ctrl->rxq.wqes = NULL;
+	}
+	if (rxq_ctrl->wq_umem) {
+		mlx5_glue->devx_umem_dereg(rxq_ctrl->wq_umem);
+		rxq_ctrl->wq_umem = NULL;
+	}
+}
+
+/**
+ * Release Rx hairpin related resources.
+ *
+ * @param rxq_obj
+ *   Hairpin Rx queue object.
+ */
+static void
+rxq_obj_hairpin_release(struct mlx5_rxq_obj *rxq_obj)
+{
+	struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
+
+	MLX5_ASSERT(rxq_obj);
+	rq_attr.state = MLX5_RQC_STATE_RST;
+	rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+	mlx5_devx_cmd_modify_rq(rxq_obj->rq, &rq_attr);
+	claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
+}
+
+/**
+ * Release an Rx Verbs/DevX queue object.
+ *
+ * @param rxq_obj
+ *   Verbs/DevX Rx queue object.
+ *
+ * @return
+ *   1 while a reference on it exists, 0 when freed.
+ */ +static int +mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj) +{ + MLX5_ASSERT(rxq_obj); + if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) { + switch (rxq_obj->type) { + case MLX5_RXQ_OBJ_TYPE_IBV: + MLX5_ASSERT(rxq_obj->wq); + MLX5_ASSERT(rxq_obj->cq); + rxq_free_elts(rxq_obj->rxq_ctrl); + claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq)); + claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq)); + break; + case MLX5_RXQ_OBJ_TYPE_DEVX_RQ: + MLX5_ASSERT(rxq_obj->cq); + MLX5_ASSERT(rxq_obj->rq); + rxq_free_elts(rxq_obj->rxq_ctrl); + claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq)); + rxq_release_rq_resources(rxq_obj->rxq_ctrl); + claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq)); + break; + case MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN: + MLX5_ASSERT(rxq_obj->rq); + rxq_obj_hairpin_release(rxq_obj); + break; + } + if (rxq_obj->channel) + claim_zero(mlx5_glue->destroy_comp_channel + (rxq_obj->channel)); + LIST_REMOVE(rxq_obj, next); + rte_free(rxq_obj); + return 0; + } + return 1; +} + +/** + * Allocate queue vector and fill epoll fd list for Rx interrupts. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + unsigned int rxqs_n = priv->rxqs_n; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + unsigned int count = 0; + struct rte_intr_handle *intr_handle = dev->intr_handle; + + if (!dev->data->dev_conf.intr_conf.rxq) + return 0; + mlx5_rx_intr_vec_disable(dev); + intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0])); + if (intr_handle->intr_vec == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate memory for interrupt" + " vector, Rx interrupts will not be supported", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; + } + intr_handle->type = RTE_INTR_HANDLE_EXT; + for (i = 0; i != n; ++i) { + /* This rxq obj must not be released in this function. */ + struct mlx5_rxq_obj *rxq_obj = mlx5_rxq_obj_get(dev, i); + int fd; + int flags; + int rc; + + /* Skip queues that cannot request interrupts. */ + if (!rxq_obj || !rxq_obj->channel) { + /* Use invalid intr_vec[] index to disable entry. */ + intr_handle->intr_vec[i] = + RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID; + continue; + } + if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { + DRV_LOG(ERR, + "port %u too many Rx queues for interrupt" + " vector size (%d), Rx interrupts cannot be" + " enabled", + dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID); + mlx5_rx_intr_vec_disable(dev); + rte_errno = ENOMEM; + return -rte_errno; + } + fd = rxq_obj->channel->fd; + flags = fcntl(fd, F_GETFL); + rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); + if (rc < 0) { + rte_errno = errno; + DRV_LOG(ERR, + "port %u failed to make Rx interrupt file" + " descriptor %d non-blocking for queue index" + " %d", + dev->data->port_id, fd, i); + mlx5_rx_intr_vec_disable(dev); + return -rte_errno; + } + intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; + intr_handle->efds[count] = fd; + count++; + } + if (!count) + mlx5_rx_intr_vec_disable(dev); + else + intr_handle->nb_efd = count; + return 0; +} + +/** + * Clean up Rx interrupts handler. + * + * @param dev + * Pointer to Ethernet device. 
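+ *
+ * For reference, the event file descriptors collected by
+ * mlx5_rx_intr_vec_enable() are consumed through the generic ethdev/EAL
+ * interrupt API; an illustrative application-side wait (error handling
+ * omitted):
+ *
+ *	struct rte_epoll_event ev;
+ *
+ *	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
+ *				  RTE_INTR_EVENT_ADD, NULL);
+ *	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+ *	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);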
+ */ +void +mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_intr_handle *intr_handle = dev->intr_handle; + unsigned int i; + unsigned int rxqs_n = priv->rxqs_n; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + if (!dev->data->dev_conf.intr_conf.rxq) + return; + if (!intr_handle->intr_vec) + goto free; + for (i = 0; i != n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_rxq_data *rxq_data; + + if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID) + continue; + /** + * Need to access directly the queue to release the reference + * kept in mlx5_rx_intr_vec_enable(). + */ + rxq_data = (*priv->rxqs)[i]; + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl->obj) + mlx5_rxq_obj_release(rxq_ctrl->obj); + } +free: + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->intr_vec) + free(intr_handle->intr_vec); + intr_handle->nb_efd = 0; + intr_handle->intr_vec = NULL; +} + +/** + * MLX5 CQ notification . + * + * @param rxq + * Pointer to receive queue structure. + * @param sq_n_rxq + * Sequence number per receive queue . + */ +static inline void +mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) +{ + int sq_n = 0; + uint32_t doorbell_hi; + uint64_t doorbell; + void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL; + + sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK; + doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK); + doorbell = (uint64_t)doorbell_hi << 32; + doorbell |= rxq->cqn; + rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi); + mlx5_uar_write64(rte_cpu_to_be_64(doorbell), + cq_db_reg, rxq->uar_lock_cq); +} + +/** + * DPDK callback for Rx queue interrupt enable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + + rxq_data = (*priv->rxqs)[rx_queue_id]; + if (!rxq_data) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (rxq_ctrl->irq) { + struct mlx5_rxq_obj *rxq_obj; + + rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id); + if (!rxq_obj) { + rte_errno = EINVAL; + return -rte_errno; + } + mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn); + mlx5_rxq_obj_release(rxq_obj); + } + return 0; +} + +/** + * DPDK callback for Rx queue interrupt disable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data; + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_rxq_obj *rxq_obj = NULL; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + rxq_data = (*priv->rxqs)[rx_queue_id]; + if (!rxq_data) { + rte_errno = EINVAL; + return -rte_errno; + } + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + if (!rxq_ctrl->irq) + return 0; + rxq_obj = mlx5_rxq_obj_get(dev, rx_queue_id); + if (!rxq_obj) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_glue->get_cq_event(rxq_obj->channel, &ev_cq, &ev_ctx); + if (ret || ev_cq != rxq_obj->cq) { + rte_errno = EINVAL; + goto exit; + } + rxq_data->cq_arm_sn++; + mlx5_glue->ack_cq_events(rxq_obj->cq, 1); + mlx5_rxq_obj_release(rxq_obj); + return 0; +exit: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (rxq_obj) + mlx5_rxq_obj_release(rxq_obj); + DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d", + dev->data->port_id, rx_queue_id); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Create a CQ Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param priv + * Pointer to device private data. + * @param rxq_data + * Pointer to Rx queue data. + * @param cqe_n + * Number of CQEs in CQ. + * @param rxq_obj + * Pointer to Rx queue object data. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +static struct ibv_cq * +mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv, + struct mlx5_rxq_data *rxq_data, + unsigned int cqe_n, struct mlx5_rxq_obj *rxq_obj) +{ + struct { + struct ibv_cq_init_attr_ex ibv; + struct mlx5dv_cq_init_attr mlx5; + } cq_attr; + + cq_attr.ibv = (struct ibv_cq_init_attr_ex){ + .cqe = cqe_n, + .channel = rxq_obj->channel, + .comp_mask = 0, + }; + cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){ + .comp_mask = 0, + }; + if (priv->config.cqe_comp && !rxq_data->hw_timestamp && + !rxq_data->lro) { + cq_attr.mlx5.comp_mask |= + MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + cq_attr.mlx5.cqe_comp_res_format = + mlx5_rxq_mprq_enabled(rxq_data) ? + MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX : + MLX5DV_CQE_RES_FORMAT_HASH; +#else + cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH; +#endif + /* + * For vectorized Rx, it must not be doubled in order to + * make cq_ci and rq_ci aligned. + */ + if (mlx5_rxq_check_vec_support(rxq_data) < 0) + cq_attr.ibv.cqe *= 2; + } else if (priv->config.cqe_comp && rxq_data->hw_timestamp) { + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for HW" + " timestamp", + dev->data->port_id); + } else if (priv->config.cqe_comp && rxq_data->lro) { + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for LRO", + dev->data->port_id); + } +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + if (priv->config.cqe_pad) { + cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS; + cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD; + } +#endif + return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx, + &cq_attr.ibv, + &cq_attr.mlx5)); +} + +/** + * Create a WQ Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param priv + * Pointer to device private data. + * @param rxq_data + * Pointer to Rx queue data. + * @param idx + * Queue index in DPDK Rx queue array + * @param wqe_n + * Number of WQEs in WQ. 
+ * @param rxq_obj + * Pointer to Rx queue object data. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +static struct ibv_wq * +mlx5_ibv_wq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv, + struct mlx5_rxq_data *rxq_data, uint16_t idx, + unsigned int wqe_n, struct mlx5_rxq_obj *rxq_obj) +{ + struct { + struct ibv_wq_init_attr ibv; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + struct mlx5dv_wq_init_attr mlx5; +#endif + } wq_attr; + + wq_attr.ibv = (struct ibv_wq_init_attr){ + .wq_context = NULL, /* Could be useful in the future. */ + .wq_type = IBV_WQT_RQ, + /* Max number of outstanding WRs. */ + .max_wr = wqe_n >> rxq_data->sges_n, + /* Max number of scatter/gather elements in a WR. */ + .max_sge = 1 << rxq_data->sges_n, + .pd = priv->sh->pd, + .cq = rxq_obj->cq, + .comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0, + .create_flags = (rxq_data->vlan_strip ? + IBV_WQ_FLAGS_CVLAN_STRIPPING : 0), + }; + /* By default, FCS (CRC) is stripped by hardware. */ + if (rxq_data->crc_present) { + wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + } + if (priv->config.hw_padding) { +#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) + wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; +#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) + wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING; + wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; +#endif + } +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){ + .comp_mask = 0, + }; + if (mlx5_rxq_mprq_enabled(rxq_data)) { + struct mlx5dv_striding_rq_init_attr *mprq_attr = + &wq_attr.mlx5.striding_rq_attrs; + + wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; + *mprq_attr = (struct mlx5dv_striding_rq_init_attr){ + .single_stride_log_num_of_bytes = rxq_data->strd_sz_n, + .single_wqe_log_num_of_strides = rxq_data->strd_num_n, + .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT, + }; + } + rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv, + &wq_attr.mlx5); +#else + rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv); +#endif + if (rxq_obj->wq) { + /* + * Make sure number of WRs*SGEs match expectations since a queue + * cannot allocate more than "desc" buffers. + */ + if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) || + wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) { + DRV_LOG(ERR, + "port %u Rx queue %u requested %u*%u but got" + " %u*%u WRs*SGEs", + dev->data->port_id, idx, + wqe_n >> rxq_data->sges_n, + (1 << rxq_data->sges_n), + wq_attr.ibv.max_wr, wq_attr.ibv.max_sge); + claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq)); + rxq_obj->wq = NULL; + rte_errno = EINVAL; + } + } + return rxq_obj->wq; +} + +/** + * Fill common fields of create RQ attributes structure. + * + * @param rxq_data + * Pointer to Rx queue data. + * @param cqn + * CQ number to use with this RQ. + * @param rq_attr + * RQ attributes structure to fill.. + */ +static void +mlx5_devx_create_rq_attr_fill(struct mlx5_rxq_data *rxq_data, uint32_t cqn, + struct mlx5_devx_create_rq_attr *rq_attr) +{ + rq_attr->state = MLX5_RQC_STATE_RST; + rq_attr->vsd = (rxq_data->vlan_strip) ? 0 : 1; + rq_attr->cqn = cqn; + rq_attr->scatter_fcs = (rxq_data->crc_present) ? 1 : 0; +} + +/** + * Fill common fields of DevX WQ attributes structure. + * + * @param priv + * Pointer to device private data. + * @param rxq_ctrl + * Pointer to Rx queue control structure. 
+ * @param wq_attr + * WQ attributes structure to fill.. + */ +static void +mlx5_devx_wq_attr_fill(struct mlx5_priv *priv, struct mlx5_rxq_ctrl *rxq_ctrl, + struct mlx5_devx_wq_attr *wq_attr) +{ + wq_attr->end_padding_mode = priv->config.cqe_pad ? + MLX5_WQ_END_PAD_MODE_ALIGN : + MLX5_WQ_END_PAD_MODE_NONE; + wq_attr->pd = priv->sh->pdn; + wq_attr->dbr_addr = rxq_ctrl->dbr_offset; + wq_attr->dbr_umem_id = rxq_ctrl->dbr_umem_id; + wq_attr->dbr_umem_valid = 1; + wq_attr->wq_umem_id = rxq_ctrl->wq_umem->umem_id; + wq_attr->wq_umem_valid = 1; +} + +/** + * Create a RQ object using DevX. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Rx queue array + * @param cqn + * CQ number to use with this RQ. + * + * @return + * The DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_devx_obj * +mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + struct mlx5_devx_create_rq_attr rq_attr; + uint32_t wqe_n = 1 << (rxq_data->elts_n - rxq_data->sges_n); + uint32_t wq_size = 0; + uint32_t wqe_size = 0; + uint32_t log_wqe_size = 0; + void *buf = NULL; + struct mlx5_devx_obj *rq; + + memset(&rq_attr, 0, sizeof(rq_attr)); + /* Fill RQ attributes. */ + rq_attr.mem_rq_type = MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE; + rq_attr.flush_in_error_en = 1; + mlx5_devx_create_rq_attr_fill(rxq_data, cqn, &rq_attr); + /* Fill WQ attributes for this RQ. */ + if (mlx5_rxq_mprq_enabled(rxq_data)) { + rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ; + /* + * Number of strides in each WQE: + * 512*2^single_wqe_log_num_of_strides. + */ + rq_attr.wq_attr.single_wqe_log_num_of_strides = + rxq_data->strd_num_n - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + /* Stride size = (2^single_stride_log_num_of_bytes)*64B. */ + rq_attr.wq_attr.single_stride_log_num_of_bytes = + rxq_data->strd_sz_n - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; + wqe_size = sizeof(struct mlx5_wqe_mprq); + } else { + rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC; + wqe_size = sizeof(struct mlx5_wqe_data_seg); + } + log_wqe_size = log2above(wqe_size) + rxq_data->sges_n; + rq_attr.wq_attr.log_wq_stride = log_wqe_size; + rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n - rxq_data->sges_n; + /* Calculate and allocate WQ memory space. */ + wqe_size = 1 << log_wqe_size; /* round up power of two.*/ + wq_size = wqe_n * wqe_size; + buf = rte_calloc_socket(__func__, 1, wq_size, MLX5_WQE_BUF_ALIGNMENT, + rxq_ctrl->socket); + if (!buf) + return NULL; + rxq_data->wqes = buf; + rxq_ctrl->wq_umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, + buf, wq_size, 0); + if (!rxq_ctrl->wq_umem) { + rte_free(buf); + return NULL; + } + mlx5_devx_wq_attr_fill(priv, rxq_ctrl, &rq_attr.wq_attr); + rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &rq_attr, rxq_ctrl->socket); + if (!rq) + rxq_release_rq_resources(rxq_ctrl); + return rq; +} + +/** + * Create the Rx hairpin queue object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Rx queue array + * + * @return + * The hairpin DevX object initialised, NULL otherwise and rte_errno is set. 
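+ *
+ * The hairpin WQ sizing below works on log2 values: an explicitly requested
+ * data size larger than the firmware limit log_max_hairpin_wq_data_sz is
+ * rejected with ERANGE, the default is clamped to that limit, and the
+ * packet count is then derived from the data size, roughly:
+ *
+ *	log_hairpin_num_packets = log_hairpin_data_sz -
+ *				  MLX5_HAIRPIN_QUEUE_STRIDE;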
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_devx_create_rq_attr attr = { 0 };
+	struct mlx5_rxq_obj *tmpl = NULL;
+	int ret = 0;
+	uint32_t max_wq_data;
+
+	MLX5_ASSERT(rxq_data);
+	MLX5_ASSERT(!rxq_ctrl->obj);
+	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+				 rxq_ctrl->socket);
+	if (!tmpl) {
+		DRV_LOG(ERR,
+			"port %u Rx queue %u cannot allocate verbs resources",
+			dev->data->port_id, rxq_data->idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	tmpl->type = MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN;
+	tmpl->rxq_ctrl = rxq_ctrl;
+	attr.hairpin = 1;
+	max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz;
+	/* Jumbo frames > 9 KB should be supported, and more packets per queue. */
+	if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) {
+		if (priv->config.log_hp_size > max_wq_data) {
+			DRV_LOG(ERR, "total data size %u power of 2 is "
+				"too large for hairpin",
+				priv->config.log_hp_size);
+			rte_errno = ERANGE;
+			return NULL;
+		}
+		attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size;
+	} else {
+		attr.wq_attr.log_hairpin_data_sz =
+				(max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ?
+				 max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE;
+	}
+	/* Set the packets number to the maximum value for performance. */
+	attr.wq_attr.log_hairpin_num_packets =
+			attr.wq_attr.log_hairpin_data_sz -
+			MLX5_HAIRPIN_QUEUE_STRIDE;
+	tmpl->rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &attr,
+					   rxq_ctrl->socket);
+	if (!tmpl->rq) {
+		DRV_LOG(ERR,
+			"port %u Rx hairpin queue %u cannot create RQ object",
+			dev->data->port_id, idx);
+		rte_errno = errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
+		idx, (void *)&tmpl);
+	rte_atomic32_inc(&tmpl->refcnt);
+	LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
+	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+	return tmpl;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	if (tmpl && tmpl->rq)
+		mlx5_devx_cmd_destroy(tmpl->rq);
+	rte_errno = ret; /* Restore rte_errno. */
+	return NULL;
+}
+
+/**
+ * Create the Rx queue Verbs/DevX object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ * @param type
+ *   Type of Rx queue object to create.
+ *
+ * @return
+ *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
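+ *
+ * The completion queue is sized from the work queue: with a Multi-Packet RQ
+ * every stride may complete separately, so cqe_n = wqe_n * strides - 1,
+ * otherwise cqe_n = wqe_n - 1. As a worked example with illustrative
+ * figures, elts_n = 10 gives wqe_n = 1024, and strd_num_n = 6 (64 strides
+ * per WQE) yields a CQ of 1024 * 64 - 1 = 65535 entries.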
+ */ +struct mlx5_rxq_obj * +mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_rxq_obj_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + struct ibv_wq_attr mod; + unsigned int cqe_n; + unsigned int wqe_n = 1 << rxq_data->elts_n; + struct mlx5_rxq_obj *tmpl = NULL; + struct mlx5dv_cq cq_info; + struct mlx5dv_rwq rwq; + int ret = 0; + struct mlx5dv_obj obj; + + MLX5_ASSERT(rxq_data); + MLX5_ASSERT(!rxq_ctrl->obj); + if (type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN) + return mlx5_rxq_obj_hairpin_new(dev, idx); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE; + priv->verbs_alloc_ctx.obj = rxq_ctrl; + tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, + rxq_ctrl->socket); + if (!tmpl) { + DRV_LOG(ERR, + "port %u Rx queue %u cannot allocate verbs resources", + dev->data->port_id, rxq_data->idx); + rte_errno = ENOMEM; + goto error; + } + tmpl->type = type; + tmpl->rxq_ctrl = rxq_ctrl; + if (rxq_ctrl->irq) { + tmpl->channel = mlx5_glue->create_comp_channel(priv->sh->ctx); + if (!tmpl->channel) { + DRV_LOG(ERR, "port %u: comp channel creation failure", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + } + if (mlx5_rxq_mprq_enabled(rxq_data)) + cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1; + else + cqe_n = wqe_n - 1; + tmpl->cq = mlx5_ibv_cq_new(dev, priv, rxq_data, cqe_n, tmpl); + if (!tmpl->cq) { + DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + obj.cq.in = tmpl->cq; + obj.cq.out = &cq_info; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ); + if (ret) { + rte_errno = ret; + goto error; + } + if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; + goto error; + } + DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d", + dev->data->port_id, priv->sh->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u device_attr.max_sge is %d", + dev->data->port_id, priv->sh->device_attr.orig_attr.max_sge); + /* Allocate door-bell for types created with DevX. */ + if (tmpl->type != MLX5_RXQ_OBJ_TYPE_IBV) { + struct mlx5_devx_dbr_page *dbr_page; + int64_t dbr_offset; + + dbr_offset = mlx5_get_dbr(dev, &dbr_page); + if (dbr_offset < 0) + goto error; + rxq_ctrl->dbr_offset = dbr_offset; + rxq_ctrl->dbr_umem_id = dbr_page->umem->umem_id; + rxq_ctrl->dbr_umem_id_valid = 1; + rxq_data->rq_db = (uint32_t *)((uintptr_t)dbr_page->dbrs + + (uintptr_t)rxq_ctrl->dbr_offset); + } + if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV) { + tmpl->wq = mlx5_ibv_wq_new(dev, priv, rxq_data, idx, wqe_n, + tmpl); + if (!tmpl->wq) { + DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + /* Change queue state to ready. 
*/ + mod = (struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = IBV_WQS_RDY, + }; + ret = mlx5_glue->modify_wq(tmpl->wq, &mod); + if (ret) { + DRV_LOG(ERR, + "port %u Rx queue %u WQ state to IBV_WQS_RDY" + " failed", dev->data->port_id, idx); + rte_errno = ret; + goto error; + } + obj.rwq.in = tmpl->wq; + obj.rwq.out = &rwq; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ); + if (ret) { + rte_errno = ret; + goto error; + } + rxq_data->wqes = rwq.buf; + rxq_data->rq_db = rwq.dbrec; + } else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) { + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + tmpl->rq = mlx5_devx_rq_new(dev, idx, cq_info.cqn); + if (!tmpl->rq) { + DRV_LOG(ERR, "port %u Rx queue %u RQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + /* Change queue state to ready. */ + rq_attr.rq_state = MLX5_RQC_STATE_RST; + rq_attr.state = MLX5_RQC_STATE_RDY; + ret = mlx5_devx_cmd_modify_rq(tmpl->rq, &rq_attr); + if (ret) + goto error; + } + /* Fill the rings. */ + rxq_data->cqe_n = log2above(cq_info.cqe_cnt); + rxq_data->cq_db = cq_info.dbrec; + rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf; + rxq_data->cq_uar = cq_info.cq_uar; + rxq_data->cqn = cq_info.cqn; + rxq_data->cq_arm_sn = 0; + mlx5_rxq_initialize(rxq_data); + rxq_data->cq_ci = 0; + DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return tmpl; +error: + if (tmpl) { + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl->type == MLX5_RXQ_OBJ_TYPE_IBV && tmpl->wq) + claim_zero(mlx5_glue->destroy_wq(tmpl->wq)); + else if (tmpl->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ && tmpl->rq) + claim_zero(mlx5_devx_cmd_destroy(tmpl->rq)); + if (tmpl->cq) + claim_zero(mlx5_glue->destroy_cq(tmpl->cq)); + if (tmpl->channel) + claim_zero(mlx5_glue->destroy_comp_channel + (tmpl->channel)); + rte_free(tmpl); + rte_errno = ret; /* Restore rte_errno. */ + } + if (type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) + rxq_release_rq_resources(rxq_ctrl); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return NULL; +} + +/** + * Verify the Rx queue objects list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of objects not released. + */ +int +mlx5_rxq_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + struct mlx5_rxq_obj *rxq_obj; + + LIST_FOREACH(rxq_obj, &priv->rxqsobj, next) { + DRV_LOG(DEBUG, "port %u Rx queue %u still referenced", + dev->data->port_id, rxq_obj->rxq_ctrl->rxq.idx); + ++ret; + } + return ret; +} + +/** + * Callback function to initialize mbufs for Multi-Packet RQ. + */ +static inline void +mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg, + void *_m, unsigned int i __rte_unused) +{ + struct mlx5_mprq_buf *buf = _m; + struct rte_mbuf_ext_shared_info *shinfo; + unsigned int strd_n = (unsigned int)(uintptr_t)opaque_arg; + unsigned int j; + + memset(_m, 0, sizeof(*buf)); + buf->mp = mp; + rte_atomic16_set(&buf->refcnt, 1); + for (j = 0; j != strd_n; ++j) { + shinfo = &buf->shinfos[j]; + shinfo->free_cb = mlx5_mprq_buf_free_cb; + shinfo->fcb_opaque = buf; + } +} + +/** + * Free mempool of Multi-Packet RQ. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. 
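+ *
+ * Note: if any buffer from the pool is still attached to an mbuf held by
+ * the application, the pool is not full and this call fails with
+ * rte_errno set to EBUSY instead of freeing the mempool.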
+ */ +int +mlx5_mprq_free_mp(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + unsigned int i; + + if (mp == NULL) + return 0; + DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ", + dev->data->port_id, mp->name); + /* + * If a buffer in the pool has been externally attached to a mbuf and it + * is still in use by application, destroying the Rx queue can spoil + * the packet. It is unlikely to happen but if application dynamically + * creates and destroys with holding Rx packets, this can happen. + * + * TODO: It is unavoidable for now because the mempool for Multi-Packet + * RQ isn't provided by application but managed by PMD. + */ + if (!rte_mempool_full(mp)) { + DRV_LOG(ERR, + "port %u mempool for Multi-Packet RQ is still in use", + dev->data->port_id); + rte_errno = EBUSY; + return -rte_errno; + } + rte_mempool_free(mp); + /* Unset mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = NULL; + } + priv->mprq_mp = NULL; + return 0; +} + +/** + * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the + * mempool. If already allocated, reuse it if there're enough elements. + * Otherwise, resize it. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + char name[RTE_MEMPOOL_NAMESIZE]; + unsigned int desc = 0; + unsigned int buf_len; + unsigned int obj_num; + unsigned int obj_size; + unsigned int strd_num_n = 0; + unsigned int strd_sz_n = 0; + unsigned int i; + unsigned int n_ibv = 0; + + if (!mlx5_mprq_enabled(dev)) + return 0; + /* Count the total number of descriptors configured. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + n_ibv++; + desc += 1 << rxq->elts_n; + /* Get the max number of strides. */ + if (strd_num_n < rxq->strd_num_n) + strd_num_n = rxq->strd_num_n; + /* Get the max size of a stride. */ + if (strd_sz_n < rxq->strd_sz_n) + strd_sz_n = rxq->strd_sz_n; + } + MLX5_ASSERT(strd_num_n && strd_sz_n); + buf_len = (1 << strd_num_n) * (1 << strd_sz_n); + obj_size = sizeof(struct mlx5_mprq_buf) + buf_len + (1 << strd_num_n) * + sizeof(struct rte_mbuf_ext_shared_info) + RTE_PKTMBUF_HEADROOM; + /* + * Received packets can be either memcpy'd or externally referenced. In + * case that the packet is attached to an mbuf as an external buffer, as + * it isn't possible to predict how the buffers will be queued by + * application, there's no option to exactly pre-allocate needed buffers + * in advance but to speculatively prepares enough buffers. + * + * In the data path, if this Mempool is depleted, PMD will try to memcpy + * received packets to buffers provided by application (rxq->mp) until + * this Mempool gets available again. + */ + desc *= 4; + obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * n_ibv; + /* + * rte_mempool_create_empty() has sanity check to refuse large cache + * size compared to the number of elements. + * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a + * constant number 2 instead. 
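+ *
+ * Worked example (illustrative numbers only): with 4 standard Rx queues
+ * of 512 descriptors each, strd_num_n = 6 and strd_sz_n = 11:
+ *   buf_len  = 64 * 2048 = 128KB,
+ *   obj_size = sizeof(struct mlx5_mprq_buf) + 128KB +
+ *              64 * sizeof(struct rte_mbuf_ext_shared_info) +
+ *              RTE_PKTMBUF_HEADROOM,
+ *   desc     = 4 * 512 = 2048, multiplied by 4 above => 8192,
+ *   obj_num  = 8192 + MLX5_MPRQ_MP_CACHE_SZ * 4, floored below at
+ *              MLX5_MPRQ_MP_CACHE_SZ * 2.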
+ */ + obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2); + /* Check a mempool is already allocated and if it can be resued. */ + if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) { + DRV_LOG(DEBUG, "port %u mempool %s is being reused", + dev->data->port_id, mp->name); + /* Reuse. */ + goto exit; + } else if (mp != NULL) { + DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it", + dev->data->port_id, mp->name); + /* + * If failed to free, which means it may be still in use, no way + * but to keep using the existing one. On buffer underrun, + * packets will be memcpy'd instead of external buffer + * attachment. + */ + if (mlx5_mprq_free_mp(dev)) { + if (mp->elt_size >= obj_size) + goto exit; + else + return -rte_errno; + } + } + snprintf(name, sizeof(name), "port-%u-mprq", dev->data->port_id); + mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ, + 0, NULL, NULL, mlx5_mprq_buf_init, + (void *)(uintptr_t)(1 << strd_num_n), + dev->device->numa_node, 0); + if (mp == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate a mempool for" + " Multi-Packet RQ, count=%u, size=%u", + dev->data->port_id, obj_num, obj_size); + rte_errno = ENOMEM; + return -rte_errno; + } + priv->mprq_mp = mp; +exit: + /* Set mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = container_of + (rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD) + continue; + rxq->mprq_mp = mp; + } + DRV_LOG(INFO, "port %u Multi-Packet RQ is configured", + dev->data->port_id); + return 0; +} + +#define MLX5_MAX_TCP_HDR_OFFSET ((unsigned int)(sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr) * 2 + \ + sizeof(struct rte_ipv6_hdr))) +#define MAX_TCP_OPTION_SIZE 40u +#define MLX5_MAX_LRO_HEADER_FIX ((unsigned int)(MLX5_MAX_TCP_HDR_OFFSET + \ + sizeof(struct rte_tcp_hdr) + \ + MAX_TCP_OPTION_SIZE)) + +/** + * Adjust the maximum LRO massage size. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param max_lro_size + * The maximum size for LRO packet. + */ +static void +mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint16_t idx, + uint32_t max_lro_size) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (priv->config.hca_attr.lro_max_msg_sz_mode == + MLX5_LRO_MAX_MSG_SIZE_START_FROM_L4 && max_lro_size > + MLX5_MAX_TCP_HDR_OFFSET) + max_lro_size -= MLX5_MAX_TCP_HDR_OFFSET; + max_lro_size = RTE_MIN(max_lro_size, MLX5_MAX_LRO_SIZE); + MLX5_ASSERT(max_lro_size >= MLX5_LRO_SEG_CHUNK_SIZE); + max_lro_size /= MLX5_LRO_SEG_CHUNK_SIZE; + if (priv->max_lro_msg_size) + priv->max_lro_msg_size = + RTE_MIN((uint32_t)priv->max_lro_msg_size, max_lro_size); + else + priv->max_lro_msg_size = max_lro_size; + DRV_LOG(DEBUG, + "port %u Rx Queue %u max LRO message size adjusted to %u bytes", + dev->data->port_id, idx, + priv->max_lro_msg_size * MLX5_LRO_SEG_CHUNK_SIZE); +} + +/** + * Create a DPDK Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. 
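+ *
+ * MPRQ sizing sketch (illustrative values, not a recommendation): with
+ * desc = 512, mprq_stride_nums = 6 and a 2KB stride, the queue becomes a
+ * Multi-Packet RQ when MPRQ is enabled, desc > 2^6, and the maximum
+ * packet plus headroom fits a stride (or a user-specified stride
+ * configuration provides enough room); desc is then trimmed to
+ * 512 >> 6 = 8 multi-packet WQEs, each able to hold up to 64 packets.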
+ */ +struct mlx5_rxq_ctrl * +mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *tmpl; + unsigned int mb_len = rte_pktmbuf_data_room_size(mp); + unsigned int mprq_stride_nums; + unsigned int mprq_stride_size; + unsigned int mprq_stride_cap; + struct mlx5_dev_config *config = &priv->config; + /* + * Always allocate extra slots, even if eventually + * the vector Rx will not be used. + */ + uint16_t desc_n = + desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; + uint64_t offloads = conf->offloads | + dev->data->dev_conf.rxmode.offloads; + unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO); + const int mprq_en = mlx5_check_mprq_support(dev) > 0; + unsigned int max_rx_pkt_len = lro_on_queue ? + dev->data->dev_conf.rxmode.max_lro_pkt_size : + dev->data->dev_conf.rxmode.max_rx_pkt_len; + unsigned int non_scatter_min_mbuf_size = max_rx_pkt_len + + RTE_PKTMBUF_HEADROOM; + unsigned int max_lro_size = 0; + unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM; + + if (non_scatter_min_mbuf_size > mb_len && !(offloads & + DEV_RX_OFFLOAD_SCATTER)) { + DRV_LOG(ERR, "port %u Rx queue %u: Scatter offload is not" + " configured and no enough mbuf space(%u) to contain " + "the maximum RX packet length(%u) with head-room(%u)", + dev->data->port_id, idx, mb_len, max_rx_pkt_len, + RTE_PKTMBUF_HEADROOM); + rte_errno = ENOSPC; + return NULL; + } + tmpl = rte_calloc_socket("RXQ", 1, + sizeof(*tmpl) + + desc_n * sizeof(struct rte_mbuf *), + 0, socket); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->type = MLX5_RXQ_TYPE_STANDARD; + if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + tmpl->socket = socket; + if (dev->data->dev_conf.intr_conf.rxq) + tmpl->irq = 1; + mprq_stride_nums = config->mprq.stride_num_n ? + config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; + mprq_stride_size = non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) ? + log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; + mprq_stride_cap = (config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * + (config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); + /* + * This Rx queue can be configured as a Multi-Packet RQ if all of the + * following conditions are met: + * - MPRQ is enabled. + * - The number of descs is more than the number of strides. + * - max_rx_pkt_len plus overhead is less than the max size + * of a stride or mprq_stride_size is specified by a user. + * Need to nake sure that there are enough stides to encap + * the maximum packet size in case mprq_stride_size is set. + * Otherwise, enable Rx scatter if necessary. + */ + if (mprq_en && desc > (1U << mprq_stride_nums) && + (non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) || + (config->mprq.stride_size_n && + non_scatter_min_mbuf_size <= mprq_stride_cap))) { + /* TODO: Rx scatter isn't supported yet. */ + tmpl->rxq.sges_n = 0; + /* Trim the number of descs needed. */ + desc >>= mprq_stride_nums; + tmpl->rxq.strd_num_n = config->mprq.stride_num_n ? + config->mprq.stride_num_n : mprq_stride_nums; + tmpl->rxq.strd_sz_n = config->mprq.stride_size_n ? 
+ config->mprq.stride_size_n : mprq_stride_size; + tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT; + tmpl->rxq.strd_scatter_en = + !!(offloads & DEV_RX_OFFLOAD_SCATTER); + tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size, + config->mprq.max_memcpy_len); + max_lro_size = RTE_MIN(max_rx_pkt_len, + (1u << tmpl->rxq.strd_num_n) * + (1u << tmpl->rxq.strd_sz_n)); + DRV_LOG(DEBUG, + "port %u Rx queue %u: Multi-Packet RQ is enabled" + " strd_num_n = %u, strd_sz_n = %u", + dev->data->port_id, idx, + tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n); + } else if (max_rx_pkt_len <= first_mb_free_size) { + tmpl->rxq.sges_n = 0; + max_lro_size = max_rx_pkt_len; + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { + unsigned int size = non_scatter_min_mbuf_size; + unsigned int sges_n; + + if (lro_on_queue && first_mb_free_size < + MLX5_MAX_LRO_HEADER_FIX) { + DRV_LOG(ERR, "Not enough space in the first segment(%u)" + " to include the max header size(%u) for LRO", + first_mb_free_size, MLX5_MAX_LRO_HEADER_FIX); + rte_errno = ENOTSUP; + goto error; + } + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = log2above((size / mb_len) + !!(size % mb_len)); + if (sges_n > MLX5_MAX_LOG_RQ_SEGS) { + DRV_LOG(ERR, + "port %u too many SGEs (%u) needed to handle" + " requested maximum packet size %u, the maximum" + " supported are %u", dev->data->port_id, + 1 << sges_n, max_rx_pkt_len, + 1u << MLX5_MAX_LOG_RQ_SEGS); + rte_errno = ENOTSUP; + goto error; + } + tmpl->rxq.sges_n = sges_n; + max_lro_size = max_rx_pkt_len; + } + if (config->mprq.enabled && !mlx5_rxq_mprq_enabled(&tmpl->rxq)) + DRV_LOG(WARNING, + "port %u MPRQ is requested but cannot be enabled\n" + " (requested: pkt_sz = %u, desc_num = %u," + " rxq_num = %u, stride_sz = %u, stride_num = %u\n" + " supported: min_rxqs_num = %u," + " min_stride_sz = %u, max_stride_sz = %u).", + dev->data->port_id, non_scatter_min_mbuf_size, + desc, priv->rxqs_n, + config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : + (1U << mprq_stride_size), + config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : + (1U << mprq_stride_nums), + config->mprq.min_rxqs_num, + (1U << config->mprq.min_stride_size_n), + (1U << config->mprq.max_stride_size_n)); + DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u", + dev->data->port_id, 1 << tmpl->rxq.sges_n); + if (desc % (1 << tmpl->rxq.sges_n)) { + DRV_LOG(ERR, + "port %u number of Rx queue descriptors (%u) is not a" + " multiple of SGEs per packet (%u)", + dev->data->port_id, + desc, + 1 << tmpl->rxq.sges_n); + rte_errno = EINVAL; + goto error; + } + mlx5_max_lro_msg_size_adjust(dev, idx, max_lro_size); + /* Toggle RX checksum offload if hardware supports it. */ + tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM); + tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP); + /* Configure VLAN stripping. */ + tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP); + /* By default, FCS (CRC) is stripped by hardware. */ + tmpl->rxq.crc_present = 0; + tmpl->rxq.lro = lro_on_queue; + if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) { + if (config->hw_fcs_strip) { + /* + * RQs used for LRO-enabled TIRs should not be + * configured to scatter the FCS. 
+ */ + if (lro_on_queue) + DRV_LOG(WARNING, + "port %u CRC stripping has been " + "disabled but will still be performed " + "by hardware, because LRO is enabled", + dev->data->port_id); + else + tmpl->rxq.crc_present = 1; + } else { + DRV_LOG(WARNING, + "port %u CRC stripping has been disabled but will" + " still be performed by hardware, make sure MLNX_OFED" + " and firmware are up to date", + dev->data->port_id); + } + } + DRV_LOG(DEBUG, + "port %u CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + dev->data->port_id, + tmpl->rxq.crc_present ? "disabled" : "enabled", + tmpl->rxq.crc_present << 2); + /* Save port ID. */ + tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf && + (!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS)); + tmpl->rxq.port_id = dev->data->port_id; + tmpl->priv = priv; + tmpl->rxq.mp = mp; + tmpl->rxq.elts_n = log2above(desc); + tmpl->rxq.rq_repl_thresh = + MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n); + tmpl->rxq.elts = + (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); +#ifndef RTE_ARCH_64 + tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq; +#endif + tmpl->rxq.idx = idx; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); + return tmpl; +error: + rte_free(tmpl); + return NULL; +} + +/** + * Create a DPDK Rx hairpin queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * The hairpin binding configuration. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_rxq_ctrl * +mlx5_rxq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *tmpl; + + tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl), 0, SOCKET_ID_ANY); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->type = MLX5_RXQ_TYPE_HAIRPIN; + tmpl->socket = SOCKET_ID_ANY; + tmpl->rxq.rss_hash = 0; + tmpl->rxq.port_id = dev->data->port_id; + tmpl->priv = priv; + tmpl->rxq.mp = NULL; + tmpl->rxq.elts_n = log2above(desc); + tmpl->rxq.elts = NULL; + tmpl->rxq.mr_ctrl.cache_bh = (struct mlx5_mr_btree) { 0 }; + tmpl->hairpin_conf = *hairpin_conf; + tmpl->rxq.idx = idx; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); + return tmpl; +} + +/** + * Get a Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * A pointer to the queue if it exists, NULL otherwise. + */ +struct mlx5_rxq_ctrl * +mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl = NULL; + + if ((*priv->rxqs)[idx]) { + rxq_ctrl = container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, + rxq); + mlx5_rxq_obj_get(dev, idx); + rte_atomic32_inc(&rxq_ctrl->refcnt); + } + return rxq_ctrl; +} + +/** + * Release a Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * RX queue index. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
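+ *
+ * Usage note: every mlx5_rxq_get(dev, idx) is expected to be balanced by
+ * one mlx5_rxq_release(dev, idx); the control structure, its door-bell
+ * record and the MR b-tree cache are only freed once the last reference
+ * drops.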
+ */ +int +mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + + if (!(*priv->rxqs)[idx]) + return 0; + rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); + MLX5_ASSERT(rxq_ctrl->priv); + if (rxq_ctrl->obj && !mlx5_rxq_obj_release(rxq_ctrl->obj)) + rxq_ctrl->obj = NULL; + if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) { + if (rxq_ctrl->dbr_umem_id_valid) + claim_zero(mlx5_release_dbr(dev, rxq_ctrl->dbr_umem_id, + rxq_ctrl->dbr_offset)); + if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) + mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh); + LIST_REMOVE(rxq_ctrl, next); + rte_free(rxq_ctrl); + (*priv->rxqs)[idx] = NULL; + return 0; + } + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_rxq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl; + int ret = 0; + + LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) { + DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced", + dev->data->port_id, rxq_ctrl->rxq.idx); + ++ret; + } + return ret; +} + +/** + * Get a Rx queue type. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Rx queue index. + * + * @return + * The Rx queue type. + */ +enum mlx5_rxq_type +mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_ctrl *rxq_ctrl = NULL; + + if (idx < priv->rxqs_n && (*priv->rxqs)[idx]) { + rxq_ctrl = container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, + rxq); + return rxq_ctrl->type; + } + return MLX5_RXQ_TYPE_UNDEFINED; +} + +/** + * Create an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param queues + * Queues entering in the indirection table. + * @param queues_n + * Number of queues in the array. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n, enum mlx5_ind_tbl_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + unsigned int i = 0, j = 0, k = 0; + + ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) + + queues_n * sizeof(uint16_t), 0); + if (!ind_tbl) { + rte_errno = ENOMEM; + return NULL; + } + ind_tbl->type = type; + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { + const unsigned int wq_n = rte_is_power_of_2(queues_n) ? + log2above(queues_n) : + log2above(priv->config.ind_table_max_size); + struct ibv_wq *wq[1 << wq_n]; + + for (i = 0; i != queues_n; ++i) { + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, + queues[i]); + if (!rxq) + goto error; + wq[i] = rxq->obj->wq; + ind_tbl->queues[i] = queues[i]; + } + ind_tbl->queues_n = queues_n; + /* Finalise indirection table. */ + k = i; /* Retain value of i for use in error case. 
*/ + for (j = 0; k != (unsigned int)(1 << wq_n); ++k, ++j) + wq[k] = wq[j]; + ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table + (priv->sh->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = wq_n, + .ind_tbl = wq, + .comp_mask = 0, + }); + if (!ind_tbl->ind_table) { + rte_errno = errno; + goto error; + } + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + struct mlx5_devx_rqt_attr *rqt_attr = NULL; + const unsigned int rqt_n = + 1 << (rte_is_power_of_2(queues_n) ? + log2above(queues_n) : + log2above(priv->config.ind_table_max_size)); + + rqt_attr = rte_calloc(__func__, 1, sizeof(*rqt_attr) + + rqt_n * sizeof(uint32_t), 0); + if (!rqt_attr) { + DRV_LOG(ERR, "port %u cannot allocate RQT resources", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + rqt_attr->rqt_max_size = priv->config.ind_table_max_size; + rqt_attr->rqt_actual_size = rqt_n; + for (i = 0; i != queues_n; ++i) { + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, + queues[i]); + if (!rxq) + goto error; + rqt_attr->rq_list[i] = rxq->obj->rq->id; + ind_tbl->queues[i] = queues[i]; + } + k = i; /* Retain value of i for use in error case. */ + for (j = 0; k != rqt_n; ++k, ++j) + rqt_attr->rq_list[k] = rqt_attr->rq_list[j]; + ind_tbl->rqt = mlx5_devx_cmd_create_rqt(priv->sh->ctx, + rqt_attr); + rte_free(rqt_attr); + if (!ind_tbl->rqt) { + DRV_LOG(ERR, "port %u cannot create DevX RQT", + dev->data->port_id); + rte_errno = errno; + goto error; + } + ind_tbl->queues_n = queues_n; + } + rte_atomic32_inc(&ind_tbl->refcnt); + LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next); + return ind_tbl; +error: + for (j = 0; j < i; j++) + mlx5_rxq_release(dev, ind_tbl->queues[j]); + rte_free(ind_tbl); + DEBUG("port %u cannot create indirection table", dev->data->port_id); + return NULL; +} + +/** + * Get an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param queues + * Queues entering in the indirection table. + * @param queues_n + * Number of queues in the array. + * + * @return + * An indirection table if found. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_get(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + + LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { + if ((ind_tbl->queues_n == queues_n) && + (memcmp(ind_tbl->queues, queues, + ind_tbl->queues_n * sizeof(ind_tbl->queues[0])) + == 0)) + break; + } + if (ind_tbl) { + unsigned int i; + + rte_atomic32_inc(&ind_tbl->refcnt); + for (i = 0; i != ind_tbl->queues_n; ++i) + mlx5_rxq_get(dev, ind_tbl->queues[i]); + } + return ind_tbl; +} + +/** + * Release an indirection table. + * + * @param dev + * Pointer to Ethernet device. + * @param ind_table + * Indirection table to release. + * + * @return + * 1 while a reference on it exists, 0 when freed. 
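+ *
+ * Note: the table being released was built by mlx5_ind_table_obj_new()
+ * above with a power-of-two entry count: queues_n entries when queues_n
+ * is already a power of two, otherwise ind_table_max_size entries with
+ * the queue list repeated to fill it (e.g. queues {0, 1, 2} become
+ * {0, 1, 2, 0, 1, 2, ...}). Releasing therefore drops one reference per
+ * listed queue (queues_n of them), not one per table slot.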
+ */ +static int +mlx5_ind_table_obj_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_obj *ind_tbl) +{ + unsigned int i; + + if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) { + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) + claim_zero(mlx5_glue->destroy_rwq_ind_table + (ind_tbl->ind_table)); + else if (ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX) + claim_zero(mlx5_devx_cmd_destroy(ind_tbl->rqt)); + } + for (i = 0; i != ind_tbl->queues_n; ++i) + claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i])); + if (!rte_atomic32_read(&ind_tbl->refcnt)) { + LIST_REMOVE(ind_tbl, next); + rte_free(ind_tbl); + return 0; + } + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_ind_table_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + int ret = 0; + + LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { + DRV_LOG(DEBUG, + "port %u indirection table obj %p still referenced", + dev->data->port_id, (void *)ind_tbl); + ++ret; + } + return ret; +} + +/** + * Create an Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * @param rss_key + * RSS key for the Rx hash queue. + * @param rss_key_len + * RSS key length. + * @param hash_fields + * Verbs protocol hash field to make the RSS on. + * @param queues + * Queues entering in hash queue. In case of empty hash_fields only the + * first queue index will be taken for the indirection table. + * @param queues_n + * Number of queues. + * @param tunnel + * Tunnel type. + * + * @return + * The Verbs/DevX object initialised index, 0 otherwise and rte_errno is set. + */ +uint32_t +mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + int tunnel __rte_unused) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t hrxq_idx = 0; + struct ibv_qp *qp = NULL; + struct mlx5_ind_table_obj *ind_tbl; + int err; + struct mlx5_devx_obj *tir = NULL; + struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + + queues_n = hash_fields ? queues_n : 1; + ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n); + if (!ind_tbl) { + enum mlx5_ind_tbl_type type; + + type = rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV ? + MLX5_IND_TBL_TYPE_IBV : MLX5_IND_TBL_TYPE_DEVX; + ind_tbl = mlx5_ind_table_obj_new(dev, queues, queues_n, type); + } + if (!ind_tbl) { + rte_errno = ENOMEM; + return 0; + } + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5dv_qp_init_attr qp_init_attr; + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + if (tunnel) { + qp_init_attr.comp_mask = + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags = + MLX5DV_QP_CREATE_TUNNEL_OFFLOADS; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (dev->data->dev_conf.lpbk_mode) { + /* + * Allow packet sent from NIC loop back + * w/o source MAC check. 
+ */ + qp_init_attr.comp_mask |= + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags |= + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC; + } +#endif + qp = mlx5_glue->dv_create_qp + (priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = + (void *)(uintptr_t)rss_key, + .rx_hash_fields_mask = hash_fields, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd, + }, + &qp_init_attr); +#else + qp = mlx5_glue->create_qp_ex + (priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = + (void *)(uintptr_t)rss_key, + .rx_hash_fields_mask = hash_fields, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd, + }); +#endif + if (!qp) { + rte_errno = errno; + goto error; + } + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + struct mlx5_devx_tir_attr tir_attr; + uint32_t i; + uint32_t lro = 1; + + /* Enable TIR LRO only if all the queues were configured for. */ + for (i = 0; i < queues_n; ++i) { + if (!(*priv->rxqs)[queues[i]]->lro) { + lro = 0; + break; + } + } + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT; + tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ; + tir_attr.tunneled_offload_en = !!tunnel; + /* If needed, translate hash_fields bitmap to PRM format. */ + if (hash_fields) { +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5_rx_hash_field_select *rx_hash_field_select = + hash_fields & IBV_RX_HASH_INNER ? + &tir_attr.rx_hash_field_selector_inner : + &tir_attr.rx_hash_field_selector_outer; +#else + struct mlx5_rx_hash_field_select *rx_hash_field_select = + &tir_attr.rx_hash_field_selector_outer; +#endif + + /* 1 bit: 0: IPv4, 1: IPv6. */ + rx_hash_field_select->l3_prot_type = + !!(hash_fields & MLX5_IPV6_IBV_RX_HASH); + /* 1 bit: 0: TCP, 1: UDP. */ + rx_hash_field_select->l4_prot_type = + !!(hash_fields & MLX5_UDP_IBV_RX_HASH); + /* Bitmask which sets which fields to use in RX Hash. 
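+ * For example (illustrative), a TCP/IPv4 RSS request hashing on both
+ * addresses and both ports sets all four SELECTED_FIELDS bits below,
+ * while an L3-only hash sets just the SRC_IP and DST_IP bits.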
*/ + rx_hash_field_select->selected_fields = + ((!!(hash_fields & MLX5_L3_SRC_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) | + (!!(hash_fields & MLX5_L3_DST_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP | + (!!(hash_fields & MLX5_L4_SRC_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT | + (!!(hash_fields & MLX5_L4_DST_IBV_RX_HASH)) << + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT; + } + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN) + tir_attr.transport_domain = priv->sh->td->id; + else + tir_attr.transport_domain = priv->sh->tdn; + memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, + MLX5_RSS_HASH_KEY_LEN); + tir_attr.indirect_table = ind_tbl->rqt->id; + if (dev->data->dev_conf.lpbk_mode) + tir_attr.self_lb_block = + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + if (lro) { + tir_attr.lro_timeout_period_usecs = + priv->config.lro.timeout; + tir_attr.lro_max_msg_sz = priv->max_lro_msg_size; + tir_attr.lro_enable_mask = + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO; + } + tir = mlx5_devx_cmd_create_tir(priv->sh->ctx, &tir_attr); + if (!tir) { + DRV_LOG(ERR, "port %u cannot create DevX TIR", + dev->data->port_id); + rte_errno = errno; + goto error; + } + } + hrxq = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_HRXQ], &hrxq_idx); + if (!hrxq) + goto error; + hrxq->ind_table = ind_tbl; + if (ind_tbl->type == MLX5_IND_TBL_TYPE_IBV) { + hrxq->qp = qp; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = + mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */ + hrxq->tir = tir; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = mlx5_glue->dv_create_flow_action_dest_devx_tir + (hrxq->tir->obj); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + } + hrxq->rss_key_len = rss_key_len; + hrxq->hash_fields = hash_fields; + memcpy(hrxq->rss_key, rss_key, rss_key_len); + rte_atomic32_inc(&hrxq->refcnt); + ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs, hrxq_idx, + hrxq, next); + return hrxq_idx; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_ind_table_obj_release(dev, ind_tbl); + if (qp) + claim_zero(mlx5_glue->destroy_qp(qp)); + else if (tir) + claim_zero(mlx5_devx_cmd_destroy(tir)); + rte_errno = err; /* Restore rte_errno. */ + return 0; +} + +/** + * Get an Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * @param rss_conf + * RSS configuration for the Rx hash queue. + * @param queues + * Queues entering in hash queue. In case of empty hash_fields only the + * first queue index will be taken for the indirection table. + * @param queues_n + * Number of queues. + * + * @return + * An hash Rx queue index on success. + */ +uint32_t +mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t idx; + + queues_n = hash_fields ? 
queues_n : 1; + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx, + hrxq, next) { + struct mlx5_ind_table_obj *ind_tbl; + + if (hrxq->rss_key_len != rss_key_len) + continue; + if (memcmp(hrxq->rss_key, rss_key, rss_key_len)) + continue; + if (hrxq->hash_fields != hash_fields) + continue; + ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n); + if (!ind_tbl) + continue; + if (ind_tbl != hrxq->ind_table) { + mlx5_ind_table_obj_release(dev, ind_tbl); + continue; + } + rte_atomic32_inc(&hrxq->refcnt); + return idx; + } + return 0; +} + +/** + * Release the hash Rx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param hrxq + * Index to Hash Rx queue to release. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hrxq_idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + + hrxq = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx); + if (!hrxq) + return 0; + if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + if (hrxq->ind_table->type == MLX5_IND_TBL_TYPE_IBV) + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + else /* hrxq->ind_table->type == MLX5_IND_TBL_TYPE_DEVX */ + claim_zero(mlx5_devx_cmd_destroy(hrxq->tir)); + mlx5_ind_table_obj_release(dev, hrxq->ind_table); + ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_HRXQ], &priv->hrxqs, + hrxq_idx, hrxq, next); + mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_HRXQ], hrxq_idx); + return 0; + } + claim_nonzero(mlx5_ind_table_obj_release(dev, hrxq->ind_table)); + return 1; +} + +/** + * Verify the Rx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_hrxq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq; + uint32_t idx; + int ret = 0; + + ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_HRXQ], priv->hrxqs, idx, + hrxq, next) { + DRV_LOG(DEBUG, + "port %u hash Rx queue %p still referenced", + dev->data->port_id, (void *)hrxq); + ++ret; + } + return ret; +} + +/** + * Create a drop Rx queue Verbs/DevX object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. 
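+ *
+ * The drop queue is a minimal CQ plus a one-WQE work queue that the PMD
+ * never replenishes with buffers; it only exists so that drop flow rules
+ * and the drop hash Rx queue have a valid destination. It is created
+ * once per port and reused afterwards (priv->drop_queue.rxq).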
+ */ +static struct mlx5_rxq_obj * +mlx5_rxq_obj_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct ibv_context *ctx = priv->sh->ctx; + struct ibv_cq *cq; + struct ibv_wq *wq = NULL; + struct mlx5_rxq_obj *rxq; + + if (priv->drop_queue.rxq) + return priv->drop_queue.rxq; + cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0); + if (!cq) { + DEBUG("port %u cannot allocate CQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + wq = mlx5_glue->create_wq(ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = priv->sh->pd, + .cq = cq, + }); + if (!wq) { + DEBUG("port %u cannot allocate WQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0); + if (!rxq) { + DEBUG("port %u cannot allocate drop Rx queue memory", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + rxq->cq = cq; + rxq->wq = wq; + priv->drop_queue.rxq = rxq; + return rxq; +error: + if (wq) + claim_zero(mlx5_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx5_glue->destroy_cq(cq)); + return NULL; +} + +/** + * Release a drop Rx queue Verbs/DevX object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static void +mlx5_rxq_obj_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq; + + if (rxq->wq) + claim_zero(mlx5_glue->destroy_wq(rxq->wq)); + if (rxq->cq) + claim_zero(mlx5_glue->destroy_cq(rxq->cq)); + rte_free(rxq); + priv->drop_queue.rxq = NULL; +} + +/** + * Create a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_ind_table_obj * +mlx5_ind_table_obj_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl; + struct mlx5_rxq_obj *rxq; + struct mlx5_ind_table_obj tmpl; + + rxq = mlx5_rxq_obj_drop_new(dev); + if (!rxq) + return NULL; + tmpl.ind_table = mlx5_glue->create_rwq_ind_table + (priv->sh->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = 0, + .ind_tbl = &rxq->wq, + .comp_mask = 0, + }); + if (!tmpl.ind_table) { + DEBUG("port %u cannot allocate indirection table for drop" + " queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0); + if (!ind_tbl) { + rte_errno = ENOMEM; + goto error; + } + ind_tbl->ind_table = tmpl.ind_table; + return ind_tbl; +error: + mlx5_rxq_obj_drop_release(dev); + return NULL; +} + +/** + * Release a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_ind_table_obj_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl = priv->drop_queue.hrxq->ind_table; + + claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table)); + mlx5_rxq_obj_drop_release(dev); + rte_free(ind_tbl); + priv->drop_queue.hrxq->ind_table = NULL; +} + +/** + * Create a drop Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs/DevX object initialised, NULL otherwise and rte_errno is set. 
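+ *
+ * The drop hash Rx queue is shared: the first caller creates it and
+ * later calls only take a reference on priv->drop_queue.hrxq, so it must
+ * be released as many times as it was requested, via
+ * mlx5_hrxq_drop_release().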
+ */ +struct mlx5_hrxq * +mlx5_hrxq_drop_new(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_ind_table_obj *ind_tbl = NULL; + struct ibv_qp *qp = NULL; + struct mlx5_hrxq *hrxq = NULL; + + if (priv->drop_queue.hrxq) { + rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt); + return priv->drop_queue.hrxq; + } + hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0); + if (!hrxq) { + DRV_LOG(WARNING, + "port %u cannot allocate memory for drop queue", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + priv->drop_queue.hrxq = hrxq; + ind_tbl = mlx5_ind_table_obj_drop_new(dev); + if (!ind_tbl) + goto error; + hrxq->ind_table = ind_tbl; + qp = mlx5_glue->create_qp_ex(priv->sh->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN, + .rx_hash_key = rss_hash_default_key, + .rx_hash_fields_mask = 0, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->sh->pd + }); + if (!qp) { + DEBUG("port %u cannot allocate QP for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + hrxq->qp = qp; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp); + if (!hrxq->action) { + rte_errno = errno; + goto error; + } +#endif + rte_atomic32_set(&hrxq->refcnt, 1); + return hrxq; +error: +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (hrxq && hrxq->action) + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + if (qp) + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + if (ind_tbl) + mlx5_ind_table_obj_drop_release(dev); + if (hrxq) { + priv->drop_queue.hrxq = NULL; + rte_free(hrxq); + } + return NULL; +} + +/** + * Release a drop hash Rx queue. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_hrxq_drop_release(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq; + + if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + mlx5_glue->destroy_flow_action(hrxq->action); +#endif + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + mlx5_ind_table_obj_drop_release(dev); + rte_free(hrxq); + priv->drop_queue.hrxq = NULL; + } +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c new file mode 100644 index 000000000..6a17a9a5d --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.c @@ -0,0 +1,5691 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015-2019 Mellanox Technologies, Ltd + */ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> +#include <rte_common.h> +#include <rte_branch_prediction.h> +#include <rte_ether.h> +#include <rte_cycles.h> +#include <rte_flow.h> + +#include <mlx5_devx_cmds.h> +#include <mlx5_prm.h> +#include <mlx5_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" + +/* TX burst subroutines return codes. */ +enum mlx5_txcmp_code { + MLX5_TXCMP_CODE_EXIT = 0, + MLX5_TXCMP_CODE_ERROR, + MLX5_TXCMP_CODE_SINGLE, + MLX5_TXCMP_CODE_MULTI, + MLX5_TXCMP_CODE_TSO, + MLX5_TXCMP_CODE_EMPW, +}; + +/* + * These defines are used to configure Tx burst routine option set + * supported at compile time. The not specified options are optimized out + * out due to if conditions can be explicitly calculated at compile time. + * The offloads with bigger runtime check (require more CPU cycles to + * skip) overhead should have the bigger index - this is needed to + * select the better matching routine function if no exact match and + * some offloads are not actually requested. + */ +#define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets.*/ +#define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported.*/ +#define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads.*/ +#define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */ +#define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */ +#define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/ +#define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */ +#define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/ +#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/ + +/* The most common offloads groups. 
*/ +#define MLX5_TXOFF_CONFIG_NONE 0 +#define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \ + MLX5_TXOFF_CONFIG_TSO | \ + MLX5_TXOFF_CONFIG_SWP | \ + MLX5_TXOFF_CONFIG_CSUM | \ + MLX5_TXOFF_CONFIG_INLINE | \ + MLX5_TXOFF_CONFIG_VLAN | \ + MLX5_TXOFF_CONFIG_METADATA) + +#define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask) + +#define MLX5_TXOFF_DECL(func, olx) \ +static uint16_t mlx5_tx_burst_##func(void *txq, \ + struct rte_mbuf **pkts, \ + uint16_t pkts_n) \ +{ \ + return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \ + pkts, pkts_n, (olx)); \ +} + +#define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, + +static __rte_always_inline uint32_t +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); + +static __rte_always_inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); + +static __rte_always_inline uint32_t +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); + +static __rte_always_inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); + +static __rte_always_inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n); + +static int +mlx5_queue_state_modify(struct rte_eth_dev *dev, + struct mlx5_mp_arg_queue_state_modify *sm); + +static inline void +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, + volatile struct mlx5_cqe *restrict cqe, + uint32_t phcsum); + +static inline void +mlx5_lro_update_hdr(uint8_t *restrict padd, + volatile struct mlx5_cqe *restrict cqe, + uint32_t len); + +uint32_t mlx5_ptype_table[] __rte_cache_aligned = { + [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ +}; + +uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; +uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; + +uint64_t rte_net_mlx5_dynf_inline_mask; +#define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask + +/** + * Build a table to translate Rx completion flags to packet type. + * + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. + */ +void +mlx5_set_ptype_table(void) +{ + unsigned int i; + uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; + + /* Last entry must not be overwritten, reserved for errored packet. 
*/ + for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) + (*p)[i] = RTE_PTYPE_UNKNOWN; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + /* L2 */ + (*p)[0x00] = RTE_PTYPE_L2_ETHER; + /* L3 */ + (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + /* Fragmented */ + (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + /* TCP */ + (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + /* UDP */ + (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Repeat with outer_l3_type being set. Just in case. */ + (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Tunneled - L3 */ + (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; + (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; + (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + /* Tunneled - Fragmented */ + (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + 
RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + /* Tunneled - TCP */ + (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + /* Tunneled - UDP */ + (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; + (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP; +} + +/** + * Build a table to translate packet to checksum type of Verbs. + */ +void +mlx5_set_cksum_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { + v = 0; + if (i & (1 << 9)) { + /* Tunneled packet. */ + if (i & (1 << 8)) /* Outer IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (1 << 4)) /* Inner IP. */ + v |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else { + /* No tunnel. */ + if (i & (1 << 4)) /* IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_CSUM; + } + mlx5_cksum_table[i] = v; + } +} + +/** + * Build a table to translate packet type of mbuf to SWP type of Verbs. 
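+ *
+ * Worked example (assumes the standard PKT_TX_* mbuf flag layout, shown
+ * purely for illustration): an outer-IPv6 UDP tunnel carrying an inner
+ * IPv6/UDP packet with PKT_TX_UDP_CKSUM maps to index bits 9, 8, 4 and
+ * bits[1:0] = 3, so the table entry combines MLX5_ETH_WQE_L3_OUTER_IPV6,
+ * MLX5_ETH_WQE_L4_OUTER_UDP, MLX5_ETH_WQE_L3_INNER_IPV6 and
+ * MLX5_ETH_WQE_L4_INNER_UDP.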
+ */ +void +mlx5_set_swp_types_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { + v = 0; + if (i & (1 << 8)) + v |= MLX5_ETH_WQE_L3_OUTER_IPV6; + if (i & (1 << 9)) + v |= MLX5_ETH_WQE_L4_OUTER_UDP; + if (i & (1 << 4)) + v |= MLX5_ETH_WQE_L3_INNER_IPV6; + if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) + v |= MLX5_ETH_WQE_L4_INNER_UDP; + mlx5_swp_types_table[i] = v; + } +} + +/** + * Set Software Parser flags and offsets in Ethernet Segment of WQE. + * Flags must be preliminary initialized to zero. + * + * @param loc + * Pointer to burst routine local context. + * @param swp_flags + * Pointer to store Software Parser flags + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Software Parser offsets packed in dword. + * Software Parser flags are set by pointer. + */ +static __rte_always_inline uint32_t +txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, + uint8_t *swp_flags, + unsigned int olx) +{ + uint64_t ol, tunnel; + unsigned int idx, off; + uint32_t set; + + if (!MLX5_TXOFF_CONFIG(SWP)) + return 0; + ol = loc->mbuf->ol_flags; + tunnel = ol & PKT_TX_TUNNEL_MASK; + /* + * Check whether Software Parser is required. + * Only customized tunnels may ask for. + */ + if (likely(tunnel != PKT_TX_TUNNEL_UDP && tunnel != PKT_TX_TUNNEL_IP)) + return 0; + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + idx = (ol & (PKT_TX_L4_MASK | PKT_TX_IPV6 | PKT_TX_OUTER_IPV6)) >> 52; + idx |= (tunnel == PKT_TX_TUNNEL_UDP) ? (1 << 9) : 0; + *swp_flags = mlx5_swp_types_table[idx]; + /* + * Set offsets for SW parser. Since ConnectX-5, SW parser just + * complements HW parser. SW parser starts to engage only if HW parser + * can't reach a header. For the older devices, HW parser will not kick + * in if any of SWP offsets is set. Therefore, all of the L3 offsets + * should be set regardless of HW offload. + */ + off = loc->mbuf->outer_l2_len; + if (MLX5_TXOFF_CONFIG(VLAN) && ol & PKT_TX_VLAN_PKT) + off += sizeof(struct rte_vlan_hdr); + set = (off >> 1) << 8; /* Outer L3 offset. */ + off += loc->mbuf->outer_l3_len; + if (tunnel == PKT_TX_TUNNEL_UDP) + set |= off >> 1; /* Outer L4 offset. */ + if (ol & (PKT_TX_IPV4 | PKT_TX_IPV6)) { /* Inner IP. */ + const uint64_t csum = ol & PKT_TX_L4_MASK; + off += loc->mbuf->l2_len; + set |= (off >> 1) << 24; /* Inner L3 offset. */ + if (csum == PKT_TX_TCP_CKSUM || + csum == PKT_TX_UDP_CKSUM || + (MLX5_TXOFF_CONFIG(TSO) && ol & PKT_TX_TCP_SEG)) { + off += loc->mbuf->l3_len; + set |= (off >> 1) << 16; /* Inner L4 offset. */ + } + } + set = rte_cpu_to_le_32(set); + return set; +} + +/** + * Convert the Checksum offloads to Verbs. + * + * @param buf + * Pointer to the mbuf. + * + * @return + * Converted checksum flags. 
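+ *
+ * Illustrative usage sketch (mirrors how the Ethernet Segment builders
+ * below consume the returned value):
+ *   csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ *   es->flags = rte_cpu_to_le_32(csum);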
+ */
+static __rte_always_inline uint8_t
+txq_ol_cksum_to_cs(struct rte_mbuf *buf)
+{
+ uint32_t idx;
+ uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK);
+ const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
+ PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
+
+ /*
+ * The index should have:
+ * bit[0] = PKT_TX_TCP_SEG
+ * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
+ * bit[4] = PKT_TX_IP_CKSUM
+ * bit[8] = PKT_TX_OUTER_IP_CKSUM
+ * bit[9] = tunnel
+ */
+ idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
+ return mlx5_cksum_table[idx];
+}
+
+/**
+ * Internal function to compute the number of used descriptors in an RX queue.
+ *
+ * @param rxq
+ * The Rx queue.
+ *
+ * @return
+ * The number of used Rx descriptors.
+ */
+static uint32_t
+rx_queue_count(struct mlx5_rxq_data *rxq)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ volatile struct mlx5_cqe *cqe;
+ const unsigned int cqe_n = (1 << rxq->cqe_n);
+ const unsigned int cqe_cnt = cqe_n - 1;
+ unsigned int cq_ci;
+ unsigned int used;
+
+ /* if we are processing a compressed cqe */
+ if (zip->ai) {
+ used = zip->cqe_cnt - zip->ca;
+ cq_ci = zip->cq_ci;
+ } else {
+ used = 0;
+ cq_ci = rxq->cq_ci;
+ }
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
+ int8_t op_own;
+ unsigned int n;
+
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
+ n = rte_be_to_cpu_32(cqe->byte_cnt);
+ else
+ n = 1;
+ cq_ci += n;
+ used += n;
+ cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
+ }
+ used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
+ return used;
+}
+
+/**
+ * DPDK callback to check the status of an Rx descriptor.
+ *
+ * @param rx_queue
+ * The Rx queue.
+ * @param[in] offset
+ * The index of the descriptor in the ring.
+ *
+ * @return
+ * The status of the Rx descriptor.
+ */
+int
+mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
+{
+ struct mlx5_rxq_data *rxq = rx_queue;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ if (offset >= (1 << rxq->elts_n)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ if (offset < rx_queue_count(rxq))
+ return RTE_ETH_RX_DESC_DONE;
+ return RTE_ETH_RX_DESC_AVAIL;
+}
+
+/**
+ * DPDK callback to get the RX queue information.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * Rx queue identifier.
+ *
+ * @param qinfo
+ * Pointer to the RX queue information structure.
+ *
+ * @return
+ * None.
+ */
+
+void
+mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+ struct rte_eth_rxq_info *qinfo)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ if (!rxq)
+ return;
+ qinfo->mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+ rxq->mprq_mp : rxq->mp;
+ qinfo->conf.rx_thresh.pthresh = 0;
+ qinfo->conf.rx_thresh.hthresh = 0;
+ qinfo->conf.rx_thresh.wthresh = 0;
+ qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
+ qinfo->conf.rx_drop_en = 1;
+ qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
+ qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+ qinfo->scattered_rx = dev->data->scattered_rx;
+ qinfo->nb_desc = 1 << rxq->elts_n;
+}
+
+/**
+ * DPDK callback to get the RX packet burst mode information.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * Rx queue identifier.
+ *
+ * @param mode
+ * Pointer to the burst mode information.
+ *
+ * @return
+ * 0 on success, -EINVAL on failure.
+ */
+
+int
+mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
+ uint16_t rx_queue_id __rte_unused,
+ struct rte_eth_burst_mode *mode)
+{
+ eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+
+ if (pkt_burst == mlx5_rx_burst) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
+ } else if (pkt_burst == mlx5_rx_burst_mprq) {
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
+ } else if (pkt_burst == mlx5_rx_burst_vec) {
+#if defined RTE_ARCH_X86_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
+#elif defined RTE_ARCH_ARM64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#else
+ return -EINVAL;
+#endif
+ } else {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to get the number of used descriptors in an RX queue.
+ *
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @param rx_queue_id
+ * The Rx queue.
+ *
+ * @return
+ * The number of used Rx descriptors,
+ * or -EINVAL if the queue is invalid.
+ */
+uint32_t
+mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq;
+
+ if (dev->rx_pkt_burst != mlx5_rx_burst) {
+ rte_errno = ENOTSUP;
+ return -rte_errno;
+ }
+ rxq = (*priv->rxqs)[rx_queue_id];
+ if (!rxq) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ return rx_queue_count(rxq);
+}
+
+#define MLX5_SYSTEM_LOG_DIR "/var/log"
+/**
+ * Dump debug information to log file.
+ *
+ * @param fname
+ * The file name.
+ * @param hex_title
+ * If not NULL this string is printed as a header to the output
+ * and the output will be in hexadecimal view.
+ * @param buf
+ * This is the buffer address to print out.
+ * @param hex_len
+ * The number of bytes to dump out.
+ */
+void
+mlx5_dump_debug_information(const char *fname, const char *hex_title,
+ const void *buf, unsigned int hex_len)
+{
+ FILE *fd;
+
+ MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
+ fd = fopen(path, "a+");
+ if (!fd) {
+ DRV_LOG(WARNING, "cannot open %s for debug dump", path);
+ MKSTR(path2, "./%s", fname);
+ fd = fopen(path2, "a+");
+ if (!fd) {
+ DRV_LOG(ERR, "cannot open %s for debug dump", path2);
+ return;
+ }
+ DRV_LOG(INFO, "New debug dump in file %s", path2);
+ } else {
+ DRV_LOG(INFO, "New debug dump in file %s", path);
+ }
+ if (hex_title)
+ rte_hexdump(fd, hex_title, buf, hex_len);
+ else
+ fprintf(fd, "%s", (const char *)buf);
+ fprintf(fd, "\n\n\n");
+ fclose(fd);
+}
+
+/**
+ * Move QP from error state to running state and initialize indexes.
+ *
+ * @param txq_ctrl
+ * Pointer to TX queue control structure.
+ *
+ * @return
+ * 0 on success, else -1.
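+ *
+ * Illustrative usage sketch (this is how mlx5_tx_error_cqe_handle()
+ * below drives the recovery path; on failure it retries on the same WQE):
+ *   if (tx_recover_qp(txq_ctrl))
+ *           return -1;
+ *   txq_free_elts(txq_ctrl);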
+ */ +static int +tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_mp_arg_queue_state_modify sm = { + .is_wq = 0, + .queue_id = txq_ctrl->txq.idx, + }; + + if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm)) + return -1; + txq_ctrl->txq.wqe_ci = 0; + txq_ctrl->txq.wqe_pi = 0; + txq_ctrl->txq.elts_comp = 0; + return 0; +} + +/* Return 1 if the error CQE is signed otherwise, sign it and return 0. */ +static int +check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) +{ + static const uint8_t magic[] = "seen"; + int ret = 1; + unsigned int i; + + for (i = 0; i < sizeof(magic); ++i) + if (!ret || err_cqe->rsvd1[i] != magic[i]) { + ret = 0; + err_cqe->rsvd1[i] = magic[i]; + } + return ret; +} + +/** + * Handle error CQE. + * + * @param txq + * Pointer to TX queue structure. + * @param error_cqe + * Pointer to the error CQE. + * + * @return + * Negative value if queue recovery failed, otherwise + * the error completion entry is handled successfully. + */ +static int +mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_err_cqe *err_cqe) +{ + if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { + const uint16_t wqe_m = ((1 << txq->wqe_n) - 1); + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter); + int seen = check_err_cqe_seen(err_cqe); + + if (!seen && txq_ctrl->dump_file_n < + txq_ctrl->priv->config.max_dump_files_num) { + MKSTR(err_str, "Unexpected CQE error syndrome " + "0x%02x CQN = %u SQN = %u wqe_counter = %u " + "wq_ci = %u cq_ci = %u", err_cqe->syndrome, + txq->cqe_s, txq->qp_num_8s >> 8, + rte_be_to_cpu_16(err_cqe->wqe_counter), + txq->wqe_ci, txq->cq_ci); + MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u", + PORT_ID(txq_ctrl->priv), txq->idx, + txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc()); + mlx5_dump_debug_information(name, NULL, err_str, 0); + mlx5_dump_debug_information(name, "MLX5 Error CQ:", + (const void *)((uintptr_t) + txq->cqes), + sizeof(*err_cqe) * + (1 << txq->cqe_n)); + mlx5_dump_debug_information(name, "MLX5 Error SQ:", + (const void *)((uintptr_t) + txq->wqes), + MLX5_WQE_SIZE * + (1 << txq->wqe_n)); + txq_ctrl->dump_file_n++; + } + if (!seen) + /* + * Count errors in WQEs units. + * Later it can be improved to count error packets, + * for example, by SQ parsing to find how much packets + * should be counted for each WQE. + */ + txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - + new_wqe_pi) & wqe_m; + if (tx_recover_qp(txq_ctrl)) { + /* Recovering failed - retry later on the same WQE. */ + return -1; + } + /* Release all the remaining buffers. */ + txq_free_elts(txq_ctrl); + } + return 0; +} + +/** + * Translate RX completion flags to packet type. + * + * @param[in] rxq + * Pointer to RX queue structure. + * @param[in] cqe + * Pointer to CQE. + * + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. + * + * @return + * Packet type for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) +{ + uint8_t idx; + uint8_t pinfo = cqe->pkt_info; + uint16_t ptype = cqe->hdr_type_etc; + + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); + return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); +} + +/** + * Initialize Rx WQ and indexes. 
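+ *
+ * Illustrative caller sketch (mirrors the Rx error recovery in
+ * mlx5_rx_err_handle() below once the WQ is moved back to ready state):
+ *   sm.is_wq = 1;
+ *   sm.queue_id = rxq->idx;
+ *   sm.state = IBV_WQS_RDY;
+ *   if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
+ *           return -1;
+ *   mlx5_rxq_initialize(rxq);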
+ * + * @param[in] rxq + * Pointer to RX queue structure. + */ +void +mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) +{ + const unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + + for (i = 0; (i != wqe_n); ++i) { + volatile struct mlx5_wqe_data_seg *scat; + uintptr_t addr; + uint32_t byte_count; + + if (mlx5_rxq_mprq_enabled(rxq)) { + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i]; + + scat = &((volatile struct mlx5_wqe_mprq *) + rxq->wqes)[i].dseg; + addr = (uintptr_t)mlx5_mprq_buf_addr(buf, + 1 << rxq->strd_num_n); + byte_count = (1 << rxq->strd_sz_n) * + (1 << rxq->strd_num_n); + } else { + struct rte_mbuf *buf = (*rxq->elts)[i]; + + scat = &((volatile struct mlx5_wqe_data_seg *) + rxq->wqes)[i]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + byte_count = DATA_LEN(buf); + } + /* scat->addr must be able to store a pointer. */ + MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); + *scat = (struct mlx5_wqe_data_seg){ + .addr = rte_cpu_to_be_64(addr), + .byte_count = rte_cpu_to_be_32(byte_count), + .lkey = mlx5_rx_addr2mr(rxq, addr), + }; + } + rxq->consumed_strd = 0; + rxq->decompressed = 0; + rxq->rq_pi = 0; + rxq->zip = (struct rxq_zip){ + .ai = 0, + }; + /* Update doorbell counter. */ + rxq->rq_ci = wqe_n >> rxq->sges_n; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +} + +/** + * Modify a Verbs/DevX queue state. + * This must be called from the primary process. + * + * @param dev + * Pointer to Ethernet device. + * @param sm + * State modify request parameters. + * + * @return + * 0 in case of success else non-zero value and rte_errno is set. + */ +int +mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, + const struct mlx5_mp_arg_queue_state_modify *sm) +{ + int ret; + struct mlx5_priv *priv = dev->data->dev_private; + + if (sm->is_wq) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { + struct ibv_wq_attr mod = { + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = sm->state, + }; + + ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); + } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
*/ + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + if (sm->state == IBV_WQS_RESET) { + rq_attr.rq_state = MLX5_RQC_STATE_ERR; + rq_attr.state = MLX5_RQC_STATE_RST; + } else if (sm->state == IBV_WQS_RDY) { + rq_attr.rq_state = MLX5_RQC_STATE_RST; + rq_attr.state = MLX5_RQC_STATE_RDY; + } else if (sm->state == IBV_WQS_ERR) { + rq_attr.rq_state = MLX5_RQC_STATE_RDY; + rq_attr.state = MLX5_RQC_STATE_ERR; + } + ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, + &rq_attr); + } + if (ret) { + DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", + sm->state, strerror(errno)); + rte_errno = errno; + return ret; + } + } else { + struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct ibv_qp_attr mod = { + .qp_state = IBV_QPS_RESET, + .port_num = (uint8_t)priv->ibv_port, + }; + struct ibv_qp *qp = txq_ctrl->obj->qp; + + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " + "%s", strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_INIT; + ret = mlx5_glue->modify_qp(qp, &mod, + (IBV_QP_STATE | IBV_QP_PORT)); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_RTR; + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + mod.qp_state = IBV_QPS_RTS; + ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", + strerror(errno)); + rte_errno = errno; + return ret; + } + } + return 0; +} + +/** + * Modify a Verbs queue state. + * + * @param dev + * Pointer to Ethernet device. + * @param sm + * State modify request parameters. + * + * @return + * 0 in case of success else non-zero value. + */ +static int +mlx5_queue_state_modify(struct rte_eth_dev *dev, + struct mlx5_mp_arg_queue_state_modify *sm) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + ret = mlx5_queue_state_modify_primary(dev, sm); + break; + case RTE_PROC_SECONDARY: + ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm); + break; + default: + break; + } + return ret; +} + +/** + * Handle a Rx error. + * The function inserts the RQ state to reset when the first error CQE is + * shown, then drains the CQ by the caller function loop. When the CQ is empty, + * it moves the RQ state to ready and initializes the RQ. + * Next CQE identification and error counting are in the caller responsibility. + * + * @param[in] rxq + * Pointer to RX queue structure. + * @param[in] vec + * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. + * 0 when called from non-vectorized Rx burst. + * + * @return + * -1 in case of recovery error, otherwise the CQE status. 
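+ *
+ * Illustrative caller sketch (this mirrors the use in mlx5_rx_poll_len()
+ * below when an error CQE or a pending error state is detected):
+ *   ret = mlx5_rx_err_handle(rxq, 0);
+ *   if (ret == MLX5_CQE_STATUS_HW_OWN || ret == -1)
+ *           return 0;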
+ */ +int +mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) +{ + const uint16_t cqe_n = 1 << rxq->cqe_n; + const uint16_t cqe_mask = cqe_n - 1; + const unsigned int wqe_n = 1 << rxq->elts_n; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + union { + volatile struct mlx5_cqe *cqe; + volatile struct mlx5_err_cqe *err_cqe; + } u = { + .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask], + }; + struct mlx5_mp_arg_queue_state_modify sm; + int ret; + + switch (rxq->err_state) { + case MLX5_RXQ_ERR_STATE_NO_ERROR: + rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET; + /* Fall-through */ + case MLX5_RXQ_ERR_STATE_NEED_RESET: + sm.is_wq = 1; + sm.queue_id = rxq->idx; + sm.state = IBV_WQS_RESET; + if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) + return -1; + if (rxq_ctrl->dump_file_n < + rxq_ctrl->priv->config.max_dump_files_num) { + MKSTR(err_str, "Unexpected CQE error syndrome " + "0x%02x CQN = %u RQN = %u wqe_counter = %u" + " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome, + rxq->cqn, rxq_ctrl->wqn, + rte_be_to_cpu_16(u.err_cqe->wqe_counter), + rxq->rq_ci << rxq->sges_n, rxq->cq_ci); + MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u", + rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc()); + mlx5_dump_debug_information(name, NULL, err_str, 0); + mlx5_dump_debug_information(name, "MLX5 Error CQ:", + (const void *)((uintptr_t) + rxq->cqes), + sizeof(*u.cqe) * cqe_n); + mlx5_dump_debug_information(name, "MLX5 Error RQ:", + (const void *)((uintptr_t) + rxq->wqes), + 16 * wqe_n); + rxq_ctrl->dump_file_n++; + } + rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY; + /* Fall-through */ + case MLX5_RXQ_ERR_STATE_NEED_READY: + ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); + if (ret == MLX5_CQE_STATUS_HW_OWN) { + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + rte_cio_wmb(); + /* + * The RQ consumer index must be zeroed while moving + * from RESET state to RDY state. + */ + *rxq->rq_db = rte_cpu_to_be_32(0); + rte_cio_wmb(); + sm.is_wq = 1; + sm.queue_id = rxq->idx; + sm.state = IBV_WQS_RDY; + if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), + &sm)) + return -1; + if (vec) { + const uint16_t q_mask = wqe_n - 1; + uint16_t elt_idx; + struct rte_mbuf **elt; + int i; + unsigned int n = wqe_n - (rxq->rq_ci - + rxq->rq_pi); + + for (i = 0; i < (int)n; ++i) { + elt_idx = (rxq->rq_ci + i) & q_mask; + elt = &(*rxq->elts)[elt_idx]; + *elt = rte_mbuf_raw_alloc(rxq->mp); + if (!*elt) { + for (i--; i >= 0; --i) { + elt_idx = (rxq->rq_ci + + i) & q_mask; + elt = &(*rxq->elts) + [elt_idx]; + rte_pktmbuf_free_seg + (*elt); + } + return -1; + } + } + for (i = 0; i < (int)wqe_n; ++i) { + elt = &(*rxq->elts)[i]; + DATA_LEN(*elt) = + (uint16_t)((*elt)->buf_len - + rte_pktmbuf_headroom(*elt)); + } + /* Padding with a fake mbuf for vec Rx. */ + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[wqe_n + i] = + &rxq->fake_mbuf; + } + mlx5_rxq_initialize(rxq); + rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; + } + return ret; + default: + return -1; + } +} + +/** + * Get size of the next packet for a given CQE. For compressed CQEs, the + * consumer index is updated only once all packets of the current one have + * been processed. + * + * @param rxq + * Pointer to RX queue. + * @param cqe + * CQE to process. + * @param[out] mcqe + * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not + * written. + * + * @return + * 0 in case of empty CQE, otherwise the packet size in bytes. 
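+ *
+ * Illustrative caller sketch (as used by mlx5_rx_burst() below):
+ *   cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ *   len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
+ *   if (!len)
+ *           break;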
+ */ +static inline int +mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) +{ + struct rxq_zip *zip = &rxq->zip; + uint16_t cqe_n = cqe_cnt + 1; + int len; + uint16_t idx, end; + + do { + len = 0; + /* Process compressed data in the CQE and mini arrays. */ + if (zip->ai) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[zip->ca & + cqe_cnt].pkt_info); + + len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); + *mcqe = &(*mc)[zip->ai & 7]; + if ((++zip->ai & 7) == 0) { + /* Invalidate consumed CQEs */ + idx = zip->ca; + end = zip->na; + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + /* + * Increment consumer index to skip the number + * of CQEs consumed. Hardware leaves holes in + * the CQ ring for software use. + */ + zip->ca = zip->na; + zip->na += 8; + } + if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { + /* Invalidate the rest */ + idx = zip->ca; + end = zip->cq_ci; + + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + rxq->cq_ci = zip->cq_ci; + zip->ai = 0; + } + /* + * No compressed data, get next CQE and verify if it is + * compressed. + */ + } else { + int ret; + int8_t op_own; + + ret = check_cqe(cqe, cqe_n, rxq->cq_ci); + if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { + if (unlikely(ret == MLX5_CQE_STATUS_ERR || + rxq->err_state)) { + ret = mlx5_rx_err_handle(rxq, 0); + if (ret == MLX5_CQE_STATUS_HW_OWN || + ret == -1) + return 0; + } else { + return 0; + } + } + ++rxq->cq_ci; + op_own = cqe->op_own; + if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes) + [rxq->cq_ci & + cqe_cnt].pkt_info); + + /* Fix endianness. */ + zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); + /* + * Current mini array position is the one + * returned by check_cqe64(). + * + * If completion comprises several mini arrays, + * as a special case the second one is located + * 7 CQEs after the initial CQE instead of 8 + * for subsequent ones. + */ + zip->ca = rxq->cq_ci; + zip->na = zip->ca + 7; + /* Compute the next non compressed CQE. */ + --rxq->cq_ci; + zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; + /* Get packet size to return. */ + len = rte_be_to_cpu_32((*mc)[0].byte_cnt); + *mcqe = &(*mc)[0]; + zip->ai = 1; + /* Prefetch all to be invalidated */ + idx = zip->ca; + end = zip->cq_ci; + while (idx != end) { + rte_prefetch0(&(*rxq->cqes)[(idx) & + cqe_cnt]); + ++idx; + } + } else { + len = rte_be_to_cpu_32(cqe->byte_cnt); + } + } + if (unlikely(rxq->err_state)) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + ++rxq->stats.idropped; + } else { + return len; + } + } while (1); +} + +/** + * Translate RX completion flags to offload flags. + * + * @param[in] cqe + * Pointer to CQE. + * + * @return + * Offload flags (ol_flags) for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) +{ + uint32_t ol_flags = 0; + uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); + + ol_flags = + TRANSPOSE(flags, + MLX5_CQE_RX_L3_HDR_VALID, + PKT_RX_IP_CKSUM_GOOD) | + TRANSPOSE(flags, + MLX5_CQE_RX_L4_HDR_VALID, + PKT_RX_L4_CKSUM_GOOD); + return ol_flags; +} + +/** + * Fill in mbuf fields from RX completion flags. + * Note that pkt->ol_flags should be initialized outside of this function. + * + * @param rxq + * Pointer to RX queue. 
+ * @param pkt + * mbuf to fill. + * @param cqe + * CQE to process. + * @param rss_hash_res + * Packet RSS Hash result. + */ +static inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) +{ + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); + if (rss_hash_res && rxq->rss_hash) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags |= PKT_RX_RSS_HASH; + } + if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (cqe->sop_drop_qpn != + rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { + uint32_t mark = cqe->sop_drop_qpn; + + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } + } + if (rxq->dynf_meta && cqe->flow_table_metadata) { + pkt->ol_flags |= rxq->flow_meta_mask; + *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) = + cqe->flow_table_metadata; + } + if (rxq->csum) + pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); + if (rxq->vlan_strip && + (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { + pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + } + if (rxq->hw_timestamp) { + pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); + pkt->ol_flags |= PKT_RX_TIMESTAMP; + } +} + +/** + * DPDK callback for RX. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1; + const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1; + const unsigned int sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + volatile struct mlx5_cqe *cqe = + &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + unsigned int i = 0; + unsigned int rq_ci = rxq->rq_ci << sges_n; + int len = 0; /* keep its value across iterations. */ + + while (pkts_n) { + unsigned int idx = rq_ci & wqe_cnt; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res; + + if (pkt) + NEXT(seg) = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(cqe); + rte_prefetch0(wqe); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + ++rxq->stats.rx_nombuf; + if (!pkt) { + /* + * no buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + MLX5_ASSERT(pkt != (*rxq->elts)[idx]); + rep = NEXT(pkt); + NEXT(pkt) = NULL; + NB_SEGS(pkt) = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; + len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + pkt = seg; + MLX5_ASSERT(len >= (rxq->crc_present << 2)); + pkt->ol_flags &= EXT_ATTACHED_MBUF; + /* If compressed, take hash result from mini-CQE. */ + rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 
+ cqe->rx_hash_res : + mcqe->rx_hash_result); + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + if (rxq->crc_present) + len -= RTE_ETHER_CRC_LEN; + PKT_LEN(pkt) = len; + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr + (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, + len); + pkt->ol_flags |= PKT_RX_LRO; + pkt->tso_segsz = len / cqe->lro_num_seg; + } + } + DATA_LEN(rep) = DATA_LEN(seg); + PKT_LEN(rep) = PKT_LEN(seg); + SET_DATA_OFF(rep, DATA_OFF(seg)); + PORT(rep) = PORT(seg); + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_mb2mr(rxq, rep); + if (len > DATA_LEN(seg)) { + len -= DATA_LEN(seg); + ++NB_SEGS(pkt); + ++rq_ci; + continue; + } + DATA_LEN(seg) = len; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; + } + if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) + return 0; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + +/** + * Update LRO packet TCP header. + * The HW LRO feature doesn't update the TCP header after coalescing the + * TCP segments but supplies information in CQE to fill it by SW. + * + * @param tcp + * Pointer to the TCP header. + * @param cqe + * Pointer to the completion entry.. + * @param phcsum + * The L3 pseudo-header checksum. + */ +static inline void +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, + volatile struct mlx5_cqe *restrict cqe, + uint32_t phcsum) +{ + uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & + MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; + /* + * The HW calculates only the TCP payload checksum, need to complete + * the TCP header checksum and the L3 pseudo-header checksum. + */ + uint32_t csum = phcsum + cqe->csum; + + if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK || + l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) { + tcp->tcp_flags |= RTE_TCP_ACK_FLAG; + tcp->recv_ack = cqe->lro_ack_seq_num; + tcp->rx_win = cqe->lro_tcp_win; + } + if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) + tcp->tcp_flags |= RTE_TCP_PSH_FLAG; + tcp->cksum = 0; + csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); + csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); + csum = (~csum) & 0xffff; + if (csum == 0) + csum = 0xffff; + tcp->cksum = csum; +} + +/** + * Update LRO packet headers. + * The HW LRO feature doesn't update the L3/TCP headers after coalescing the + * TCP segments but supply information in CQE to fill it by SW. + * + * @param padd + * The packet address. + * @param cqe + * Pointer to the completion entry.. + * @param len + * The packet length. 
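+ *
+ * Illustrative caller sketch (as used by the Rx burst routines in this
+ * file once a coalesced LRO packet is received):
+ *   if (cqe->lro_num_seg > 1) {
+ *           mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), cqe, len);
+ *           pkt->ol_flags |= PKT_RX_LRO;
+ *           pkt->tso_segsz = len / cqe->lro_num_seg;
+ *   }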
+ */ +static inline void +mlx5_lro_update_hdr(uint8_t *restrict padd, + volatile struct mlx5_cqe *restrict cqe, + uint32_t len) +{ + union { + struct rte_ether_hdr *eth; + struct rte_vlan_hdr *vlan; + struct rte_ipv4_hdr *ipv4; + struct rte_ipv6_hdr *ipv6; + struct rte_tcp_hdr *tcp; + uint8_t *hdr; + } h = { + .hdr = padd, + }; + uint16_t proto = h.eth->ether_type; + uint32_t phcsum; + + h.eth++; + while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || + proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { + proto = h.vlan->eth_proto; + h.vlan++; + } + if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) { + h.ipv4->time_to_live = cqe->lro_min_ttl; + h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd)); + h.ipv4->hdr_checksum = 0; + h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4); + phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0); + h.ipv4++; + } else { + h.ipv6->hop_limits = cqe->lro_min_ttl; + h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) - + sizeof(*h.ipv6)); + phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); + h.ipv6++; + } + mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); +} + +void +mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) +{ + struct mlx5_mprq_buf *buf = opaque; + + if (rte_atomic16_read(&buf->refcnt) == 1) { + rte_mempool_put(buf->mp, buf); + } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { + rte_atomic16_set(&buf->refcnt, 1); + rte_mempool_put(buf->mp, buf); + } +} + +void +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) +{ + mlx5_mprq_buf_free_cb(NULL, buf); +} + +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * DPDK callback for RX with Multi-Packet RQ support. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). 
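+ *
+ * Illustrative sketch of the CQE byte_cnt decoding performed below (the
+ * masks and shifts are the MLX5_MPRQ_* definitions used in this file):
+ *   strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+ *              MLX5_MPRQ_STRIDE_NUM_SHIFT;
+ *   len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;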
+ */ +uint16_t +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; + const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci; + uint16_t consumed_strd = rxq->consumed_strd; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + + while (i < pkts_n) { + struct rte_mbuf *pkt; + void *addr; + int ret; + uint32_t len; + uint16_t strd_cnt; + uint16_t strd_idx; + uint32_t offset; + uint32_t byte_cnt; + int32_t hdrm_overlap; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res = 0; + + if (consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + consumed_strd = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + } + cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); + if (!ret) + break; + byte_cnt = ret; + strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> + MLX5_MPRQ_STRIDE_NUM_SHIFT; + MLX5_ASSERT(strd_cnt); + consumed_strd += strd_cnt; + if (byte_cnt & MLX5_MPRQ_FILLER_MASK) + continue; + if (mcqe == NULL) { + rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); + strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); + } else { + /* mini-CQE for MPRQ doesn't have hash result. */ + strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); + } + MLX5_ASSERT(strd_idx < strd_n); + MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & + wq_mask)); + pkt = rte_pktmbuf_alloc(rxq->mp); + if (unlikely(pkt == NULL)) { + ++rxq->stats.rx_nombuf; + break; + } + len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; + MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); + if (rxq->crc_present) + len -= RTE_ETHER_CRC_LEN; + offset = strd_idx * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); + hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz; + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. + * - The packet's stride overlaps a headroom and scatter is off. 
+ */ + if (len <= rxq->mprq_max_memcpy_len || + rxq->mprq_repl == NULL || + (hdrm_overlap > 0 && !rxq->strd_scatter_en)) { + if (likely(rte_pktmbuf_tailroom(pkt) >= len)) { + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, len); + DATA_LEN(pkt) = len; + } else if (rxq->strd_scatter_en) { + struct rte_mbuf *prev = pkt; + uint32_t seg_len = + RTE_MIN(rte_pktmbuf_tailroom(pkt), len); + uint32_t rem_len = len - seg_len; + + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, seg_len); + DATA_LEN(pkt) = seg_len; + while (rem_len) { + struct rte_mbuf *next = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(next == NULL)) { + rte_pktmbuf_free(pkt); + ++rxq->stats.rx_nombuf; + goto out; + } + NEXT(prev) = next; + SET_DATA_OFF(next, 0); + addr = RTE_PTR_ADD(addr, seg_len); + seg_len = RTE_MIN + (rte_pktmbuf_tailroom(next), + rem_len); + rte_memcpy + (rte_pktmbuf_mtod(next, void *), + addr, seg_len); + DATA_LEN(next) = seg_len; + rem_len -= seg_len; + prev = next; + ++NB_SEGS(pkt); + } + } else { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_cnt * strd_sz; + void *buf_addr; + + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(buf_addr, buf); + shinfo = &buf->shinfos[strd_idx]; + rte_mbuf_ext_refcnt_set(shinfo, 1); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. */ + SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM); + MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); + MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >= + len - (hdrm_overlap > 0 ? hdrm_overlap : 0)); + DATA_LEN(pkt) = len; + /* + * Copy the last fragment of a packet (up to headroom + * size bytes) in case there is a stride overlap with + * a next packet's headroom. Allocate a separate mbuf + * to store this fragment and link it. Scatter is on. + */ + if (hdrm_overlap > 0) { + MLX5_ASSERT(rxq->strd_scatter_en); + struct rte_mbuf *seg = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(seg == NULL)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.rx_nombuf; + break; + } + SET_DATA_OFF(seg, 0); + rte_memcpy(rte_pktmbuf_mtod(seg, void *), + RTE_PTR_ADD(addr, len - hdrm_overlap), + hdrm_overlap); + DATA_LEN(seg) = hdrm_overlap; + DATA_LEN(pkt) = len - hdrm_overlap; + NEXT(pkt) = seg; + NB_SEGS(pkt) = 2; + } + } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr(addr, cqe, len); + pkt->ol_flags |= PKT_RX_LRO; + pkt->tso_segsz = len / cqe->lro_num_seg; + } + PKT_LEN(pkt) = len; + PORT(pkt) = rxq->port_id; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + ++i; + } +out: + /* Update the consumer indexes. 
*/ + rxq->consumed_strd = consumed_strd; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + +/** + * Dummy DPDK callback for TX. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +removed_tx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +/** + * Dummy DPDK callback for RX. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +removed_rx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +/* + * Vectorized Rx/Tx routines are not compiled in when required vector + * instructions are not supported on a target architecture. The following null + * stubs are needed for linkage when those are not included outside of this file + * (e.g. mlx5_rxtx_vec_sse.c for x86). + */ + +__rte_weak uint16_t +mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + +__rte_weak int +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) +{ + return -ENOTSUP; +} + +__rte_weak int +mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) +{ + return -ENOTSUP; +} + +/** + * Free the mbufs from the linear array of pointers. + * + * @param pkts + * Pointer to array of packets to be free. + * @param pkts_n + * Number of packets to be freed. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + unsigned int olx __rte_unused) +{ + struct rte_mempool *pool = NULL; + struct rte_mbuf **p_free = NULL; + struct rte_mbuf *mbuf; + unsigned int n_free = 0; + + /* + * The implemented algorithm eliminates + * copying pointers to temporary array + * for rte_mempool_put_bulk() calls. + */ + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); + for (;;) { + for (;;) { + /* + * Decrement mbuf reference counter, detach + * indirect and external buffers if needed. + */ + mbuf = rte_pktmbuf_prefree_seg(*pkts); + if (likely(mbuf != NULL)) { + MLX5_ASSERT(mbuf == *pkts); + if (likely(n_free != 0)) { + if (unlikely(pool != mbuf->pool)) + /* From different pool. */ + break; + } else { + /* Start new scan array. */ + pool = mbuf->pool; + p_free = pkts; + } + ++n_free; + ++pkts; + --pkts_n; + if (unlikely(pkts_n == 0)) { + mbuf = NULL; + break; + } + } else { + /* + * This happens if mbuf is still referenced. + * We can't put it back to the pool, skip. 
+ */ + ++pkts; + --pkts_n; + if (unlikely(n_free != 0)) + /* There is some array to free.*/ + break; + if (unlikely(pkts_n == 0)) + /* Last mbuf, nothing to free. */ + return; + } + } + for (;;) { + /* + * This loop is implemented to avoid multiple + * inlining of rte_mempool_put_bulk(). + */ + MLX5_ASSERT(pool); + MLX5_ASSERT(p_free); + MLX5_ASSERT(n_free); + /* + * Free the array of pre-freed mbufs + * belonging to the same memory pool. + */ + rte_mempool_put_bulk(pool, (void *)p_free, n_free); + if (unlikely(mbuf != NULL)) { + /* There is the request to start new scan. */ + pool = mbuf->pool; + p_free = pkts++; + n_free = 1; + --pkts_n; + if (likely(pkts_n != 0)) + break; + /* + * This is the last mbuf to be freed. + * Do one more loop iteration to complete. + * This is rare case of the last unique mbuf. + */ + mbuf = NULL; + continue; + } + if (likely(pkts_n == 0)) + return; + n_free = 0; + break; + } + } +} + +/** + * Free the mbuf from the elts ring buffer till new tail. + * + * @param txq + * Pointer to Tx queue structure. + * @param tail + * Index in elts to free up to, becomes new elts tail. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, + uint16_t tail, + unsigned int olx __rte_unused) +{ + uint16_t n_elts = tail - txq->elts_tail; + + MLX5_ASSERT(n_elts); + MLX5_ASSERT(n_elts <= txq->elts_s); + /* + * Implement a loop to support ring buffer wraparound + * with single inlining of mlx5_tx_free_mbuf(). + */ + do { + unsigned int part; + + part = txq->elts_s - (txq->elts_tail & txq->elts_m); + part = RTE_MIN(part, n_elts); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); + mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], + part, olx); + txq->elts_tail += part; + n_elts -= part; + } while (n_elts); +} + +/** + * Store the mbuf being sent into elts ring buffer. + * On Tx completion these mbufs will be freed. + * + * @param txq + * Pointer to Tx queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + unsigned int olx __rte_unused) +{ + unsigned int part; + struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; + + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); + part = txq->elts_s - (txq->elts_head & txq->elts_m); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); + /* This code is a good candidate for vectorizing with SIMD. */ + rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), + (void *)pkts, + RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *)); + txq->elts_head += pkts_n; + if (unlikely(part < pkts_n)) + /* The copy is wrapping around the elts array. */ + rte_memcpy((void *)elts, (void *)(pkts + part), + (pkts_n - part) * sizeof(struct rte_mbuf *)); +} + +/** + * Update completion queue consuming index via doorbell + * and flush the completed data buffers. + * + * @param txq + * Pointer to TX queue structure. + * @param valid CQE pointer + * if not NULL update txq->wqe_pi and flush the buffers + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ */ +static __rte_always_inline void +mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_cqe *last_cqe, + unsigned int olx __rte_unused) +{ + if (likely(last_cqe != NULL)) { + uint16_t tail; + + txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); + tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; + if (likely(tail != txq->elts_tail)) { + mlx5_tx_free_elts(txq, tail, olx); + MLX5_ASSERT(tail == txq->elts_tail); + } + } +} + +/** + * Manage TX completions. This routine checks the CQ for + * arrived CQEs, deduces the last accomplished WQE in SQ, + * updates SQ producing index and frees all completed mbufs. + * + * @param txq + * Pointer to TX queue structure. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * NOTE: not inlined intentionally, it makes tx_burst + * routine smaller, simple and faster - from experiments. + */ +static void +mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + unsigned int olx __rte_unused) +{ + unsigned int count = MLX5_TX_COMP_MAX_CQE; + volatile struct mlx5_cqe *last_cqe = NULL; + bool ring_doorbell = false; + int ret; + + static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); + static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); + do { + volatile struct mlx5_cqe *cqe; + + cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; + ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); + if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { + if (likely(ret != MLX5_CQE_STATUS_ERR)) { + /* No new CQEs in completion queue. */ + MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); + break; + } + /* + * Some error occurred, try to restart. + * We have no barrier after WQE related Doorbell + * written, make sure all writes are completed + * here, before we might perform SQ reset. + */ + rte_wmb(); + ret = mlx5_tx_error_cqe_handle + (txq, (volatile struct mlx5_err_cqe *)cqe); + if (unlikely(ret < 0)) { + /* + * Some error occurred on queue error + * handling, we do not advance the index + * here, allowing to retry on next call. + */ + return; + } + /* + * We are going to fetch all entries with + * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. + * The send queue is supposed to be empty. + */ + ring_doorbell = true; + ++txq->cq_ci; + txq->cq_pi = txq->cq_ci; + last_cqe = NULL; + continue; + } + /* Normal transmit completion. */ + MLX5_ASSERT(txq->cq_ci != txq->cq_pi); + MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) == + cqe->wqe_counter); + ring_doorbell = true; + ++txq->cq_ci; + last_cqe = cqe; + /* + * We have to restrict the amount of processed CQEs + * in one tx_burst routine call. The CQ may be large + * and many CQEs may be updated by the NIC in one + * transaction. Buffers freeing is time consuming, + * multiple iterations may introduce significant + * latency. + */ + if (likely(--count == 0)) + break; + } while (true); + if (likely(ring_doorbell)) { + /* Ring doorbell to notify hardware. */ + rte_compiler_barrier(); + *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); + mlx5_tx_comp_flush(txq, last_cqe, olx); + } +} + +/** + * Check if the completion request flag should be set in the last WQE. + * Both pushed mbufs and WQEs are monitored and the completion request + * flag is set if any of thresholds is reached. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ */ +static __rte_always_inline void +mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + uint16_t head = txq->elts_head; + unsigned int part; + + part = MLX5_TXOFF_CONFIG(INLINE) ? + 0 : loc->pkts_sent - loc->pkts_copy; + head += part; + if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || + (MLX5_TXOFF_CONFIG(INLINE) && + (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { + volatile struct mlx5_wqe *last = loc->wqe_last; + + MLX5_ASSERT(last); + txq->elts_comp = head; + if (MLX5_TXOFF_CONFIG(INLINE)) + txq->wqe_comp = txq->wqe_ci; + /* Request unconditional completion on last WQE. */ + last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << + MLX5_COMP_MODE_OFFSET); + /* Save elts_head in dedicated free on completion queue. */ +#ifdef RTE_LIBRTE_MLX5_DEBUG + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | + (last->cseg.opcode >> 8) << 16; +#else + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; +#endif + /* A CQE slot must always be available. */ + MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); + } +} + +/** + * DPDK callback to check the status of a tx descriptor. + * + * @param tx_queue + * The tx queue. + * @param[in] offset + * The index of the descriptor in the ring. + * + * @return + * The status of the tx descriptor. + */ +int +mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) +{ + struct mlx5_txq_data *restrict txq = tx_queue; + uint16_t used; + + mlx5_tx_handle_completion(txq, 0); + used = txq->elts_head - txq->elts_tail; + if (offset < used) + return RTE_ETH_TX_DESC_FULL; + return RTE_ETH_TX_DESC_DONE; +} + +/** + * Build the Control Segment with specified opcode: + * - MLX5_OPCODE_SEND + * - MLX5_OPCODE_ENHANCED_MPSW + * - MLX5_OPCODE_TSO + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Control Segment. + * @param ds + * Supposed length of WQE in segments. + * @param opcode + * SQ WQE opcode to put into Control Segment. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe *restrict wqe, + unsigned int ds, + unsigned int opcode, + unsigned int olx __rte_unused) +{ + struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; + + /* For legacy MPW replace the EMPW by TSO with modifier. */ + if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) + opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; + cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); + cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << + MLX5_COMP_MODE_OFFSET); + cs->misc = RTE_BE32(0); +} + +/** + * Build the Ethernet Segment without inlined data. + * Supports Software Parser, Checksums and VLAN + * insertion Tx offload features. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet Segment. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
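+ *
+ * Note (illustration only): since olx is a compile-time constant for each
+ * specialized burst variant, checks such as
+ *   if (MLX5_TXOFF_CONFIG(VLAN) &&
+ *       loc->mbuf->ol_flags & PKT_TX_VLAN_PKT)
+ * are intended to be folded by the compiler, dropping unused offload code
+ * from the generated routine.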
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ /* Engage VLAN tag insertion feature if requested. */
+ if (MLX5_TXOFF_CONFIG(VLAN) &&
+ loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+ /*
+ * We should get here only if the device supports
+ * this feature correctly.
+ */
+ MLX5_ASSERT(txq->vlan_en);
+ es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
+ loc->mbuf->vlan_tci);
+ } else {
+ es->inline_hdr = RTE_BE32(0);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with minimal inlined data
+ * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
+ * used to fill the gap in single WQEBB WQEs.
+ * Supports Software Parser, Checksums and VLAN
+ * insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ es->flags = rte_cpu_to_le_32(csum);
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+ /* Implement VLAN tag insertion as part of inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ /* Copy the remaining two bytes from packet data. */
+ MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+ *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+ } else {
+ /* Fill the gap in the title WQEBB with inline data. */
+ rte_mov16(pdst, psrc);
+ }
+}
+
+/**
+ * Build the Ethernet Segment with entire packet
+ * data inlining. Checks the boundary of WQEBB and
+ * ring buffer wrapping, supports Software Parser,
+ * Checksums and VLAN insertion Tx offload features.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param wqe
+ * Pointer to WQE to fill with built Ethernet Segment.
+ * @param vlan
+ * Length of VLAN tag insertion if any.
+ * @param inlen
+ * Length of data to inline (VLAN included, if any).
+ * @param tso
+ * TSO flag, set mss field from the packet.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * @return
+ * Pointer to the next Data Segment (aligned and wrapped around).
+ */
+static __rte_always_inline struct mlx5_wqe_dseg *
+mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ struct mlx5_wqe *restrict wqe,
+ unsigned int vlan,
+ unsigned int inlen,
+ unsigned int tso,
+ unsigned int olx)
+{
+ struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+ uint32_t csum;
+ uint8_t *psrc, *pdst;
+ unsigned int part;
+
+ /*
+ * Calculate and set check sum flags first, dword field
+ * in segment may be shared with Software Parser flags.
+ */
+ csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
+ if (tso) {
+ csum <<= 24;
+ csum |= loc->mbuf->tso_segsz;
+ es->flags = rte_cpu_to_be_32(csum);
+ } else {
+ es->flags = rte_cpu_to_le_32(csum);
+ }
+ /*
+ * Calculate and set Software Parser offsets and flags.
+ * These flags are set for custom UDP and IP tunnel packets.
+ */
+ es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
+ /* Fill metadata field if needed. */
+ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
+ loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0;
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(rte_v128u32_t)),
+ "invalid Ethernet Segment data size");
+ static_assert(MLX5_ESEG_MIN_INLINE_SIZE ==
+ (sizeof(uint16_t) +
+ sizeof(struct rte_vlan_hdr) +
+ 2 * RTE_ETHER_ADDR_LEN),
+ "invalid Ethernet Segment data size");
+ psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
+ es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
+ es->inline_data = *(unaligned_uint16_t *)psrc;
+ psrc += sizeof(uint16_t);
+ pdst = (uint8_t *)(es + 1);
+ if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
+ /* Implement VLAN tag insertion as part of inline data. */
+ memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
+ pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
+ /* Insert VLAN ethertype + VLAN tag. */
+ *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
+ ((RTE_ETHER_TYPE_VLAN << 16) |
+ loc->mbuf->vlan_tci);
+ pdst += sizeof(struct rte_vlan_hdr);
+ /* Copy the remaining two bytes from packet data.
 */
+		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+		*(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
+		psrc += sizeof(uint16_t);
+	} else {
+		/* Fill the gap in the title WQEBB with inline data. */
+		rte_mov16(pdst, psrc);
+		psrc += sizeof(rte_v128u32_t);
+	}
+	pdst = (uint8_t *)(es + 2);
+	MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+	MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
+	inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
+	if (!inlen) {
+		MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+		return (struct mlx5_wqe_dseg *)pdst;
+	}
+	/*
+	 * The WQEBB space availability is checked by caller.
+	 * Here we should be aware of WQE ring buffer wraparound only.
+	 */
+	part = (uint8_t *)txq->wqes_end - pdst;
+	part = RTE_MIN(part, inlen);
+	do {
+		rte_memcpy(pdst, psrc, part);
+		inlen -= part;
+		if (likely(!inlen)) {
+			/*
+			 * If return value is not used by the caller
+			 * the code below will be optimized out.
+			 */
+			pdst += part;
+			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
+				pdst = (uint8_t *)txq->wqes;
+			return (struct mlx5_wqe_dseg *)pdst;
+		}
+		pdst = (uint8_t *)txq->wqes;
+		psrc += part;
+		part = inlen;
+	} while (true);
+}
+
+/**
+ * Copy data from a chain of mbufs to the specified linear buffer.
+ * If data from some mbuf is copied completely, this mbuf is freed.
+ * The local structure is used to keep the byte stream state.
+ *
+ * @param pdst
+ *   Pointer to the destination linear buffer.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Length of data to be copied.
+ * @param must
+ *   Length of data to be copied ignoring the no-inline hint.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Number of actually copied data bytes. This is always greater than or
+ *   equal to the must parameter and might be less than len if the
+ *   no-inline hint flag is encountered.
+ */
+static __rte_always_inline unsigned int
+mlx5_tx_mseg_memcpy(uint8_t *pdst,
+		    struct mlx5_txq_local *restrict loc,
+		    unsigned int len,
+		    unsigned int must,
+		    unsigned int olx __rte_unused)
+{
+	struct rte_mbuf *mbuf;
+	unsigned int part, dlen, copy = 0;
+	uint8_t *psrc;
+
+	MLX5_ASSERT(len);
+	MLX5_ASSERT(must <= len);
+	do {
+		/* Allow zero length packets, must check first. */
+		dlen = rte_pktmbuf_data_len(loc->mbuf);
+		if (dlen <= loc->mbuf_off) {
+			/* Exhausted packet, just free. */
+			mbuf = loc->mbuf;
+			loc->mbuf = mbuf->next;
+			rte_pktmbuf_free_seg(mbuf);
+			loc->mbuf_off = 0;
+			MLX5_ASSERT(loc->mbuf_nseg > 1);
+			MLX5_ASSERT(loc->mbuf);
+			--loc->mbuf_nseg;
+			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
+				unsigned int diff;
+
+				if (copy >= must) {
+					/*
+					 * We already copied the minimal
+					 * requested amount of data.
+					 */
+					return copy;
+				}
+				diff = must - copy;
+				if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
+					/*
+					 * Copy only the minimal required
+					 * part of the data buffer.
+					 */
+					len = diff;
+				}
+			}
+			continue;
+		}
+		dlen -= loc->mbuf_off;
+		psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
+					       loc->mbuf_off);
+		part = RTE_MIN(len, dlen);
+		rte_memcpy(pdst, psrc, part);
+		copy += part;
+		loc->mbuf_off += part;
+		len -= part;
+		if (!len) {
+			if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
+				loc->mbuf_off = 0;
+				/* Exhausted packet, just free.
*/ + mbuf = loc->mbuf; + loc->mbuf = mbuf->next; + rte_pktmbuf_free_seg(mbuf); + loc->mbuf_off = 0; + MLX5_ASSERT(loc->mbuf_nseg >= 1); + --loc->mbuf_nseg; + } + return copy; + } + pdst += part; + } while (true); +} + +/** + * Build the Ethernet Segment with inlined data from + * multi-segment packet. Checks the boundary of WQEBB + * and ring buffer wrapping, supports Software Parser, + * Checksums and VLAN insertion Tx offload features. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet Segment. + * @param vlan + * Length of VLAN tag insertion if any. + * @param inlen + * Length of data to inline (VLAN included, if any). + * @param tso + * TSO flag, set mss field from the packet. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment (aligned and + * possible NOT wrapped around - caller should do + * wrapping check on its own). + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe *restrict wqe, + unsigned int vlan, + unsigned int inlen, + unsigned int tso, + unsigned int olx) +{ + struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + uint32_t csum; + uint8_t *pdst; + unsigned int part, tlen = 0; + + /* + * Calculate and set check sum flags first, uint32_t field + * in segment may be shared with Software Parser flags. + */ + csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0; + if (tso) { + csum <<= 24; + csum |= loc->mbuf->tso_segsz; + es->flags = rte_cpu_to_be_32(csum); + } else { + es->flags = rte_cpu_to_le_32(csum); + } + /* + * Calculate and set Software Parser offsets and flags. + * These flags a set for custom UDP and IP tunnel packets. + */ + es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); + /* Fill metadata field if needed. */ + es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; + static_assert(MLX5_ESEG_MIN_INLINE_SIZE == + (sizeof(uint16_t) + + sizeof(rte_v128u32_t)), + "invalid Ethernet Segment data size"); + static_assert(MLX5_ESEG_MIN_INLINE_SIZE == + (sizeof(uint16_t) + + sizeof(struct rte_vlan_hdr) + + 2 * RTE_ETHER_ADDR_LEN), + "invalid Ethernet Segment data size"); + MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); + pdst = (uint8_t *)&es->inline_data; + if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { + /* Implement VLAN tag insertion as part inline data. */ + mlx5_tx_mseg_memcpy(pdst, loc, + 2 * RTE_ETHER_ADDR_LEN, + 2 * RTE_ETHER_ADDR_LEN, olx); + pdst += 2 * RTE_ETHER_ADDR_LEN; + *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 + ((RTE_ETHER_TYPE_VLAN << 16) | + loc->mbuf->vlan_tci); + pdst += sizeof(struct rte_vlan_hdr); + tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); + } + MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. + */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, inlen - tlen); + MLX5_ASSERT(part); + do { + unsigned int copy; + + /* + * Copying may be interrupted inside the routine + * if run into no inline hint flag. + */ + copy = tlen >= txq->inlen_mode ? 
0 : (txq->inlen_mode - tlen); + copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); + tlen += copy; + if (likely(inlen <= tlen) || copy < part) { + es->inline_hdr_sz = rte_cpu_to_be_16(tlen); + pdst += copy; + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + part = inlen - tlen; + } while (true); +} + +/** + * Build the Data Segment of pointer type. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + MLX5_ASSERT(len); + dseg->bcount = rte_cpu_to_be_32(len); + dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); + dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); +} + +/** + * Build the Data Segment of pointer type or inline + * if data length is less than buffer in minimal + * Data Segment size. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + uintptr_t dst, src; + + MLX5_ASSERT(len); + if (len > MLX5_DSEG_MIN_INLINE_SIZE) { + dseg->bcount = rte_cpu_to_be_32(len); + dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); + dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); + + return; + } + dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); + /* Unrolled implementation of generic rte_memcpy. */ + dst = (uintptr_t)&dseg->inline_data[0]; + src = (uintptr_t)buf; + if (len & 0x08) { +#ifdef RTE_ARCH_STRICT_ALIGN + MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); +#else + *(uint64_t *)dst = *(unaligned_uint64_t *)src; + dst += sizeof(uint64_t); + src += sizeof(uint64_t); +#endif + } + if (len & 0x04) { + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); + } + if (len & 0x02) { + *(uint16_t *)dst = *(unaligned_uint16_t *)src; + dst += sizeof(uint16_t); + src += sizeof(uint16_t); + } + if (len & 0x01) + *(uint8_t *)dst = *(uint8_t *)src; +} + +/** + * Build the Data Segment of inlined data from single + * segment packet, no VLAN insertion. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to WQE to fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. 
+ * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment after inlined data. + * Ring buffer wraparound check is needed. We do not + * do it here because it may not be needed for the + * last packet in the eMPW session. + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) +{ + unsigned int part; + uint8_t *pdst; + + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. + */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, len); + do { + rte_memcpy(pdst, buf, part); + len -= part; + if (likely(!len)) { + pdst += part; + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + buf += part; + part = len; + } while (true); +} + +/** + * Build the Data Segment of inlined data from single + * segment packet with VLAN insertion. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param dseg + * Pointer to the dseg fill with built Data Segment. + * @param buf + * Data buffer to point. + * @param len + * Data buffer length. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Pointer to the next Data Segment after inlined data. + * Ring buffer wraparound check is needed. + */ +static __rte_always_inline struct mlx5_wqe_dseg * +mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe_dseg *restrict dseg, + uint8_t *buf, + unsigned int len, + unsigned int olx __rte_unused) + +{ + unsigned int part; + uint8_t *pdst; + + MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); + static_assert(MLX5_DSEG_MIN_INLINE_SIZE == + (2 * RTE_ETHER_ADDR_LEN), + "invalid Data Segment data size"); + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32 + ((len + sizeof(struct rte_vlan_hdr)) | + MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } + memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); + buf += MLX5_DSEG_MIN_INLINE_SIZE; + pdst += MLX5_DSEG_MIN_INLINE_SIZE; + len -= MLX5_DSEG_MIN_INLINE_SIZE; + /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); + if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) + pdst = (uint8_t *)txq->wqes; + *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | + loc->mbuf->vlan_tci); + pdst += sizeof(struct rte_vlan_hdr); + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. 
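+	 * The copy loop below splits the data at txq->wqes_end and
+	 * continues from the ring base (txq->wqes) if the end of the
+	 * WQE buffer is reached.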
+ */ + part = (uint8_t *)txq->wqes_end - pdst; + part = RTE_MIN(part, len); + do { + rte_memcpy(pdst, buf, part); + len -= part; + if (likely(!len)) { + pdst += part; + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } + pdst = (uint8_t *)txq->wqes; + buf += part; + part = len; + } while (true); +} + +/** + * Build the Ethernet Segment with optionally inlined data with + * VLAN insertion and following Data Segments (if any) from + * multi-segment packet. Used by ordinary send and TSO. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Ethernet/Data Segments. + * @param vlan + * Length of VLAN header to insert, 0 means no VLAN insertion. + * @param inlen + * Data length to inline. For TSO this parameter specifies + * exact value, for ordinary send routine can be aligned by + * caller to provide better WQE space saving and data buffer + * start address alignment. This length includes VLAN header + * being inserted. + * @param tso + * Zero means ordinary send, inlined data can be extended, + * otherwise this is TSO, inlined data length is fixed. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * Actual size of built WQE in segments. + */ +static __rte_always_inline unsigned int +mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + struct mlx5_wqe *restrict wqe, + unsigned int vlan, + unsigned int inlen, + unsigned int tso, + unsigned int olx __rte_unused) +{ + struct mlx5_wqe_dseg *restrict dseg; + unsigned int ds; + + MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); + loc->mbuf_nseg = NB_SEGS(loc->mbuf); + loc->mbuf_off = 0; + + dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx); + if (!loc->mbuf_nseg) + goto dseg_done; + /* + * There are still some mbuf remaining, not inlined. + * The first mbuf may be partially inlined and we + * must process the possible non-zero data offset. + */ + if (loc->mbuf_off) { + unsigned int dlen; + uint8_t *dptr; + + /* + * Exhausted packets must be dropped before. + * Non-zero offset means there are some data + * remained in the packet. + */ + MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); + MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); + dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, + loc->mbuf_off); + dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; + /* + * Build the pointer/minimal data Data Segment. + * Do ring buffer wrapping check in advance. + */ + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); + /* Store the mbuf to be freed on completion. */ + MLX5_ASSERT(loc->elts_free); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++dseg; + if (--loc->mbuf_nseg == 0) + goto dseg_done; + loc->mbuf = loc->mbuf->next; + loc->mbuf_off = 0; + } + do { + if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { + struct rte_mbuf *mbuf; + + /* Zero length segment found, just skip. 
*/ + mbuf = loc->mbuf; + loc->mbuf = loc->mbuf->next; + rte_pktmbuf_free_seg(mbuf); + if (--loc->mbuf_nseg == 0) + break; + } else { + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + mlx5_tx_dseg_iptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + MLX5_ASSERT(loc->elts_free); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++dseg; + if (--loc->mbuf_nseg == 0) + break; + loc->mbuf = loc->mbuf->next; + } + } while (true); + +dseg_done: + /* Calculate actual segments used from the dseg pointer. */ + if ((uintptr_t)wqe < (uintptr_t)dseg) + ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE; + else + ds = (((uintptr_t)dseg - (uintptr_t)wqe) + + txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE; + return ds; +} + +/** + * Tx one packet function for multi-segment TSO. Supports all + * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs, + * sends one packet per WQE. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe *restrict wqe; + unsigned int ds, dlen, inlen, ntcp, vlan = 0; + + /* + * Calculate data length to be inlined to estimate + * the required space in WQE ring buffer. + */ + dlen = rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + vlan = sizeof(struct rte_vlan_hdr); + inlen = loc->mbuf->l2_len + vlan + + loc->mbuf->l3_len + loc->mbuf->l4_len; + if (unlikely((!inlen || !loc->mbuf->tso_segsz))) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) + inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len; + /* Packet must contain all TSO headers. */ + if (unlikely(inlen > MLX5_MAX_TSO_HEADER || + inlen <= MLX5_ESEG_MIN_INLINE_SIZE || + inlen > (dlen + vlan))) + return MLX5_TXCMP_CODE_ERROR; + MLX5_ASSERT(inlen >= txq->inlen_mode); + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Data Segments of pointer/min inline type + */ + ds = NB_SEGS(loc->mbuf) + 2 + (inlen - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) / + loc->mbuf->tso_segsz; + /* + * One will be added for mbuf itself + * at the end of the mlx5_tx_burst from + * loc->pkts_sent field. 
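+	 * (Example with assumed values: dlen = 4554, inlen = 54,
+	 * vlan = 0 and tso_segsz = 1500 give ntcp = 3 TSO segments;
+	 * after the decrement two are counted here and one later in
+	 * the burst loop, and obytes grows by 4554 + 2 * 54 = 4662
+	 * bytes, i.e. 3 * (1500 + 54).)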
+ */ + --ntcp; + txq->stats.opackets += ntcp; + txq->stats.obytes += dlen + vlan + ntcp * inlen; +#endif + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx); + ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx); + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx one packet function for multi-segment SEND. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, + * sends one packet per WQE, without any data inlining in + * Ethernet Segment. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqe; + unsigned int ds, nseg; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * No inline at all, it means the CPU cycles saving + * is prioritized at configuration, we should not + * copy any packet data to WQE. + */ + nseg = NB_SEGS(loc->mbuf); + ds = 2 + nseg; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; + /* + * Some Tx offloads may cause an error if + * packet is not long enough, check against + * assumed minimal length. + */ + if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + txq->stats.obytes += sizeof(struct rte_vlan_hdr); +#endif + /* + * SEND WQE, one WQEBB: + * - Control Segment, SEND opcode + * - Ethernet Segment, optional VLAN, no inline + * - Data Segments, pointer only type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_none(txq, loc, wqe, olx); + dseg = &wqe->dseg[0]; + do { + if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) { + struct rte_mbuf *mbuf; + + /* + * Zero length segment found, have to + * correct total size of WQE in segments. + * It is supposed to be rare occasion, so + * in normal case (no zero length segments) + * we avoid extra writing to the Control + * Segment. 
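+			 * Subtracting RTE_BE32(1) below decrements the DS
+			 * count kept in the low bits of the big-endian
+			 * sq_ds field without converting it to host order.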
+ */ + --ds; + wqe->cseg.sq_ds -= RTE_BE32(1); + mbuf = loc->mbuf; + loc->mbuf = mbuf->next; + rte_pktmbuf_free_seg(mbuf); + if (--nseg == 0) + break; + } else { + mlx5_tx_dseg_ptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + if (--nseg == 0) + break; + ++dseg; + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + loc->mbuf = loc->mbuf->next; + } + } while (true); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx one packet function for multi-segment SEND. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs, + * sends one packet per WQE, with data inlining in + * Ethernet Segment and minimal Data Segments. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * Local context variables partially updated. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + struct mlx5_wqe *restrict wqe; + unsigned int ds, inlen, dlen, vlan = 0; + + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * First calculate data length to be inlined + * to estimate the required space for WQE. + */ + dlen = rte_pktmbuf_pkt_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + vlan = sizeof(struct rte_vlan_hdr); + inlen = dlen + vlan; + /* Check against minimal length. */ + if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; + MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); + if (inlen > txq->inlen_send || + loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { + struct rte_mbuf *mbuf; + unsigned int nxlen; + uintptr_t start; + + /* + * Packet length exceeds the allowed inline + * data length, check whether the minimal + * inlining is required. + */ + if (txq->inlen_mode) { + MLX5_ASSERT(txq->inlen_mode >= + MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send); + inlen = txq->inlen_mode; + } else { + if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE || + !vlan || txq->vlan_en) { + /* + * VLAN insertion will be done inside by HW. + * It is not utmost effective - VLAN flag is + * checked twice, but we should proceed the + * inlining length correctly and take into + * account the VLAN header being inserted. + */ + return mlx5_tx_packet_multi_send + (txq, loc, olx); + } + inlen = MLX5_ESEG_MIN_INLINE_SIZE; + } + /* + * Now we know the minimal amount of data is requested + * to inline. Check whether we should inline the buffers + * from the chain beginning to eliminate some mbufs. + */ + mbuf = loc->mbuf; + nxlen = rte_pktmbuf_data_len(mbuf); + if (unlikely(nxlen <= txq->inlen_send)) { + /* We can inline first mbuf at least. */ + if (nxlen < inlen) { + unsigned int smlen; + + /* Scan mbufs till inlen filled. 
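+				 * Segment lengths are accumulated until the
+				 * requested inline amount is covered, so the
+				 * leading mbufs can be inlined entirely and
+				 * freed during the copy.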
*/ + do { + smlen = nxlen; + mbuf = NEXT(mbuf); + MLX5_ASSERT(mbuf); + nxlen = rte_pktmbuf_data_len(mbuf); + nxlen += smlen; + } while (unlikely(nxlen < inlen)); + if (unlikely(nxlen > txq->inlen_send)) { + /* We cannot inline entire mbuf. */ + smlen = inlen - smlen; + start = rte_pktmbuf_mtod_offset + (mbuf, uintptr_t, smlen); + goto do_align; + } + } + do { + inlen = nxlen; + mbuf = NEXT(mbuf); + /* There should be not end of packet. */ + MLX5_ASSERT(mbuf); + nxlen = inlen + rte_pktmbuf_data_len(mbuf); + } while (unlikely(nxlen < txq->inlen_send)); + } + start = rte_pktmbuf_mtod(mbuf, uintptr_t); + /* + * Check whether we can do inline to align start + * address of data buffer to cacheline. + */ +do_align: + start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1); + if (unlikely(start)) { + start += inlen; + if (start <= txq->inlen_send) + inlen = start; + } + } + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Data Segments of pointer/min inline type + * + * Estimate the number of Data Segments conservatively, + * supposing no any mbufs is being freed during inlining. + */ + MLX5_ASSERT(inlen <= txq->inlen_send); + ds = NB_SEGS(loc->mbuf) + 2 + (inlen - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (unlikely(loc->wqe_free < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + /* Check for maximal WQE size. */ + if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4))) + return MLX5_TXCMP_CODE_ERROR; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + txq->stats.obytes += dlen + vlan; +#endif + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx); + ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx); + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + return MLX5_TXCMP_CODE_MULTI; +} + +/** + * Tx burst function for multi-segment packets. Supports all + * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs, + * sends one packet per WQE. Function stops sending if it + * encounters the single-segment packet. + * + * This routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. + * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered. + * Local context variables updated. 
+ */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + enum mlx5_txcmp_code ret; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + /* + * Estimate the number of free elts quickly but + * conservatively. Some segment may be fully inlined + * and freed, ignore this here - precise estimation + * is costly. + */ + if (loc->elts_free < NB_SEGS(loc->mbuf)) + return MLX5_TXCMP_CODE_EXIT; + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) { + /* Proceed with multi-segment TSO. */ + ret = mlx5_tx_packet_multi_tso(txq, loc, olx); + } else if (MLX5_TXOFF_CONFIG(INLINE)) { + /* Proceed with multi-segment SEND with inlining. */ + ret = mlx5_tx_packet_multi_inline(txq, loc, olx); + } else { + /* Proceed with multi-segment SEND w/o inlining. */ + ret = mlx5_tx_packet_multi_send(txq, loc, olx); + } + if (ret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (ret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + /* WQE is built, go to the next packet. */ + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + if (likely(NB_SEGS(loc->mbuf) > 1)) + continue; + /* Here ends the series of multi-segment packets. */ + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) + return MLX5_TXCMP_CODE_TSO; + return MLX5_TXCMP_CODE_SINGLE; + } + MLX5_ASSERT(false); +} + +/** + * Tx burst function for single-segment packets with TSO. + * Supports all types of Tx offloads, except multi-packets. + * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE. + * Function stops sending if it encounters the multi-segment + * packet or packet without TSO requested. + * + * The routine is responsible for storing processed mbuf + * into elts ring buffer and update elts_head if inline + * offloads is requested due to possible early freeing + * of the inlined mbufs (can not store pkts array in elts + * as a batch). + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered. + * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. + * Local context variables updated. 
+ */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqe; + unsigned int ds, dlen, hlen, ntcp, vlan = 0; + uint8_t *dptr; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + dlen = rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + vlan = sizeof(struct rte_vlan_hdr); + } + /* + * First calculate the WQE size to check + * whether we have enough space in ring buffer. + */ + hlen = loc->mbuf->l2_len + vlan + + loc->mbuf->l3_len + loc->mbuf->l4_len; + if (unlikely((!hlen || !loc->mbuf->tso_segsz))) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & PKT_TX_TUNNEL_MASK) + hlen += loc->mbuf->outer_l2_len + + loc->mbuf->outer_l3_len; + /* Segment must contain all TSO headers. */ + if (unlikely(hlen > MLX5_MAX_TSO_HEADER || + hlen <= MLX5_ESEG_MIN_INLINE_SIZE || + hlen > (dlen + vlan))) + return MLX5_TXCMP_CODE_ERROR; + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Finishing Data Segment of pointer type + */ + ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (loc->wqe_free < ((ds + 3) / 4)) + return MLX5_TXCMP_CODE_EXIT; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes/packets counters. */ + ntcp = (dlen + vlan - hlen + + loc->mbuf->tso_segsz - 1) / + loc->mbuf->tso_segsz; + /* + * One will be added for mbuf itself at the end + * of the mlx5_tx_burst from loc->pkts_sent field. + */ + --ntcp; + txq->stats.opackets += ntcp; + txq->stats.obytes += dlen + vlan + ntcp * hlen; +#endif + /* + * Build the TSO WQE: + * - Control Segment + * - Ethernet Segment with hlen bytes inlined + * - Data Segment of pointer type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, + MLX5_OPCODE_TSO, olx); + dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan; + dlen -= hlen - vlan; + mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); + /* + * WQE is built, update the loop parameters + * and go to the next packet. + */ + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + if (MLX5_TXOFF_CONFIG(INLINE)) + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + --loc->elts_free; + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + if (MLX5_TXOFF_CONFIG(MULTI) && + unlikely(NB_SEGS(loc->mbuf) > 1)) + return MLX5_TXCMP_CODE_MULTI; + if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) + return MLX5_TXCMP_CODE_SINGLE; + /* Continue with the next TSO packet. */ + } + MLX5_ASSERT(false); +} + +/** + * Analyze the packet and select the best method to send. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+ * @param newp
+ *   The predefined flag whether to do a complete check for
+ *   multi-segment packets and TSO.
+ *
+ * @return
+ *  MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
+ *  MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
+ *  MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
+ *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
+		     struct mlx5_txq_local *restrict loc,
+		     unsigned int olx,
+		     bool newp)
+{
+	/* Check for multi-segment packet. */
+	if (newp &&
+	    MLX5_TXOFF_CONFIG(MULTI) &&
+	    unlikely(NB_SEGS(loc->mbuf) > 1))
+		return MLX5_TXCMP_CODE_MULTI;
+	/* Check for TSO packet. */
+	if (newp &&
+	    MLX5_TXOFF_CONFIG(TSO) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))
+		return MLX5_TXCMP_CODE_TSO;
+	/* Check if eMPW is enabled at all. */
+	if (!MLX5_TXOFF_CONFIG(EMPW))
+		return MLX5_TXCMP_CODE_SINGLE;
+	/* Check if eMPW can be engaged. */
+	if (MLX5_TXOFF_CONFIG(VLAN) &&
+	    unlikely(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) &&
+	    (!MLX5_TXOFF_CONFIG(INLINE) ||
+	     unlikely((rte_pktmbuf_data_len(loc->mbuf) +
+		       sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
+		/*
+		 * eMPW does not support VLAN insertion offload,
+		 * we have to inline the entire packet but
+		 * packet is too long for inlining.
+		 */
+		return MLX5_TXCMP_CODE_SINGLE;
+	}
+	return MLX5_TXCMP_CODE_EMPW;
+}
+
+/**
+ * Check whether the next packet attributes match the eMPW batch ones.
+ * In addition, for legacy MPW the packet length is checked as well.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param es
+ *   Pointer to Ethernet Segment of eMPW batch.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param dlen
+ *   Length of previous packet in MPW descriptor.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *  true - packet matches the eMPW batch attributes.
+ *  false - no match, eMPW should be restarted.
+ */
+static __rte_always_inline bool
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+		   struct mlx5_wqe_eseg *restrict es,
+		   struct mlx5_txq_local *restrict loc,
+		   uint32_t dlen,
+		   unsigned int olx)
+{
+	uint8_t swp_flags = 0;
+
+	/* Compare the checksum flags, if any. */
+	if (MLX5_TXOFF_CONFIG(CSUM) &&
+	    txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
+		return false;
+	/* Compare the Software Parser offsets and flags. */
+	if (MLX5_TXOFF_CONFIG(SWP) &&
+	    (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
+	     es->swp_flags != swp_flags))
+		return false;
+	/* Compare metadata field if needed. */
+	if (MLX5_TXOFF_CONFIG(METADATA) &&
+	    es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
+			     *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
+		return false;
+	/* Legacy MPW can send packets with the same length only. */
+	if (MLX5_TXOFF_CONFIG(MPW) &&
+	    dlen != rte_pktmbuf_data_len(loc->mbuf))
+		return false;
+	/* There must be no VLAN packets in eMPW loop. */
+	if (MLX5_TXOFF_CONFIG(VLAN))
+		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	return true;
+}
+
+/*
+ * Update send loop variables and WQE for eMPW loop
+ * without data inlining. Number of Data Segments is
+ * equal to the number of sent packets.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param ds
+ *   Number of packets/Data Segments in the eMPW session.
+ * @param slen
+ *   Accumulated statistics, number of bytes sent.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int ds,
+		   unsigned int slen,
+		   unsigned int olx __rte_unused)
+{
+	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	loc->elts_free -= ds;
+	loc->pkts_sent += ds;
+	ds += 2;
+	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
+	txq->wqe_ci += (ds + 3) / 4;
+	loc->wqe_free -= (ds + 3) / 4;
+}
+
+/*
+ * Update send loop variables and WQE for eMPW loop
+ * with data inlining. Takes the total size of the
+ * descriptors and data pushed to the WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param len
+ *   Total size of descriptor/data in bytes.
+ * @param slen
+ *   Accumulated statistics, number of data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
+		   struct mlx5_txq_local *restrict loc,
+		   unsigned int len,
+		   unsigned int slen,
+		   struct mlx5_wqe *restrict wqem,
+		   unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
+	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Update sent data bytes counter. */
+	txq->stats.obytes += slen;
+#else
+	(void)slen;
+#endif
+	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+		/*
+		 * If the legacy MPW session contains the inline packets
+		 * we should set the only inline data segment length
+		 * and align the total length to the segment size.
+		 */
+		MLX5_ASSERT(len > sizeof(dseg->bcount));
+		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+						MLX5_ETH_WQE_DATA_INLINE);
+		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+	} else {
+		/*
+		 * The session is not legacy MPW or contains the
+		 * data buffer pointer segments.
+		 */
+		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+		len = len / MLX5_WSEG_SIZE + 2;
+	}
+	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+	txq->wqe_ci += (len + 3) / 4;
+	loc->wqe_free -= (len + 3) / 4;
+	loc->wqe_last = wqem;
+}
+
+/**
+ * The set of Tx burst functions for single-segment packets
+ * without TSO and with Multi-Packet Writing feature support.
+ * Supports all types of Tx offloads, except multi-packets
+ * and TSO.
+ *
+ * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends
+ * as many packets per WQE as it can. If eMPW is not configured
+ * or the packet can not be sent with eMPW (VLAN insertion) the
+ * ordinary SEND opcode is used and only one packet is placed
+ * in the WQE.
+ *
+ * The functions stop sending if they encounter a multi-segment
+ * packet or a packet with TSO requested.
+ *
+ * The routines are responsible for storing the processed mbuf
+ * into the elts ring buffer and updating elts_head if the inlining
+ * offload is requested.
Otherwise the copying mbufs to elts + * can be postponed and completed at the end of burst routine. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param loc + * Pointer to burst routine local context. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + * + * @return + * MLX5_TXCMP_CODE_EXIT - sending is done or impossible. + * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred. + * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered. + * MLX5_TXCMP_CODE_TSO - TSO packet encountered. + * MLX5_TXCMP_CODE_SINGLE - used inside functions set. + * MLX5_TXCMP_CODE_EMPW - used inside functions set. + * + * Local context variables updated. + * + * + * The routine sends packets with MLX5_OPCODE_EMPW + * without inlining, this is dedicated optimized branch. + * No VLAN insertion is supported. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with eMPW opcode + * without data inlining. + */ + MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe_eseg *restrict eseg; + enum mlx5_txcmp_code ret; + unsigned int part, loop; + unsigned int slen = 0; + +next_empw: + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); + if (unlikely(loc->elts_free < part)) { + /* We have no enough elts to save all mbufs. */ + if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) + return MLX5_TXCMP_CODE_EXIT; + /* But we still able to send at least minimal eMPW. */ + part = loc->elts_free; + } + /* Check whether we have enough WQEs */ + if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) { + if (unlikely(loc->wqe_free < + ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + part = (loc->wqe_free * 4) - 2; + } + if (likely(part > 1)) + rte_prefetch0(*pkts); + loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); + /* + * Build eMPW title WQEBB: + * - Control Segment, eMPW opcode + * - Ethernet Segment, no inline + */ + mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2, + MLX5_OPCODE_ENHANCED_MPSW, olx); + mlx5_tx_eseg_none(txq, loc, loc->wqe_last, + olx & ~MLX5_TXOFF_CONFIG_VLAN); + eseg = &loc->wqe_last->eseg; + dseg = &loc->wqe_last->dseg[0]; + loop = part; + /* Store the packet length for legacy MPW. */ + if (MLX5_TXOFF_CONFIG(MPW)) + eseg->mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); + for (;;) { + uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. 
*/ + slen += dlen; +#endif + mlx5_tx_dseg_ptr + (txq, loc, dseg, + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + dlen, olx); + if (unlikely(--loop == 0)) + break; + loc->mbuf = *pkts++; + if (likely(loop > 1)) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + /* + * Unroll the completion code to avoid + * returning variable value - it results in + * unoptimized sequent checking in caller. + */ + if (ret == MLX5_TXCMP_CODE_MULTI) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_MULTI; + } + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (ret == MLX5_TXCMP_CODE_TSO) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_TSO; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_SINGLE; + } + if (ret != MLX5_TXCMP_CODE_EMPW) { + MLX5_ASSERT(false); + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* + * Check whether packet parameters coincide + * within assumed eMPW batch: + * - check sum settings + * - metadata value + * - software parser settings + * - packets length (legacy MPW only) + */ + if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { + MLX5_ASSERT(loop); + part -= loop; + mlx5_tx_sdone_empw(txq, loc, part, slen, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + pkts_n -= part; + goto next_empw; + } + /* Packet attributes match, continue the same eMPW. */ + ++dseg; + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + } + /* eMPW is built successfully, update loop parameters. */ + MLX5_ASSERT(!loop); + MLX5_ASSERT(pkts_n >= part); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += slen; +#endif + loc->elts_free -= part; + loc->pkts_sent += part; + txq->wqe_ci += (2 + part + 3) / 4; + loc->wqe_free -= (2 + part + 3) / 4; + pkts_n -= part; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + if (unlikely(ret != MLX5_TXCMP_CODE_EMPW)) + return ret; + /* Continue sending eMPW batches. */ + } + MLX5_ASSERT(false); +} + +/** + * The routine sends packets with MLX5_OPCODE_EMPW + * with inlining, optionally supports VLAN insertion. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with eMPW opcode + * with data inlining. 
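+	 * Each eMPW session is opened with a title WQEBB carrying the
+	 * Control and Ethernet Segments, followed by inline and/or
+	 * pointer Data Segments; the DS count in the Control Segment
+	 * is fixed up by mlx5_tx_idone_empw() when the session closes.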
+ */ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe *restrict wqem; + enum mlx5_txcmp_code ret; + unsigned int room, part, nlim; + unsigned int slen = 0; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + /* + * Limits the amount of packets in one WQE + * to improve CQE latency generation. + */ + nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_INLINE_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); + /* Check whether we have minimal amount WQEs */ + if (unlikely(loc->wqe_free < + ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) + return MLX5_TXCMP_CODE_EXIT; + if (likely(pkts_n > 1)) + rte_prefetch0(*pkts); + wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m); + /* + * Build eMPW title WQEBB: + * - Control Segment, eMPW opcode, zero DS + * - Ethernet Segment, no inline + */ + mlx5_tx_cseg_init(txq, loc, wqem, 0, + MLX5_OPCODE_ENHANCED_MPSW, olx); + mlx5_tx_eseg_none(txq, loc, wqem, + olx & ~MLX5_TXOFF_CONFIG_VLAN); + dseg = &wqem->dseg[0]; + /* Store the packet length for legacy MPW. */ + if (MLX5_TXOFF_CONFIG(MPW)) + wqem->eseg.mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); + room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, + loc->wqe_free) * MLX5_WQE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE; + /* Limit the room for legacy MPW sessions for performance. */ + if (MLX5_TXOFF_CONFIG(MPW)) + room = RTE_MIN(room, + RTE_MAX(txq->inlen_empw + + sizeof(dseg->bcount) + + (MLX5_TXOFF_CONFIG(VLAN) ? + sizeof(struct rte_vlan_hdr) : 0), + MLX5_MPW_INLINE_MAX_PACKETS * + MLX5_WQE_DSEG_SIZE)); + /* Build WQE till we have space, packets and resources. */ + part = room; + for (;;) { + uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); + uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); + unsigned int tlen; + + MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); + MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); + MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); + /* + * Some Tx offloads may cause an error if + * packet is not long enough, check against + * assumed minimal length. + */ + if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) { + part -= room; + if (unlikely(!part)) + return MLX5_TXCMP_CODE_ERROR; + /* + * We have some successfully built + * packet Data Segments to send. + */ + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* Inline or not inline - that's the Question. */ + if (dlen > txq->inlen_empw || + loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) + goto pointer_empw; + if (MLX5_TXOFF_CONFIG(MPW)) { + if (dlen > txq->inlen_send) + goto pointer_empw; + tlen = dlen; + if (part == room) { + /* Open new inline MPW session. */ + tlen += sizeof(dseg->bcount); + dseg->bcount = RTE_BE32(0); + dseg = RTE_PTR_ADD + (dseg, sizeof(dseg->bcount)); + } else { + /* + * No pointer and inline descriptor + * intermix for legacy MPW sessions. + */ + if (wqem->dseg[0].bcount) + break; + } + } else { + tlen = sizeof(dseg->bcount) + dlen; + } + /* Inline entire packet, optional VLAN insertion. */ + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + /* + * The packet length must be checked in + * mlx5_tx_able_to_empw() and packet + * fits into inline length guaranteed. 
+ */ + MLX5_ASSERT((dlen + + sizeof(struct rte_vlan_hdr)) <= + txq->inlen_empw); + tlen += sizeof(struct rte_vlan_hdr); + if (room < tlen) + break; + dseg = mlx5_tx_dseg_vlan(txq, loc, dseg, + dptr, dlen, olx); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + slen += sizeof(struct rte_vlan_hdr); +#endif + } else { + if (room < tlen) + break; + dseg = mlx5_tx_dseg_empw(txq, loc, dseg, + dptr, dlen, olx); + } + if (!MLX5_TXOFF_CONFIG(MPW)) + tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); + MLX5_ASSERT(room >= tlen); + room -= tlen; + /* + * Packet data are completely inlined, + * free the packet immediately. + */ + rte_pktmbuf_free_seg(loc->mbuf); + goto next_mbuf; +pointer_empw: + /* + * No pointer and inline descriptor + * intermix for legacy MPW sessions. + */ + if (MLX5_TXOFF_CONFIG(MPW) && + part != room && + wqem->dseg[0].bcount == RTE_BE32(0)) + break; + /* + * Not inlinable VLAN packets are + * proceeded outside of this routine. + */ + MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); + if (MLX5_TXOFF_CONFIG(VLAN)) + MLX5_ASSERT(!(loc->mbuf->ol_flags & + PKT_TX_VLAN_PKT)); + mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx); + /* We have to store mbuf in elts.*/ + txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; + room -= MLX5_WQE_DSEG_SIZE; + /* Ring buffer wraparound is checked at the loop end.*/ + ++dseg; +next_mbuf: +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + slen += dlen; +#endif + loc->pkts_sent++; + loc->elts_free--; + pkts_n--; + if (unlikely(!pkts_n || !loc->elts_free)) { + /* + * We have no resources/packets to + * continue build descriptors. + */ + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_EXIT; + } + loc->mbuf = *pkts++; + if (likely(pkts_n > 1)) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + /* + * Unroll the completion code to avoid + * returning variable value - it results in + * unoptimized sequent checking in caller. + */ + if (ret == MLX5_TXCMP_CODE_MULTI) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_MULTI; + } + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (ret == MLX5_TXCMP_CODE_TSO) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_TSO; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + return MLX5_TXCMP_CODE_SINGLE; + } + if (ret != MLX5_TXCMP_CODE_EMPW) { + MLX5_ASSERT(false); + part -= room; + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); + return MLX5_TXCMP_CODE_ERROR; + } + /* Check if we have minimal room left. */ + nlim--; + if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) + break; + /* + * Check whether packet parameters coincide + * within assumed eMPW batch: + * - check sum settings + * - metadata value + * - software parser settings + * - packets length (legacy MPW only) + */ + if (!mlx5_tx_match_empw(txq, &wqem->eseg, + loc, dlen, olx)) + break; + /* Packet attributes match, continue the same eMPW. */ + if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) + dseg = (struct mlx5_wqe_dseg *)txq->wqes; + } + /* + * We get here to close an existing eMPW + * session and start the new one. 
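
The inline-or-pointer decision and the room accounting in the loop above can be
condensed as follows. This is an illustrative sketch only: the 16-byte segment
size, the 4-byte byte-count header and the padding of inline data to 16-byte
units are assumed from the surrounding code, and the legacy MPW session rules
and VLAN insertion are left out.

#include <stdint.h>
#include <stdio.h>

#define WSEG_SIZE       16u     /* assumed MLX5_WSEG_SIZE */
#define DSEG_SIZE       16u     /* assumed MLX5_WQE_DSEG_SIZE */
#define BCOUNT_SIZE     4u      /* assumed sizeof(dseg->bcount) */
#define ALIGN_UP(v, a)  (((v) + (a) - 1) / (a) * (a))

/* Return how many bytes of WQE "room" one packet consumes. */
static unsigned int
empw_room_cost(uint32_t dlen, uint32_t inlen_empw, int noinline_hint)
{
        if (noinline_hint || dlen > inlen_empw)
                return DSEG_SIZE;       /* pointer Data Segment */
        /* Inline descriptor: byte count + data, padded to WSEG units. */
        return ALIGN_UP(BCOUNT_SIZE + dlen, WSEG_SIZE);
}

int
main(void)
{
        printf("64B packet, inlined   : %u bytes of room\n",
               empw_room_cost(64, 256, 0));
        printf("64B packet, no-inline : %u bytes of room\n",
               empw_room_cost(64, 256, 1));
        printf("1500B packet          : %u bytes of room\n",
               empw_room_cost(1500, 256, 0));
        return 0;
}
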
+ */ + MLX5_ASSERT(pkts_n); + part -= room; + if (unlikely(!part)) + return MLX5_TXCMP_CODE_EXIT; + mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx); + if (unlikely(!loc->elts_free || + !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + /* Continue the loop with new eMPW session. */ + } + MLX5_ASSERT(false); +} + +/** + * The routine sends packets with ordinary MLX5_OPCODE_SEND. + * Data inlining and VLAN insertion are supported. + */ +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + /* + * Subroutine is the part of mlx5_tx_burst_single() + * and sends single-segment packet with SEND opcode. + */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); + pkts += loc->pkts_sent + 1; + pkts_n -= loc->pkts_sent; + for (;;) { + struct mlx5_wqe *restrict wqe; + enum mlx5_txcmp_code ret; + + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (MLX5_TXOFF_CONFIG(INLINE)) { + unsigned int inlen, vlan = 0; + + inlen = rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + vlan = sizeof(struct rte_vlan_hdr); + inlen += vlan; + static_assert((sizeof(struct rte_vlan_hdr) + + sizeof(struct rte_ether_hdr)) == + MLX5_ESEG_MIN_INLINE_SIZE, + "invalid min inline data size"); + } + /* + * If inlining is enabled at configuration time + * the limit must be not less than minimal size. + * Otherwise we would do extra check for data + * size to avoid crashes due to length overflow. + */ + MLX5_ASSERT(txq->inlen_send >= + MLX5_ESEG_MIN_INLINE_SIZE); + if (inlen <= txq->inlen_send) { + unsigned int seg_n, wqe_n; + + rte_prefetch0(rte_pktmbuf_mtod + (loc->mbuf, uint8_t *)); + /* Check against minimal length. */ + if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) + return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & + PKT_TX_DYNF_NOINLINE) { + /* + * The hint flag not to inline packet + * data is set. Check whether we can + * follow the hint. + */ + if ((!MLX5_TXOFF_CONFIG(EMPW) && + txq->inlen_mode) || + (MLX5_TXOFF_CONFIG(MPW) && + txq->inlen_mode)) { + /* + * The hardware requires the + * minimal inline data header. + */ + goto single_min_inline; + } + if (MLX5_TXOFF_CONFIG(VLAN) && + vlan && !txq->vlan_en) { + /* + * We must insert VLAN tag + * by software means. + */ + goto single_part_inline; + } + goto single_no_inline; + } + /* + * Completely inlined packet data WQE: + * - Control Segment, SEND opcode + * - Ethernet Segment, no VLAN insertion + * - Data inlined, VLAN optionally inserted + * - Alignment to MLX5_WSEG_SIZE + * Have to estimate amount of WQEBBs + */ + seg_n = (inlen + 3 * MLX5_WSEG_SIZE - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + /* Check if there are enough WQEBBs. */ + wqe_n = (seg_n + 3) / 4; + if (wqe_n > loc->wqe_free) + return MLX5_TXCMP_CODE_EXIT; + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, seg_n, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_data(txq, loc, wqe, + vlan, inlen, 0, olx); + txq->wqe_ci += wqe_n; + loc->wqe_free -= wqe_n; + /* + * Packet data are completely inlined, + * free the packet immediately. 
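
A worked example of the seg_n/wqe_n estimate above for a completely inlined
SEND WQE. The 18-byte minimal Ethernet Segment inline size follows from the
static_assert earlier in this routine (Ethernet plus VLAN header); the 16-byte
WSEG and the 4-WSEG WQEBB are assumed constants mirroring mlx5_prm.h.

#include <stdio.h>

#define WSEG_SIZE       16u     /* assumed MLX5_WSEG_SIZE */
#define ESEG_MIN_INLINE 18u     /* Ethernet + VLAN header */

int
main(void)
{
        unsigned int inlen = 128;       /* bytes to inline, VLAN included */
        unsigned int seg_n, wqe_n;

        /* Same formula as above: CSEG + ESEG + inlined data + padding. */
        seg_n = (inlen + 3 * WSEG_SIZE - ESEG_MIN_INLINE + WSEG_SIZE - 1) /
                WSEG_SIZE;
        wqe_n = (seg_n + 3) / 4;        /* 4 WSEGs per WQEBB */
        printf("inlen=%u -> seg_n=%u -> wqe_n=%u WQEBB(s)\n",
               inlen, seg_n, wqe_n);
        return 0;
}

A fully inlined 128-byte packet thus needs 10 WSEGs, i.e. 3 WQEBBs, which is
what the routine checks against loc->wqe_free before building the WQE.
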
+ */ + rte_pktmbuf_free_seg(loc->mbuf); + } else if ((!MLX5_TXOFF_CONFIG(EMPW) || + MLX5_TXOFF_CONFIG(MPW)) && + txq->inlen_mode) { + /* + * If minimal inlining is requested the eMPW + * feature should be disabled due to data is + * inlined into Ethernet Segment, which can + * not contain inlined data for eMPW due to + * segment shared for all packets. + */ + struct mlx5_wqe_dseg *restrict dseg; + unsigned int ds; + uint8_t *dptr; + + /* + * The inline-mode settings require + * to inline the specified amount of + * data bytes to the Ethernet Segment. + * We should check the free space in + * WQE ring buffer to inline partially. + */ +single_min_inline: + MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); + MLX5_ASSERT(inlen > txq->inlen_mode); + MLX5_ASSERT(txq->inlen_mode >= + MLX5_ESEG_MIN_INLINE_SIZE); + /* + * Check whether there are enough free WQEBBs: + * - Control Segment + * - Ethernet Segment + * - First Segment of inlined Ethernet data + * - ... data continued ... + * - Finishing Data Segment of pointer type + */ + ds = (MLX5_WQE_CSEG_SIZE + + MLX5_WQE_ESEG_SIZE + + MLX5_WQE_DSEG_SIZE + + txq->inlen_mode - + MLX5_ESEG_MIN_INLINE_SIZE + + MLX5_WQE_DSEG_SIZE + + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE; + if (loc->wqe_free < ((ds + 3) / 4)) + return MLX5_TXCMP_CODE_EXIT; + /* + * Build the ordinary SEND WQE: + * - Control Segment + * - Ethernet Segment, inline inlen_mode bytes + * - Data Segment of pointer type + */ + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, ds, + MLX5_OPCODE_SEND, olx); + dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, + txq->inlen_mode, + 0, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + + txq->inlen_mode - vlan; + inlen -= txq->inlen_mode; + mlx5_tx_dseg_ptr(txq, loc, dseg, + dptr, inlen, olx); + /* + * WQE is built, update the loop parameters + * and got to the next packet. + */ + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; + /* We have to store mbuf in elts.*/ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + txq->elts[txq->elts_head++ & txq->elts_m] = + loc->mbuf; + --loc->elts_free; + } else { + uint8_t *dptr; + unsigned int dlen; + + /* + * Partially inlined packet data WQE, we have + * some space in title WQEBB, we can fill it + * with some packet data. It takes one WQEBB, + * it is available, no extra space check: + * - Control Segment, SEND opcode + * - Ethernet Segment, no VLAN insertion + * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data + * - Data Segment, pointer type + * + * We also get here if VLAN insertion is not + * supported by HW, the inline is enabled. + */ +single_part_inline: + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 4, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx); + dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + + MLX5_ESEG_MIN_INLINE_SIZE - vlan; + /* + * The length check is performed above, by + * comparing with txq->inlen_send. We should + * not get overflow here. + */ + MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); + dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; + mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], + dptr, dlen, olx); + ++txq->wqe_ci; + --loc->wqe_free; + /* We have to store mbuf in elts.*/ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + txq->elts[txq->elts_head++ & txq->elts_m] = + loc->mbuf; + --loc->elts_free; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. 
*/ + txq->stats.obytes += vlan + + rte_pktmbuf_data_len(loc->mbuf); +#endif + } else { + /* + * No inline at all, it means the CPU cycles saving + * is prioritized at configuration, we should not + * copy any packet data to WQE. + * + * SEND WQE, one WQEBB: + * - Control Segment, SEND opcode + * - Ethernet Segment, optional VLAN, no inline + * - Data Segment, pointer type + */ +single_no_inline: + wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); + loc->wqe_last = wqe; + mlx5_tx_cseg_init(txq, loc, wqe, 3, + MLX5_OPCODE_SEND, olx); + mlx5_tx_eseg_none(txq, loc, wqe, olx); + mlx5_tx_dseg_ptr + (txq, loc, &wqe->dseg[0], + rte_pktmbuf_mtod(loc->mbuf, uint8_t *), + rte_pktmbuf_data_len(loc->mbuf), olx); + ++txq->wqe_ci; + --loc->wqe_free; + /* + * We should not store mbuf pointer in elts + * if no inlining is configured, this is done + * by calling routine in a batch copy. + */ + MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); + --loc->elts_free; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf); + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) + txq->stats.obytes += + sizeof(struct rte_vlan_hdr); +#endif + } + ++loc->pkts_sent; + --pkts_n; + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; + if (pkts_n > 1) + rte_prefetch0(*pkts); + ret = mlx5_tx_able_to_empw(txq, loc, olx, true); + if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE)) + return ret; + } + MLX5_ASSERT(false); +} + +static __rte_always_inline enum mlx5_txcmp_code +mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + unsigned int pkts_n, + struct mlx5_txq_local *restrict loc, + unsigned int olx) +{ + enum mlx5_txcmp_code ret; + + ret = mlx5_tx_able_to_empw(txq, loc, olx, false); + if (ret == MLX5_TXCMP_CODE_SINGLE) + goto ordinary_send; + MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW); + for (;;) { + /* Optimize for inline/no inline eMPW send. */ + ret = (MLX5_TXOFF_CONFIG(INLINE)) ? + mlx5_tx_burst_empw_inline + (txq, pkts, pkts_n, loc, olx) : + mlx5_tx_burst_empw_simple + (txq, pkts, pkts_n, loc, olx); + if (ret != MLX5_TXCMP_CODE_SINGLE) + return ret; + /* The resources to send one packet should remain. */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); +ordinary_send: + ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx); + MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE); + if (ret != MLX5_TXCMP_CODE_EMPW) + return ret; + /* The resources to send one packet should remain. */ + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + } +} + +/** + * DPDK Tx callback template. This is configured template + * used to generate routines optimized for specified offload setup. + * One of this generated functions is chosen at SQ configuration + * time. + * + * @param txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * @param olx + * Configured offloads mask, presents the bits of MLX5_TXOFF_CONFIG_xxx + * values. Should be static to take compile time static configuration + * advantages. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). 
+ */ +static __rte_always_inline uint16_t +mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, + struct rte_mbuf **restrict pkts, + uint16_t pkts_n, + unsigned int olx) +{ + struct mlx5_txq_local loc; + enum mlx5_txcmp_code ret; + unsigned int part; + + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (unlikely(!pkts_n)) + return 0; + loc.pkts_sent = 0; + loc.pkts_copy = 0; + loc.wqe_last = NULL; + +send_loop: + loc.pkts_loop = loc.pkts_sent; + /* + * Check if there are some CQEs, if any: + * - process an encountered errors + * - process the completed WQEs + * - free related mbufs + * - doorbell the NIC about processed CQEs + */ + rte_prefetch0(*(pkts + loc.pkts_sent)); + mlx5_tx_handle_completion(txq, olx); + /* + * Calculate the number of available resources - elts and WQEs. + * There are two possible different scenarios: + * - no data inlining into WQEs, one WQEBB may contains up to + * four packets, in this case elts become scarce resource + * - data inlining into WQEs, one packet may require multiple + * WQEBBs, the WQEs become the limiting factor. + */ + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + loc.elts_free = txq->elts_s - + (uint16_t)(txq->elts_head - txq->elts_tail); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + loc.wqe_free = txq->wqe_s - + (uint16_t)(txq->wqe_ci - txq->wqe_pi); + if (unlikely(!loc.elts_free || !loc.wqe_free)) + goto burst_exit; + for (;;) { + /* + * Fetch the packet from array. Usually this is + * the first packet in series of multi/single + * segment packets. + */ + loc.mbuf = *(pkts + loc.pkts_sent); + /* Dedicated branch for multi-segment packets. */ + if (MLX5_TXOFF_CONFIG(MULTI) && + unlikely(NB_SEGS(loc.mbuf) > 1)) { + /* + * Multi-segment packet encountered. + * Hardware is able to process it only + * with SEND/TSO opcodes, one packet + * per WQE, do it in dedicated routine. + */ +enter_send_multi: + MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy); + part = loc.pkts_sent - loc.pkts_copy; + if (!MLX5_TXOFF_CONFIG(INLINE) && part) { + /* + * There are some single-segment mbufs not + * stored in elts. The mbufs must be in the + * same order as WQEs, so we must copy the + * mbufs to elts here, before the coming + * multi-segment packet mbufs is appended. + */ + mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, + part, olx); + loc.pkts_copy = loc.pkts_sent; + } + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx); + if (!MLX5_TXOFF_CONFIG(INLINE)) + loc.pkts_copy = loc.pkts_sent; + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) { + /* + * The routine returns this code when + * all packets are sent or there is no + * enough resources to complete request. + */ + break; + } + if (ret == MLX5_TXCMP_CODE_ERROR) { + /* + * The routine returns this code when + * some error in the incoming packets + * format occurred. + */ + txq->stats.oerrors++; + break; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) { + /* + * The single-segment packet was encountered + * in the array, try to send it with the + * best optimized way, possible engaging eMPW. + */ + goto enter_send_single; + } + if (MLX5_TXOFF_CONFIG(TSO) && + ret == MLX5_TXCMP_CODE_TSO) { + /* + * The single-segment TSO packet was + * encountered in the array. + */ + goto enter_send_tso; + } + /* We must not get here. Something is going wrong. 
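
The elts_free/wqe_free computations above rely on free-running 16-bit indices:
subtracting the consumer index from the producer index yields the in-flight
count even across counter wraparound. A minimal sketch of that idiom with
example values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint16_t elts_s = 512;          /* ring size */
        uint16_t elts_tail = 65530;     /* consumer, about to wrap */
        uint16_t elts_head = 65533;     /* producer */
        unsigned int used, avail;

        used = (uint16_t)(elts_head - elts_tail);
        avail = elts_s - used;
        printf("before wrap: used=%u free=%u\n", used, avail);
        elts_head += 10;                /* producer wraps around to 7 */
        used = (uint16_t)(elts_head - elts_tail);
        avail = elts_s - used;
        printf("after wrap:  used=%u free=%u\n", used, avail);
        return 0;
}
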
*/ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* Dedicated branch for single-segment TSO packets. */ + if (MLX5_TXOFF_CONFIG(TSO) && + unlikely(loc.mbuf->ol_flags & PKT_TX_TCP_SEG)) { + /* + * TSO might require special way for inlining + * (dedicated parameters) and is sent with + * MLX5_OPCODE_TSO opcode only, provide this + * in dedicated branch. + */ +enter_send_tso: + MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1); + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx); + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) + break; + if (ret == MLX5_TXCMP_CODE_ERROR) { + txq->stats.oerrors++; + break; + } + if (ret == MLX5_TXCMP_CODE_SINGLE) + goto enter_send_single; + if (MLX5_TXOFF_CONFIG(MULTI) && + ret == MLX5_TXCMP_CODE_MULTI) { + /* + * The multi-segment packet was + * encountered in the array. + */ + goto enter_send_multi; + } + /* We must not get here. Something is going wrong. */ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* + * The dedicated branch for the single-segment packets + * without TSO. Often these ones can be sent using + * MLX5_OPCODE_EMPW with multiple packets in one WQE. + * The routine builds the WQEs till it encounters + * the TSO or multi-segment packet (in case if these + * offloads are requested at SQ configuration time). + */ +enter_send_single: + MLX5_ASSERT(pkts_n > loc.pkts_sent); + ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx); + /* + * These returned code checks are supposed + * to be optimized out due to routine inlining. + */ + if (ret == MLX5_TXCMP_CODE_EXIT) + break; + if (ret == MLX5_TXCMP_CODE_ERROR) { + txq->stats.oerrors++; + break; + } + if (MLX5_TXOFF_CONFIG(MULTI) && + ret == MLX5_TXCMP_CODE_MULTI) { + /* + * The multi-segment packet was + * encountered in the array. + */ + goto enter_send_multi; + } + if (MLX5_TXOFF_CONFIG(TSO) && + ret == MLX5_TXCMP_CODE_TSO) { + /* + * The single-segment TSO packet was + * encountered in the array. + */ + goto enter_send_tso; + } + /* We must not get here. Something is going wrong. */ + MLX5_ASSERT(false); + txq->stats.oerrors++; + break; + } + /* + * Main Tx loop is completed, do the rest: + * - set completion request if thresholds are reached + * - doorbell the hardware + * - copy the rest of mbufs to elts (if any) + */ + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) || + loc.pkts_sent >= loc.pkts_copy); + /* Take a shortcut if nothing is sent. */ + if (unlikely(loc.pkts_sent == loc.pkts_loop)) + goto burst_exit; + /* Request CQE generation if limits are reached. */ + mlx5_tx_request_completion(txq, &loc, olx); + /* + * Ring QP doorbell immediately after WQE building completion + * to improve latencies. The pure software related data treatment + * can be completed after doorbell. Tx CQEs for this SQ are + * processed in this thread only by the polling. + * + * The rdma core library can map doorbell register in two ways, + * depending on the environment variable "MLX5_SHUT_UP_BF": + * + * - as regular cached memory, the variable is either missing or + * set to zero. This type of mapping may cause the significant + * doorbell register writing latency and requires explicit + * memory write barrier to mitigate this issue and prevent + * write combining. + * + * - as non-cached memory, the variable is present and set to + * not "0" value. 
This type of mapping may cause performance + * impact under heavy loading conditions but the explicit write + * memory barrier is not required and it may improve core + * performance. + * + * - the legacy behaviour (prior 19.08 release) was to use some + * heuristics to decide whether write memory barrier should + * be performed. This behavior is supported with specifying + * tx_db_nc=2, write barrier is skipped if application + * provides the full recommended burst of packets, it + * supposes the next packets are coming and the write barrier + * will be issued on the next burst (after descriptor writing, + * at least). + */ + mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && + (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); + /* Not all of the mbufs may be stored into elts yet. */ + part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; + if (!MLX5_TXOFF_CONFIG(INLINE) && part) { + /* + * There are some single-segment mbufs not stored in elts. + * It can be only if the last packet was single-segment. + * The copying is gathered into one place due to it is + * a good opportunity to optimize that with SIMD. + * Unfortunately if inlining is enabled the gaps in + * pointer array may happen due to early freeing of the + * inlined mbufs. + */ + mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); + loc.pkts_copy = loc.pkts_sent; + } + MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (pkts_n > loc.pkts_sent) { + /* + * If burst size is large there might be no enough CQE + * fetched from completion queue and no enough resources + * freed to send all the packets. + */ + goto send_loop; + } +burst_exit: +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += loc.pkts_sent; +#endif + return loc.pkts_sent; +} + +/* Generate routines with Enhanced Multi-Packet Write support. 
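
A small sketch of the barrier policy encoded in the mlx5_tx_dbrec_cond_wmb()
condition above. MLX5_TX_DEFAULT_BURST is assumed here to be 64 packets; db_nc
reflects the non-cached doorbell mapping and db_heu the heuristic behaviour
described in the comment.

#include <stdint.h>
#include <stdio.h>

#define TX_DEFAULT_BURST 64u    /* assumed MLX5_TX_DEFAULT_BURST */

/* Return non-zero when an extra flushing write barrier must follow the
 * BlueFlame copy, mirroring the condition passed above. */
static int
need_flush_barrier(int db_nc, int db_heu, uint16_t pkts_n)
{
        if (db_nc)              /* non-cached mapping: no barrier needed */
                return 0;
        if (db_heu && (pkts_n % TX_DEFAULT_BURST) == 0)
                return 0;       /* heuristic: full burst, skip the barrier */
        return 1;
}

int
main(void)
{
        printf("cached map, heuristic, 64-pkt burst: %d\n",
               need_flush_barrier(0, 1, 64));
        printf("cached map, heuristic, 17-pkt burst: %d\n",
               need_flush_barrier(0, 1, 17));
        printf("non-cached map                     : %d\n",
               need_flush_barrier(1, 0, 17));
        return 0;
}
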
*/ +MLX5_TXOFF_DECL(full_empw, + MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(none_empw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(md_empw, + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mt_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtsc_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mti_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(mtiv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sc_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sci_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(scv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(sciv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(i_empw, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(v_empw, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_DECL(iv_empw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +/* Generate routines without Enhanced Multi-Packet Write support. 
*/ +MLX5_TXOFF_DECL(full, + MLX5_TXOFF_CONFIG_FULL) + +MLX5_TXOFF_DECL(none, + MLX5_TXOFF_CONFIG_NONE) + +MLX5_TXOFF_DECL(md, + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mt, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mtsc, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(mti, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(mtv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(mtiv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(sc, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(sci, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(scv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + + +MLX5_TXOFF_DECL(sciv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(i, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(v, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_DECL(iv, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +/* + * Generate routines with Legacy Multi-Packet Write support. + * This mode is supported by ConnectX-4 Lx only and imposes + * offload limitations, not supported: + * - ACL/Flows (metadata are becoming meaningless) + * - WQE Inline headers + * - SRIOV (E-Switch offloads) + * - VLAN insertion + * - tunnel encapsulation/decapsulation + * - TSO + */ +MLX5_TXOFF_DECL(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +/* + * Array of declared and compiled Tx burst function and corresponding + * supported offloads set. The array is used to select the Tx burst + * function for specified offloads set at Tx queue configuration time. 
+ */ +const struct { + eth_tx_burst_t func; + unsigned int olx; +} txoff_func[] = { +MLX5_TXOFF_INFO(full_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(none_empw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(md_empw, + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mt_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtsc_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mti_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtiv_empw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sc_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sci_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(scv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(sciv_empw, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(i_empw, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(v_empw, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(iv_empw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(full, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none, + MLX5_TXOFF_CONFIG_NONE) + +MLX5_TXOFF_INFO(md, + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mt, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtsc, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mti, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(mtiv, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sc, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sci, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | + 
MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(scv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(sciv, + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(i, + MLX5_TXOFF_CONFIG_INLINE | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(v, + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(iv, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) +}; + +/** + * Configure the Tx function to use. The routine checks configured + * Tx offloads for the device and selects appropriate Tx burst + * routine. There are multiple Tx burst routines compiled from + * the same template in the most optimal way for the dedicated + * Tx offloads set. + * + * @param dev + * Pointer to private data structure. + * + * @return + * Pointer to selected Tx burst function. + */ +eth_tx_burst_t +mlx5_select_tx_function(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; + unsigned int diff = 0, olx = 0, i, m; + + static_assert(MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE <= + MLX5_DSEG_MAX, "invalid WQE max size"); + static_assert(MLX5_WQE_CSEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Control Segment size"); + static_assert(MLX5_WQE_ESEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Ethernet Segment size"); + static_assert(MLX5_WQE_DSEG_SIZE == MLX5_WSEG_SIZE, + "invalid WQE Data Segment size"); + static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, + "invalid WQE size"); + MLX5_ASSERT(priv); + if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { + /* We should support Multi-Segment Packets. */ + olx |= MLX5_TXOFF_CONFIG_MULTI; + } + if (tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)) { + /* We should support TCP Send Offload. */ + olx |= MLX5_TXOFF_CONFIG_TSO; + } + if (tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { + /* We should support Software Parser for Tunnels. */ + olx |= MLX5_TXOFF_CONFIG_SWP; + } + if (tx_offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)) { + /* We should support IP/TCP/UDP Checksums. */ + olx |= MLX5_TXOFF_CONFIG_CSUM; + } + if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT) { + /* We should support VLAN insertion. */ + olx |= MLX5_TXOFF_CONFIG_VLAN; + } + if (priv->txqs_n && (*priv->txqs)[0]) { + struct mlx5_txq_data *txd = (*priv->txqs)[0]; + + if (txd->inlen_send) { + /* + * Check the data inline requirements. Data inline + * is enabled on per device basis, we can check + * the first Tx queue only. 
+ * + * If device does not support VLAN insertion in WQE + * and some queues are requested to perform VLAN + * insertion offload than inline must be enabled. + */ + olx |= MLX5_TXOFF_CONFIG_INLINE; + } + } + if (config->mps == MLX5_MPW_ENHANCED && + config->txq_inline_min <= 0) { + /* + * The NIC supports Enhanced Multi-Packet Write + * and does not require minimal inline data. + */ + olx |= MLX5_TXOFF_CONFIG_EMPW; + } + if (rte_flow_dynf_metadata_avail()) { + /* We should support Flow metadata. */ + olx |= MLX5_TXOFF_CONFIG_METADATA; + } + if (config->mps == MLX5_MPW) { + /* + * The NIC supports Legacy Multi-Packet Write. + * The MLX5_TXOFF_CONFIG_MPW controls the + * descriptor building method in combination + * with MLX5_TXOFF_CONFIG_EMPW. + */ + if (!(olx & (MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA))) + olx |= MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW; + } + /* + * Scan the routines table to find the minimal + * satisfying routine with requested offloads. + */ + m = RTE_DIM(txoff_func); + for (i = 0; i < RTE_DIM(txoff_func); i++) { + unsigned int tmp; + + tmp = txoff_func[i].olx; + if (tmp == olx) { + /* Meets requested offloads exactly.*/ + m = i; + break; + } + if ((tmp & olx) != olx) { + /* Does not meet requested offloads at all. */ + continue; + } + if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW) + /* Do not enable eMPW if not configured. */ + continue; + if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE) + /* Do not enable inlining if not configured. */ + continue; + /* + * Some routine meets the requirements. + * Check whether it has minimal amount + * of not requested offloads. + */ + tmp = __builtin_popcountl(tmp & ~olx); + if (m >= RTE_DIM(txoff_func) || tmp < diff) { + /* First or better match, save and continue. */ + m = i; + diff = tmp; + continue; + } + if (tmp == diff) { + tmp = txoff_func[i].olx ^ txoff_func[m].olx; + if (__builtin_ffsl(txoff_func[i].olx & ~tmp) < + __builtin_ffsl(txoff_func[m].olx & ~tmp)) { + /* Lighter not requested offload. */ + m = i; + } + } + } + if (m >= RTE_DIM(txoff_func)) { + DRV_LOG(DEBUG, "port %u has no selected Tx function" + " for requested offloads %04X", + dev->data->port_id, olx); + return NULL; + } + DRV_LOG(DEBUG, "port %u has selected Tx function" + " supporting offloads %04X/%04X", + dev->data->port_id, olx, txoff_func[m].olx); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MULTI) + DRV_LOG(DEBUG, "\tMULTI (multi segment)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TSO) + DRV_LOG(DEBUG, "\tTSO (TCP send offload)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_SWP) + DRV_LOG(DEBUG, "\tSWP (software parser)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_CSUM) + DRV_LOG(DEBUG, "\tCSUM (checksum offload)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_INLINE) + DRV_LOG(DEBUG, "\tINLIN (inline data)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_VLAN) + DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) + DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) { + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW) + DRV_LOG(DEBUG, "\tMPW (Legacy MPW)"); + else + DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); + } + return txoff_func[m].func; +} + +/** + * DPDK callback to get the TX queue information + * + * @param dev + * Pointer to the device structure. + * + * @param tx_queue_id + * Tx queue identificator. + * + * @param qinfo + * Pointer to the TX queue information structure. + * + * @return + * None. 
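
The selection loop above boils down to a best-superset search: an exact match
of the requested offload bits wins immediately, otherwise the candidate routine
with the fewest not-requested offload bits is chosen. A simplified standalone
sketch of that search; the eMPW/inline opt-out rules and the tie-breaking on
the lowest extra bit are omitted, and the candidate table and bit values below
are hypothetical.

#include <stdio.h>

struct cand {
        const char *name;
        unsigned int olx;       /* supported offload bits */
};

static int
select_func(const struct cand *tbl, int n, unsigned int olx)
{
        int i, best = -1, best_extra = 0;

        for (i = 0; i < n; i++) {
                int extra;

                if (tbl[i].olx == olx)
                        return i;               /* exact match */
                if ((tbl[i].olx & olx) != olx)
                        continue;               /* misses a requested bit */
                extra = __builtin_popcount(tbl[i].olx & ~olx);
                if (best < 0 || extra < best_extra) {
                        best = i;
                        best_extra = extra;
                }
        }
        return best;
}

int
main(void)
{
        static const struct cand tbl[] = {
                { "full", 0x3f }, { "mt", 0x03 }, { "md", 0x20 }, { "none", 0x0 },
        };
        int m = select_func(tbl, 4, 0x21);      /* hypothetical request */

        printf("selected: %s\n", m >= 0 ? tbl[m].name : "(none)");
        return 0;
}
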
+ */ + +void +mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id, + struct rte_eth_txq_info *qinfo) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + + if (!txq) + return; + qinfo->nb_desc = txq->elts_s; + qinfo->conf.tx_thresh.pthresh = 0; + qinfo->conf.tx_thresh.hthresh = 0; + qinfo->conf.tx_thresh.wthresh = 0; + qinfo->conf.tx_rs_thresh = 0; + qinfo->conf.tx_free_thresh = 0; + qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1; + qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads; +} + +/** + * DPDK callback to get the TX packet burst mode information + * + * @param dev + * Pointer to the device structure. + * + * @param tx_queue_id + * Tx queue identificatior. + * + * @param mode + * Pointer to the burts mode information. + * + * @return + * 0 as success, -EINVAL as failure. + */ + +int +mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, + uint16_t tx_queue_id __rte_unused, + struct rte_eth_burst_mode *mode) +{ + eth_tx_burst_t pkt_burst = dev->tx_pkt_burst; + unsigned int i, olx; + + for (i = 0; i < RTE_DIM(txoff_func); i++) { + if (pkt_burst == txoff_func[i].func) { + olx = txoff_func[i].olx; + snprintf(mode->info, sizeof(mode->info), + "%s%s%s%s%s%s%s%s", + (olx & MLX5_TXOFF_CONFIG_EMPW) ? + ((olx & MLX5_TXOFF_CONFIG_MPW) ? + "Legacy MPW" : "Enhanced MPW") : "No MPW", + (olx & MLX5_TXOFF_CONFIG_MULTI) ? + " + MULTI" : "", + (olx & MLX5_TXOFF_CONFIG_TSO) ? + " + TSO" : "", + (olx & MLX5_TXOFF_CONFIG_SWP) ? + " + SWP" : "", + (olx & MLX5_TXOFF_CONFIG_CSUM) ? + " + CSUM" : "", + (olx & MLX5_TXOFF_CONFIG_INLINE) ? + " + INLINE" : "", + (olx & MLX5_TXOFF_CONFIG_VLAN) ? + " + VLAN" : "", + (olx & MLX5_TXOFF_CONFIG_METADATA) ? + " + METADATA" : ""); + return 0; + } + } + return -EINVAL; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h new file mode 100644 index 000000000..48f2b7941 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx.h @@ -0,0 +1,683 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_H_ +#define RTE_PMD_MLX5_RXTX_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_common.h> +#include <rte_hexdump.h> +#include <rte_atomic.h> +#include <rte_spinlock.h> +#include <rte_io.h> +#include <rte_bus_pci.h> +#include <rte_malloc.h> + +#include <mlx5_glue.h> +#include <mlx5_prm.h> +#include <mlx5_common.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5.h" +#include "mlx5_autoconf.h" + +/* Support tunnel matching. */ +#define MLX5_FLOW_TUNNEL 10 + +/* Mbuf dynamic flag offset for inline. */ +extern uint64_t rte_net_mlx5_dynf_inline_mask; + +struct mlx5_rxq_stats { +#ifdef MLX5_PMD_SOFT_COUNTERS + uint64_t ipackets; /**< Total of successfully received packets. */ + uint64_t ibytes; /**< Total of successfully received bytes. */ +#endif + uint64_t idropped; /**< Total of packets dropped when RX ring full. 
*/ + uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */ +}; + +struct mlx5_txq_stats { +#ifdef MLX5_PMD_SOFT_COUNTERS + uint64_t opackets; /**< Total of successfully sent packets. */ + uint64_t obytes; /**< Total of successfully sent bytes. */ +#endif + uint64_t oerrors; /**< Total number of failed transmitted packets. */ +}; + +struct mlx5_priv; + +/* Compressed CQE context. */ +struct rxq_zip { + uint16_t ai; /* Array index. */ + uint16_t ca; /* Current array index. */ + uint16_t na; /* Next array index. */ + uint16_t cq_ci; /* The next CQE. */ + uint32_t cqe_cnt; /* Number of CQEs. */ +}; + +/* Multi-Packet RQ buffer header. */ +struct mlx5_mprq_buf { + struct rte_mempool *mp; + rte_atomic16_t refcnt; /* Atomically accessed refcnt. */ + uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */ + struct rte_mbuf_ext_shared_info shinfos[]; + /* + * Shared information per stride. + * More memory will be allocated for the first stride head-room and for + * the strides data. + */ +} __rte_cache_aligned; + +/* Get pointer to the first stride. */ +#define mlx5_mprq_buf_addr(ptr, strd_n) (RTE_PTR_ADD((ptr), \ + sizeof(struct mlx5_mprq_buf) + \ + (strd_n) * \ + sizeof(struct rte_mbuf_ext_shared_info) + \ + RTE_PKTMBUF_HEADROOM)) + +#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6 +#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 + +enum mlx5_rxq_err_state { + MLX5_RXQ_ERR_STATE_NO_ERROR = 0, + MLX5_RXQ_ERR_STATE_NEED_RESET, + MLX5_RXQ_ERR_STATE_NEED_READY, +}; + +/* RX queue descriptor. */ +struct mlx5_rxq_data { + unsigned int csum:1; /* Enable checksum offloading. */ + unsigned int hw_timestamp:1; /* Enable HW timestamp. */ + unsigned int vlan_strip:1; /* Enable VLAN stripping. */ + unsigned int crc_present:1; /* CRC must be subtracted. */ + unsigned int sges_n:3; /* Log 2 of SGEs (max buffers per packet). */ + unsigned int cqe_n:4; /* Log 2 of CQ elements. */ + unsigned int elts_n:4; /* Log 2 of Mbufs. */ + unsigned int rss_hash:1; /* RSS hash result is enabled. */ + unsigned int mark:1; /* Marked flow available on the queue. */ + unsigned int strd_num_n:5; /* Log 2 of the number of stride. */ + unsigned int strd_sz_n:4; /* Log 2 of stride size. */ + unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */ + unsigned int err_state:2; /* enum mlx5_rxq_err_state. */ + unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */ + unsigned int lro:1; /* Enable LRO. */ + unsigned int dynf_meta:1; /* Dynamic metadata is configured. */ + volatile uint32_t *rq_db; + volatile uint32_t *cq_db; + uint16_t port_id; + uint32_t rq_ci; + uint16_t consumed_strd; /* Number of consumed strides in WQE. */ + uint32_t rq_pi; + uint32_t cq_ci; + uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */ + union { + struct rxq_zip zip; /* Compressed context. */ + uint16_t decompressed; + /* Number of ready mbufs decompressed from the CQ. */ + }; + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ + volatile void *wqes; + volatile struct mlx5_cqe(*cqes)[]; + RTE_STD_C11 + union { + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; + }; + struct rte_mempool *mp; + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ + uint16_t idx; /* Queue index. */ + struct mlx5_rxq_stats stats; + rte_xmm_t mbuf_initializer; /* Default rearm/flags for vectorized Rx. 
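
Several fields above (elts_n, cqe_n, strd_num_n, strd_sz_n) store sizes as log2
values packed into small bit-fields; the actual counts are recovered by
shifting. A short sketch with example values in the range suggested by
MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES above:

#include <stdio.h>

int
main(void)
{
        unsigned int strd_num_n = 9;    /* log2 of strides per MPRQ WQE */
        unsigned int strd_sz_n = 11;    /* log2 of stride size in bytes */
        unsigned int strd_n = 1u << strd_num_n;
        unsigned int strd_sz = 1u << strd_sz_n;

        printf("%u strides of %u bytes -> %u KiB of data per MPRQ buffer\n",
               strd_n, strd_sz, strd_n * strd_sz / 1024);
        return 0;
}
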
*/ + struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ + void *cq_uar; /* CQ user access region. */ + uint32_t cqn; /* CQ number. */ + uint8_t cq_arm_sn; /* CQ arm seq number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock_cq; + /* CQ (UAR) access lock required for 32bit implementations */ +#endif + uint32_t tunnel; /* Tunnel information. */ + uint64_t flow_meta_mask; + int32_t flow_meta_offset; +} __rte_cache_aligned; + +enum mlx5_rxq_obj_type { + MLX5_RXQ_OBJ_TYPE_IBV, /* mlx5_rxq_obj with ibv_wq. */ + MLX5_RXQ_OBJ_TYPE_DEVX_RQ, /* mlx5_rxq_obj with mlx5_devx_rq. */ + MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN, + /* mlx5_rxq_obj with mlx5_devx_rq and hairpin support. */ +}; + +enum mlx5_rxq_type { + MLX5_RXQ_TYPE_STANDARD, /* Standard Rx queue. */ + MLX5_RXQ_TYPE_HAIRPIN, /* Hairpin Rx queue. */ + MLX5_RXQ_TYPE_UNDEFINED, +}; + +/* Verbs/DevX Rx queue elements. */ +struct mlx5_rxq_obj { + LIST_ENTRY(mlx5_rxq_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */ + struct ibv_cq *cq; /* Completion Queue. */ + enum mlx5_rxq_obj_type type; + RTE_STD_C11 + union { + struct ibv_wq *wq; /* Work Queue. */ + struct mlx5_devx_obj *rq; /* DevX object for Rx Queue. */ + }; + struct ibv_comp_channel *channel; +}; + +/* RX queue control descriptor. */ +struct mlx5_rxq_ctrl { + struct mlx5_rxq_data rxq; /* Data path structure. */ + LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_rxq_obj *obj; /* Verbs/DevX elements. */ + struct mlx5_priv *priv; /* Back pointer to private data. */ + enum mlx5_rxq_type type; /* Rxq type. */ + unsigned int socket; /* CPU socket ID for allocations. */ + unsigned int irq:1; /* Whether IRQ is enabled. */ + unsigned int dbr_umem_id_valid:1; /* dbr_umem_id holds a valid value. */ + uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */ + uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */ + uint32_t wqn; /* WQ number. */ + uint16_t dump_file_n; /* Number of dump files. */ + uint32_t dbr_umem_id; /* Storing door-bell information, */ + uint64_t dbr_offset; /* needed when freeing door-bell. */ + struct mlx5dv_devx_umem *wq_umem; /* WQ buffer registration info. */ + struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */ +}; + +enum mlx5_ind_tbl_type { + MLX5_IND_TBL_TYPE_IBV, + MLX5_IND_TBL_TYPE_DEVX, +}; + +/* Indirection table. */ +struct mlx5_ind_table_obj { + LIST_ENTRY(mlx5_ind_table_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + enum mlx5_ind_tbl_type type; + RTE_STD_C11 + union { + struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */ + struct mlx5_devx_obj *rqt; /* DevX RQT object. */ + }; + uint32_t queues_n; /**< Number of queues in the list. */ + uint16_t queues[]; /**< Queue list. */ +}; + +/* Hash Rx queue. */ +struct mlx5_hrxq { + ILIST_ENTRY(uint32_t)next; /* Index to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_ind_table_obj *ind_table; /* Indirection table. */ + RTE_STD_C11 + union { + struct ibv_qp *qp; /* Verbs queue pair. */ + struct mlx5_devx_obj *tir; /* DevX TIR object. */ + }; +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + void *action; /* DV QP action pointer. */ +#endif + uint64_t hash_fields; /* Verbs Hash fields. */ + uint32_t rss_key_len; /* Hash key length in bytes. */ + uint8_t rss_key[]; /* Hash key. 
*/ +}; + +/* TX queue send local data. */ +__extension__ +struct mlx5_txq_local { + struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */ + struct rte_mbuf *mbuf; /* first mbuf to process. */ + uint16_t pkts_copy; /* packets copied to elts. */ + uint16_t pkts_sent; /* packets sent. */ + uint16_t pkts_loop; /* packets sent on loop entry. */ + uint16_t elts_free; /* available elts remain. */ + uint16_t wqe_free; /* available wqe remain. */ + uint16_t mbuf_off; /* data offset in current mbuf. */ + uint16_t mbuf_nseg; /* number of remaining mbuf. */ +}; + +/* TX queue descriptor. */ +__extension__ +struct mlx5_txq_data { + uint16_t elts_head; /* Current counter in (*elts)[]. */ + uint16_t elts_tail; /* Counter of first element awaiting completion. */ + uint16_t elts_comp; /* elts index since last completion request. */ + uint16_t elts_s; /* Number of mbuf elements. */ + uint16_t elts_m; /* Mask for mbuf elements indices. */ + /* Fields related to elts mbuf storage. */ + uint16_t wqe_ci; /* Consumer index for work queue. */ + uint16_t wqe_pi; /* Producer index for work queue. */ + uint16_t wqe_s; /* Number of WQ elements. */ + uint16_t wqe_m; /* Mask Number for WQ elements. */ + uint16_t wqe_comp; /* WQE index since last completion request. */ + uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */ + /* WQ related fields. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ + uint16_t cq_pi; /* Production index for completion queue. */ + uint16_t cqe_s; /* Number of CQ elements. */ + uint16_t cqe_m; /* Mask for CQ indices. */ + /* CQ related fields. */ + uint16_t elts_n:4; /* elts[] length (in log2). */ + uint16_t cqe_n:4; /* Number of CQ elements (in log2). */ + uint16_t wqe_n:4; /* Number of WQ elements (in log2). */ + uint16_t tso_en:1; /* When set hardware TSO is enabled. */ + uint16_t tunnel_en:1; + /* When set TX offload for tunneled packets are supported. */ + uint16_t swp_en:1; /* Whether SW parser is enabled. */ + uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */ + uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */ + uint16_t db_heu:1; /* Doorbell heuristic write barrier. */ + uint16_t inlen_send; /* Ordinary send data inline size. */ + uint16_t inlen_empw; /* eMPW max packet size to inline. */ + uint16_t inlen_mode; /* Minimal data length to inline. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + uint64_t offloads; /* Offloads for Tx Queue. */ + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + struct mlx5_wqe *wqes; /* Work queue. */ + struct mlx5_wqe *wqes_end; /* Work queue array limit. */ +#ifdef RTE_LIBRTE_MLX5_DEBUG + uint32_t *fcqs; /* Free completion queue (debug extended). */ +#else + uint16_t *fcqs; /* Free completion queue. */ +#endif + volatile struct mlx5_cqe *cqes; /* Completion queue. */ + volatile uint32_t *qp_db; /* Work queue doorbell. */ + volatile uint32_t *cq_db; /* Completion queue doorbell. */ + uint16_t port_id; /* Port ID of device. */ + uint16_t idx; /* Queue index. */ + struct mlx5_txq_stats stats; /* TX queue counters. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock; + /* UAR access lock required for 32bit implementations */ +#endif + struct rte_mbuf *elts[0]; + /* Storage for queued packets, must be the last field. */ +} __rte_cache_aligned; + +enum mlx5_txq_obj_type { + MLX5_TXQ_OBJ_TYPE_IBV, /* mlx5_txq_obj with ibv_wq. */ + MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN, + /* mlx5_txq_obj with mlx5_devx_tq and hairpin support. 
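
The elts_m/wqe_m masks above pair with the free-running elts_head/wqe_ci
counters to index power-of-two rings, as in
txq->elts[txq->elts_head++ & txq->elts_m] earlier in this file. A minimal
sketch of that masking with an example ring size:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint16_t elts_s = 256;          /* ring entries, power of two */
        uint16_t elts_m = elts_s - 1;   /* index mask, as in the struct above */
        uint16_t elts_head = 65534;     /* free-running producer counter */
        int i;

        for (i = 0; i < 4; i++) {
                unsigned int slot = elts_head & elts_m;

                printf("counter %5u -> slot %u\n", (unsigned)elts_head, slot);
                elts_head++;
        }
        return 0;
}

Because the mask is one less than a power of two, the modulo reduction keeps
working when the 16-bit counter wraps from 65535 back to 0.
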
*/ +}; + +enum mlx5_txq_type { + MLX5_TXQ_TYPE_STANDARD, /* Standard Tx queue. */ + MLX5_TXQ_TYPE_HAIRPIN, /* Hairpin Rx queue. */ +}; + +/* Verbs/DevX Tx queue elements. */ +struct mlx5_txq_obj { + LIST_ENTRY(mlx5_txq_obj) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_txq_ctrl *txq_ctrl; /* Pointer to the control queue. */ + enum mlx5_txq_obj_type type; /* The txq object type. */ + RTE_STD_C11 + union { + struct { + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ + }; + struct { + struct mlx5_devx_obj *sq; + /* DevX object for Sx queue. */ + struct mlx5_devx_obj *tis; /* The TIS object. */ + }; + }; +}; + +/* TX queue control descriptor. */ +struct mlx5_txq_ctrl { + LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */ + rte_atomic32_t refcnt; /* Reference counter. */ + unsigned int socket; /* CPU socket ID for allocations. */ + enum mlx5_txq_type type; /* The txq ctrl type. */ + unsigned int max_inline_data; /* Max inline data. */ + unsigned int max_tso_header; /* Max TSO header size. */ + struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */ + struct mlx5_priv *priv; /* Back pointer to private data. */ + off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */ + void *bf_reg; /* BlueFlame register from Verbs. */ + uint16_t dump_file_n; /* Number of dump files. */ + struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */ + struct mlx5_txq_data txq; /* Data path structure. */ + /* Must be the last field in the structure, contains elts[]. */ +}; + +#define MLX5_TX_BFREG(txq) \ + (MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx]) + +/* mlx5_rxq.c */ + +extern uint8_t rss_hash_default_key[]; + +int mlx5_check_mprq_support(struct rte_eth_dev *dev); +int mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq); +int mlx5_mprq_enabled(struct rte_eth_dev *dev); +int mlx5_mprq_free_mp(struct rte_eth_dev *dev); +int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev); +int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +int mlx5_rx_hairpin_queue_setup + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +void mlx5_rx_queue_release(void *dpdk_rxq); +int mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev); +void mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev); +int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +struct mlx5_rxq_obj *mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_rxq_obj_type type); +int mlx5_rxq_obj_verify(struct rte_eth_dev *dev); +struct mlx5_rxq_ctrl *mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +struct mlx5_rxq_ctrl *mlx5_rxq_hairpin_new + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +struct mlx5_rxq_ctrl *mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_verify(struct rte_eth_dev *dev); +int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl); +int mlx5_ind_table_obj_verify(struct rte_eth_dev *dev); +uint32_t mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t 
*queues, uint32_t queues_n, + int tunnel __rte_unused); +uint32_t mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n); +int mlx5_hrxq_release(struct rte_eth_dev *dev, uint32_t hxrq_idx); +int mlx5_hrxq_verify(struct rte_eth_dev *dev); +enum mlx5_rxq_type mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_hrxq *mlx5_hrxq_drop_new(struct rte_eth_dev *dev); +void mlx5_hrxq_drop_release(struct rte_eth_dev *dev); +uint64_t mlx5_get_rx_port_offloads(void); +uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev); + +/* mlx5_txq.c */ + +int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf); +int mlx5_tx_hairpin_queue_setup + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +void mlx5_tx_queue_release(void *dpdk_txq); +int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd); +struct mlx5_txq_obj *mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_txq_obj_type type); +struct mlx5_txq_obj *mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_obj_release(struct mlx5_txq_obj *txq_ibv); +int mlx5_txq_obj_verify(struct rte_eth_dev *dev); +struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +struct mlx5_txq_ctrl *mlx5_txq_hairpin_new + (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf); +struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_verify(struct rte_eth_dev *dev); +void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl); +void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl); +uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev); + +/* mlx5_rxtx.c */ + +extern uint32_t mlx5_ptype_table[]; +extern uint8_t mlx5_cksum_table[]; +extern uint8_t mlx5_swp_types_table[]; + +void mlx5_set_ptype_table(void); +void mlx5_set_cksum_table(void); +void mlx5_set_swp_types_table(void); +uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); +void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq); +__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec); +void mlx5_mprq_buf_free_cb(void *addr, void *opaque); +void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf); +uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset); +int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset); +uint32_t mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id); +void mlx5_dump_debug_information(const char *path, const char *title, + const void *buf, unsigned int len); +int mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, + const struct mlx5_mp_arg_queue_state_modify *sm); +void mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_rxq_info *qinfo); +void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id, + struct rte_eth_txq_info *qinfo); +int 
mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t rx_queue_id, + struct rte_eth_burst_mode *mode); +int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id, + struct rte_eth_burst_mode *mode); + +/* Vectorized version of mlx5_rxtx.c */ +int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); +int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); +uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +/* mlx5_mr.c */ + +void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); +uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); +uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb); +uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp); +int mlx5_dma_map(struct rte_pci_device *pdev, void *addr, uint64_t iova, + size_t len); +int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova, + size_t len); + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64_relaxed(uint64_t val, void *addr, + rte_spinlock_t *lock __rte_unused) +{ +#ifdef RTE_ARCH_64 + *(uint64_t *)addr = val; +#else /* !RTE_ARCH_64 */ + rte_spinlock_lock(lock); + *(uint32_t *)addr = val; + rte_io_wmb(); + *((uint32_t *)addr + 1) = val >> 32; + rte_spinlock_unlock(lock); +#endif +} + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures while guaranteeing the order of execution with the + * code being executed. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock) +{ + rte_io_wmb(); + __mlx5_uar_write64_relaxed(val, addr, lock); +} + +/* Assist macros, used instead of directly calling the functions they wrap. */ +#ifdef RTE_ARCH_64 +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, NULL) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL) +#else +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, lock) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock) +#endif + +/** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the + * cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static inline struct rte_mempool * +mlx5_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_CLONED(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** + * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx + * as mempool is pre-configured and static. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + uint32_t lkey; + + /* Linear search on MR cache array. 
*/ + lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (Binary Search) on miss. */ + return mlx5_rx_addr2mr_bh(rxq, addr); +} + +#define mlx5_rx_mb2mr(rxq, mb) mlx5_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + +/** + * Query LKey from a packet buffer for Tx. If not found, add the mempool. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Address to search. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +{ + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey; + + /* Check generation bit to see if there's any change on existing MRs. */ + if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen)) + mlx5_mr_flush_local_cache(mr_ctrl); + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half on miss. */ + return mlx5_tx_mb2mr_bh(txq, mb); +} + +/** + * Ring TX queue doorbell and flush the update if requested. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the last WQE posted in the NIC. + * @param cond + * Request for write memory barrier after BlueFlame update. + */ +static __rte_always_inline void +mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, + int cond) +{ + uint64_t *dst = MLX5_TX_BFREG(txq); + volatile uint64_t *src = ((volatile uint64_t *)wqe); + + rte_cio_wmb(); + *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. */ + rte_wmb(); + mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock); + if (cond) + rte_wmb(); +} + +/** + * Ring TX queue doorbell and flush the update by write memory barrier. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the last WQE posted in the NIC. + */ +static __rte_always_inline void +mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) +{ + mlx5_tx_dbrec_cond_wmb(txq, wqe, 1); +} + +#endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c new file mode 100644 index 000000000..1518bdd5b --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#if defined RTE_ARCH_X86_64 +#include "mlx5_rxtx_vec_sse.h" +#elif defined RTE_ARCH_ARM64 +#include "mlx5_rxtx_vec_neon.h" +#elif defined RTE_ARCH_PPC_64 +#include "mlx5_rxtx_vec_altivec.h" +#else +#error "This should not be compiled if SIMD instructions are not supported." +#endif + +/** + * Skip error packets. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +static uint16_t +rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + uint16_t n = 0; + unsigned int i; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t err_bytes = 0; +#endif + + for (i = 0; i < pkts_n; ++i) { + struct rte_mbuf *pkt = pkts[i]; + + if (pkt->packet_type == RTE_PTYPE_ALL_MASK || rxq->err_state) { +#ifdef MLX5_PMD_SOFT_COUNTERS + err_bytes += PKT_LEN(pkt); +#endif + rte_pktmbuf_free_seg(pkt); + } else { + pkts[n++] = pkt; + } + } + rxq->stats.idropped += (pkts_n - n); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Correct counters of errored completions. */ + rxq->stats.ipackets -= (pkts_n - n); + rxq->stats.ibytes -= err_bytes; +#endif + mlx5_rx_err_handle(rxq, 1); + return n; +} + +/** + * DPDK callback for vectorized RX. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + uint16_t nb_rx; + uint64_t err = 0; + + nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err); + if (unlikely(err | rxq->err_state)) + nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx); + return nb_rx; +} + +/** + * Check a RX queue can support vectorized RX. + * + * @param rxq + * Pointer to RX queue. + * + * @return + * 1 if supported, negative errno value if not. + */ +int __rte_cold +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) +{ + struct mlx5_rxq_ctrl *ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + + if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) + return -ENOTSUP; + if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) + return -ENOTSUP; + if (rxq->lro) + return -ENOTSUP; + return 1; +} + +/** + * Check a device can support vectorized RX. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. + */ +int __rte_cold +mlx5_check_vec_rx_support(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint16_t i; + + if (!priv->config.rx_vec_en) + return -ENOTSUP; + if (mlx5_mprq_enabled(dev)) + return -ENOTSUP; + /* All the configured queues should support. 
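+ * Vectorized Rx is an all-or-nothing choice per device: if any configured
+ * queue fails the per-queue check below, the function returns -ENOTSUP for
+ * the whole port.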
*/ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (!rxq) + continue; + if (mlx5_rxq_check_vec_support(rxq) < 0) + break; + } + if (i != priv->rxqs_n) + return -ENOTSUP; + return 1; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h new file mode 100644 index 000000000..6ddcbfb0a --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_H_ +#define RTE_PMD_MLX5_RXTX_VEC_H_ + +#include <rte_common.h> +#include <rte_mbuf.h> + +#include <mlx5_prm.h> + +#include "mlx5_autoconf.h" + +#include "mlx5_mr.h" + +/* HW checksum offload capabilities of vectorized Tx. */ +#define MLX5_VEC_TX_CKSUM_OFFLOAD_CAP \ + (DEV_TX_OFFLOAD_IPV4_CKSUM | \ + DEV_TX_OFFLOAD_UDP_CKSUM | \ + DEV_TX_OFFLOAD_TCP_CKSUM | \ + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) + +/* + * Compile time sanity check for vectorized functions. + */ + +#define S_ASSERT_RTE_MBUF(s) \ + static_assert(s, "A field of struct rte_mbuf is changed") +#define S_ASSERT_MLX5_CQE(s) \ + static_assert(s, "A field of struct mlx5_cqe is changed") + +/* rxq_cq_decompress_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, hash) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + +/* rxq_cq_to_ptype_oflags_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, ol_flags) == + offsetof(struct rte_mbuf, rearm_data) + 8); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, rearm_data) == + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + +/* rxq_burst_v() */ +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); +S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) == + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); +#if (RTE_CACHE_LINE_SIZE == 128) +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 64); +#else +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 0); +#endif +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rx_hash_res) == + offsetof(struct mlx5_cqe, pkt_info) + 12); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rsvd1) + 11 == + offsetof(struct mlx5_cqe, hdr_type_etc)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, vlan_info) == + offsetof(struct mlx5_cqe, hdr_type_etc) + 2); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, lro_num_seg) + 12 == + offsetof(struct mlx5_cqe, byte_cnt)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) == + RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8)); +S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) == + offsetof(struct mlx5_cqe, sop_drop_qpn) + 7); + +/** + * Replenish buffers for RX in bulk. + * + * @param rxq + * Pointer to RX queue structure. + * @param n + * Number of buffers to be replenished. 
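+ *
+ * Note: the requested count is clamped so the bulk allocation never crosses
+ * the end of the ring. For example (illustrative sketch, assuming
+ * MLX5_VPMD_DESCS_PER_LOOP == 4), with elts_n == 9 (q_n == 512) and
+ * rq_ci == 510, a request of n == 128 is reduced to
+ * RTE_MIN(128 - 4, 512 - 510) == 2; the remainder is replenished on a
+ * subsequent call.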
+ */ +static inline void +mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) +{ + const uint16_t q_n = 1 << rxq->elts_n; + const uint16_t q_mask = q_n - 1; + uint16_t elts_idx = rxq->rq_ci & q_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; + unsigned int i; + + MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); + MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); + MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > + MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + for (i = 0; i < n; ++i) { + void *buf_addr; + + /* + * In order to support the mbufs with external attached + * data buffer we should use the buf_addr pointer instead of + * rte_mbuf_buf_addr(). It touches the mbuf itself and may + * impact the performance. + */ + buf_addr = elts[i]->buf_addr; + wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr + + RTE_PKTMBUF_HEADROOM); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + } + rxq->rq_ci += n; + /* Prevent overflowing into consumed mbufs. */ + elts_idx = rxq->rq_ci & q_mask; + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h new file mode 100644 index 000000000..26715ef45 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h @@ -0,0 +1,1114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ +#define RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> + +#include <rte_altivec.h> + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + vector unsigned char mbp; + + mbp = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos]); + *(vector unsigned char *)&pkts[pos] = mbp; + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. 
+ * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info; + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + const vector unsigned char zero = (vector unsigned char){0}; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const vector unsigned char shuf_mask1 = (vector unsigned char){ + -1, -1, -1, -1, /* skip packet_type */ + 7, 6, -1, -1, /* bswap16, pkt_len */ + 7, 6, /* bswap16, data_len */ + -1, -1, /* skip vlan_tci */ + 3, 2, 1, 0}; /* bswap32, rss */ + const vector unsigned char shuf_mask2 = (vector unsigned char){ + -1, -1, -1, -1, /* skip packet_type */ + 15, 14, -1, -1, /* bswap16, pkt_len */ + 15, 14, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 11, 10, 9, 8}; /* bswap32, rss */ + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const vector unsigned char rearm = + (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&t_pkt->rearm_data); + const vector unsigned char rxdf = + (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&t_pkt->rx_descriptor_fields1); + const vector unsigned char crc_adj = + (vector unsigned char)(vector unsigned short){ + 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0}; + const vector unsigned short rxdf_sel_mask = + (vector unsigned short){ + 0xffff, 0xffff, 0, 0, 0, 0xffff, 0, 0}; + const uint32_t flow_tag = t_pkt->hash.fdir.hi; + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned char ones = vec_splat_u8(-1); + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const vector unsigned char len_shuf_mask = (vector unsigned char){ + 3, 2, 11, 10, + 7, 6, 15, 14, + -1, -1, -1, -1, + -1, -1, -1, -1}; +#endif + + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + vector unsigned char mcqe1, mcqe2; + vector unsigned char rxdf1, rxdf2; +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned short mcqe_sel_mask = + (vector unsigned short){0, 0, 0xffff, 0xffff, + 0, 0, 0xfff, 0xffff}; + const vector unsigned char lower_half = { + 0, 1, 4, 5, 8, 9, 12, 13, 16, + 17, 20, 21, 24, 25, 28, 29}; + const vector unsigned char upper_half = { + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + vector unsigned short left, right; + vector unsigned char byte_cnt, invalid_mask; + vector unsigned long lshift; + __attribute__((altivec(vector__))) + __attribute__((altivec(bool__))) + unsigned long long shmask; + const vector unsigned long shmax = {64, 64}; +#endif + + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + + /* A.1 load mCQEs into a 128bit register. 
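+ * Each struct mlx5_mini_cqe8 is 8 bytes, so a single 16-byte load picks up
+ * two mini-CQEs; the two loads below (at mcq[pos % 8] and mcq[pos % 8 + 2])
+ * therefore cover the four descriptors processed per loop iteration.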
*/ + mcqe1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&mcq[pos % 8]); + mcqe2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&mcq[pos % 8 + 2]); + + /* B.1 store rearm data to mbuf. */ + *(vector unsigned char *) + &elts[pos]->rearm_data = rearm; + *(vector unsigned char *) + &elts[pos + 1]->rearm_data = rearm; + + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = vec_perm(mcqe1, zero, shuf_mask1); + rxdf2 = vec_perm(mcqe1, zero, shuf_mask2); + rxdf1 = (vector unsigned char) + ((vector unsigned short)rxdf1 - + (vector unsigned short)crc_adj); + rxdf2 = (vector unsigned char) + ((vector unsigned short)rxdf2 - + (vector unsigned short)crc_adj); + rxdf1 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf1, + (vector unsigned short)rxdf, rxdf_sel_mask); + rxdf2 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf2, + (vector unsigned short)rxdf, rxdf_sel_mask); + + /* D.1 store rx_descriptor_fields1. */ + *(vector unsigned char *) + &elts[pos]->rx_descriptor_fields1 = rxdf1; + *(vector unsigned char *) + &elts[pos + 1]->rx_descriptor_fields1 = rxdf2; + + /* B.1 store rearm data to mbuf. */ + *(vector unsigned char *) + &elts[pos + 2]->rearm_data = rearm; + *(vector unsigned char *) + &elts[pos + 3]->rearm_data = rearm; + + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = vec_perm(mcqe2, zero, shuf_mask1); + rxdf2 = vec_perm(mcqe2, zero, shuf_mask2); + rxdf1 = (vector unsigned char) + ((vector unsigned short)rxdf1 - + (vector unsigned short)crc_adj); + rxdf2 = (vector unsigned char) + ((vector unsigned short)rxdf2 - + (vector unsigned short)crc_adj); + rxdf1 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf1, + (vector unsigned short)rxdf, rxdf_sel_mask); + rxdf2 = (vector unsigned char) + vec_sel((vector unsigned short)rxdf2, + (vector unsigned short)rxdf, rxdf_sel_mask); + + /* D.1 store rx_descriptor_fields1. 
*/ + *(vector unsigned char *) + &elts[pos + 2]->rx_descriptor_fields1 = rxdf1; + *(vector unsigned char *) + &elts[pos + 3]->rx_descriptor_fields1 = rxdf2; + +#ifdef MLX5_PMD_SOFT_COUNTERS + invalid_mask = (vector unsigned char)(vector unsigned long){ + (mcqe_n - pos) * sizeof(uint16_t) * 8, 0}; + + lshift = + vec_splat((vector unsigned long)invalid_mask, 0); + shmask = vec_cmpgt(shmax, lshift); + invalid_mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + invalid_mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)invalid_mask, shmask); + + mcqe1 = (vector unsigned char) + vec_sro((vector unsigned short)mcqe1, + (vector unsigned char){32}), + byte_cnt = (vector unsigned char) + vec_sel((vector unsigned short)mcqe1, + (vector unsigned short)mcqe2, mcqe_sel_mask); + byte_cnt = vec_perm(byte_cnt, zero, len_shuf_mask); + byte_cnt = (vector unsigned char) + vec_andc((vector unsigned long)byte_cnt, + (vector unsigned long)invalid_mask); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + rcvd_byte += ((vector unsigned long)byte_cnt)[0]; +#endif + + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)&(cq + pos)->pkt_info; + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; + +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. + * @param cqes[4] + * Array of four 16bytes completions extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. 
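+ *
+ * Per-packet scalar equivalent of the final lookup (illustrative sketch):
+ *   pkts[i]->packet_type = mlx5_ptype_table[pt_idx] |
+ *                          !!(pt_idx & (1 << 6)) * rxq->tunnel;
+ * i.e. bit 6 of the index extracted from the CQE marks a tunneled packet
+ * and conditionally ORs the tunnel ptype into the result.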
+ */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, + vector unsigned char cqes[4], vector unsigned char op_err, + struct rte_mbuf **pkts) +{ + vector unsigned char pinfo0, pinfo1; + vector unsigned char pinfo, ptype; + vector unsigned char ol_flags = (vector unsigned char) + (vector unsigned int){ + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP, + rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP}; + vector unsigned char cv_flags; + const vector unsigned char zero = (vector unsigned char){0}; + const vector unsigned char ptype_mask = + (vector unsigned char)(vector unsigned int){ + 0x0000fd06, 0x0000fd06, 0x0000fd06, 0x0000fd06}; + const vector unsigned char ptype_ol_mask = + (vector unsigned char)(vector unsigned int){ + 0x00000106, 0x00000106, 0x00000106, 0x00000106}; + const vector unsigned char pinfo_mask = + (vector unsigned char)(vector unsigned int){ + 0x00000003, 0x00000003, 0x00000003, 0x00000003}; + const vector unsigned char cv_flag_sel = (vector unsigned char){ + 0, (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vector unsigned char cv_mask = + (vector unsigned char)(vector unsigned int){ + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED}; + const vector unsigned char mbuf_init = + (vector unsigned char)vec_vsx_ld + (0, (vector unsigned char *)&rxq->mbuf_initializer); + const vector unsigned short rearm_sel_mask = + (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0}; + vector unsigned char rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + /* Extract pkt_info field. */ + pinfo0 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + pinfo1 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + pinfo = (vector unsigned char) + vec_mergeh((vector unsigned long)pinfo0, + (vector unsigned long)pinfo1); + + /* Extract hdr_type_etc field. 
*/ + pinfo0 = (vector unsigned char) + vec_mergel((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + pinfo1 = (vector unsigned char) + vec_mergel((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + ptype = (vector unsigned char) + vec_mergeh((vector unsigned long)pinfo0, + (vector unsigned long)pinfo1); + + if (rxq->mark) { + const vector unsigned char pinfo_ft_mask = + (vector unsigned char)(vector unsigned int){ + 0xffffff00, 0xffffff00, 0xffffff00, 0xffffff00}; + const vector unsigned char fdir_flags = + (vector unsigned char)(vector unsigned int){ + PKT_RX_FDIR, PKT_RX_FDIR, + PKT_RX_FDIR, PKT_RX_FDIR}; + vector unsigned char fdir_id_flags = + (vector unsigned char)(vector unsigned int){ + PKT_RX_FDIR_ID, PKT_RX_FDIR_ID, + PKT_RX_FDIR_ID, PKT_RX_FDIR_ID}; + vector unsigned char flow_tag, invalid_mask; + + flow_tag = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)pinfo_ft_mask); + + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)flow_tag, + (vector unsigned int)zero); + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long) + vec_andc((vector unsigned long)fdir_flags, + (vector unsigned long)invalid_mask)); + + /* Mask out invalid entries. */ + fdir_id_flags = (vector unsigned char) + vec_andc((vector unsigned long)fdir_id_flags, + (vector unsigned long)invalid_mask); + + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long) + vec_andc((vector unsigned long)fdir_id_flags, + (vector unsigned long) + vec_cmpeq((vector unsigned int)flow_tag, + (vector unsigned int)pinfo_ft_mask))); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = (vector unsigned char) + vec_and((vector unsigned long)ptype, + (vector unsigned long)ptype_mask); + pinfo = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)pinfo_mask); + pinfo = (vector unsigned char) + vec_sl((vector unsigned int)pinfo, + (vector unsigned int){16, 16, 16, 16}); + + /* Make pinfo has merged fields for ol_flags calculation. */ + pinfo = (vector unsigned char) + vec_or((vector unsigned long)ptype, + (vector unsigned long)pinfo); + ptype = (vector unsigned char) + vec_sr((vector unsigned int)pinfo, + (vector unsigned int){10, 10, 10, 10}); + ptype = (vector unsigned char) + vec_packs((vector unsigned int)ptype, + (vector unsigned int)zero); + + /* Errored packets will have RTE_PTYPE_ALL_MASK. 
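+ * Such packets still consume an mbuf; they are dropped and freed later by
+ * rxq_handle_pending_error() before the burst returns to the application.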
*/ + op_err = (vector unsigned char) + vec_sr((vector unsigned short)op_err, + (vector unsigned short){8, 8, 8, 8, 8, 8, 8, 8}); + ptype = (vector unsigned char) + vec_or((vector unsigned long)ptype, + (vector unsigned long)op_err); + + pt_idx0 = (uint8_t)((vector unsigned char)ptype)[0]; + pt_idx1 = (uint8_t)((vector unsigned char)ptype)[2]; + pt_idx2 = (uint8_t)((vector unsigned char)ptype)[4]; + pt_idx3 = (uint8_t)((vector unsigned char)ptype)[6]; + + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + + /* Fill flags for checksum and VLAN. */ + pinfo = (vector unsigned char) + vec_and((vector unsigned long)pinfo, + (vector unsigned long)ptype_ol_mask); + pinfo = vec_perm(cv_flag_sel, zero, pinfo); + + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = (vector unsigned char) + vec_sl((vector unsigned int)pinfo, + (vector unsigned int){9, 9, 9, 9}); + cv_flags = (vector unsigned char) + vec_or((vector unsigned long)pinfo, + (vector unsigned long)cv_flags); + + /* Move back flags to start from byte[0]. */ + cv_flags = (vector unsigned char) + vec_sr((vector unsigned int)cv_flags, + (vector unsigned int){8, 8, 8, 8}); + + /* Mask out garbage bits. */ + cv_flags = (vector unsigned char) + vec_and((vector unsigned long)cv_flags, + (vector unsigned long)cv_mask); + + /* Merge to ol_flags. */ + ol_flags = (vector unsigned char) + vec_or((vector unsigned long)ol_flags, + (vector unsigned long)cv_flags); + + /* Merge mbuf_init and ol_flags. */ + rearm0 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_slo((vector unsigned short)ol_flags, + (vector unsigned char){64}), rearm_sel_mask); + rearm1 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_slo((vector unsigned short)ol_flags, + (vector unsigned char){32}), rearm_sel_mask); + rearm2 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short)ol_flags, rearm_sel_mask); + rearm3 = (vector unsigned char) + vec_sel((vector unsigned short)mbuf_init, + (vector unsigned short) + vec_sro((vector unsigned short)ol_flags, + (vector unsigned char){32}), rearm_sel_mask); + + /* Write 8B rearm_data and 8B ol_flags. */ + vec_vsx_st(rearm0, 0, + (vector unsigned char *)&pkts[0]->rearm_data); + vec_vsx_st(rearm1, 0, + (vector unsigned char *)&pkts[1]->rearm_data); + vec_vsx_st(rearm2, 0, + (vector unsigned char *)&pkts[2]->rearm_data); + vec_vsx_st(rearm3, 0, + (vector unsigned char *)&pkts[3]->rearm_data); +} + + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). 
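+ *
+ * Invoked from mlx5_rx_burst_vec(), whose calling pattern is (sketch):
+ *   uint64_t err = 0;
+ *   nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
+ *   if (unlikely(err | rxq->err_state))
+ *       nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);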
+ */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const vector unsigned char zero = (vector unsigned char){0}; + const vector unsigned char ones = vec_splat_u8(-1); + const vector unsigned char owner_check = + (vector unsigned char)(vector unsigned long){ + 0x0100000001000000LL, 0x0100000001000000LL}; + const vector unsigned char opcode_check = + (vector unsigned char)(vector unsigned long){ + 0xf0000000f0000000LL, 0xf0000000f0000000LL}; + const vector unsigned char format_check = + (vector unsigned char)(vector unsigned long){ + 0x0c0000000c000000LL, 0x0c0000000c000000LL}; + const vector unsigned char resp_err_check = + (vector unsigned char)(vector unsigned long){ + 0xe0000000e0000000LL, 0xe0000000e0000000LL}; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const vector unsigned char len_shuf_mask = (vector unsigned char){ + 1, 0, 5, 4, + 9, 8, 13, 12, + -1, -1, -1, -1, + -1, -1, -1, -1}; +#endif + /* Mask to shuffle from extracted CQE to mbuf. */ + const vector unsigned char shuf_mask = (vector unsigned char){ + 5, 4, /* bswap16, pkt_len */ + -1, -1, /* zero out 2nd half of pkt_len */ + 5, 4, /* bswap16, data_len */ + 11, 10, /* bswap16, vlan+tci */ + 15, 14, 13, 12, /* bswap32, rss */ + 1, 2, 3, -1}; /* fdir.hi */ + /* Mask to blend from the last Qword to the first DQword. */ + /* Mask to blend from the last Qword to the first DQword. */ + const vector unsigned char blend_mask = (vector unsigned char){ + -1, 0, 0, 0, + 0, 0, 0, 0, + -1, -1, -1, -1, + -1, -1, -1, -1}; + const vector unsigned char crc_adj = + (vector unsigned char)(vector unsigned short){ + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0}; + const vector unsigned char flow_mark_adj = + (vector unsigned char)(vector unsigned int){ + 0, 0, 0, rxq->mark * (-1)}; + const vector unsigned short cqe_sel_mask1 = + (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0}; + const vector unsigned short cqe_sel_mask2 = + (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0}; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. 
*/ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remaining packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remaining CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP]; + vector unsigned char cqe_tmp1, cqe_tmp2; + vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + vector unsigned char op_own, op_own_tmp1, op_own_tmp2; + vector unsigned char opcode, owner_mask, invalid_mask; + vector unsigned char comp_mask; + vector unsigned char mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned char lower_half = { + 0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const vector unsigned char upper_half = { + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + const vector unsigned long shmax = {64, 64}; + vector unsigned char byte_cnt; + vector unsigned short left, right; + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; +#endif + vector unsigned char mbp1, mbp2; + vector unsigned char p = + (vector unsigned char)(vector unsigned short){ + 0, 1, 2, 3, 0, 0, 0, 0}; + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + + /* A.0 do not cross the end of CQ. */ + mask = (vector unsigned char)(vector unsigned long){ + (pkts_n - pos) * sizeof(uint16_t) * 8, 0}; + + { + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; + const vector unsigned long shmax = {64, 64}; + + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + } + + p = (vector unsigned char) + vec_andc((vector unsigned long)p, + (vector unsigned long)mask); + + /* A.1 load cqes. */ + p3 = (unsigned int)((vector unsigned short)p)[3]; + cqes[3] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + p2 = (unsigned int)((vector unsigned short)p)[2]; + cqes[2] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.1 load mbuf pointers. */ + mbp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos]); + mbp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos + 2]); + + /* A.1 load a block having op_own. 
*/ + p1 = (unsigned int)((vector unsigned short)p)[1]; + cqes[1] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + cqes[0] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.2 copy mbuf pointers. */ + *(vector unsigned char *)&pkts[pos] = mbp1; + *(vector unsigned char *)&pkts[pos + 2] = mbp2; + rte_cio_rmb(); + + /* C.1 load remaining CQE data and extract necessary fields. */ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p3].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos + p2].pkt_info; + cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p3].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p2].csum); + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].rsvd3[9], 0LL}; + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, + (vector unsigned short)cqe_sel_mask2); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, + (vector unsigned short)cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask); + pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask); + + /* C.3 adjust CRC length. */ + pkt_mb3 = (vector unsigned char) + ((vector unsigned short)pkt_mb3 - + (vector unsigned short)crc_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned short)pkt_mb2 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb3 = (vector unsigned char) + ((vector unsigned int)pkt_mb3 + + (vector unsigned int)flow_mark_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned int)pkt_mb2 + + (vector unsigned int)flow_mark_adj); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 3]->pkt_len = pkt_mb3; + *(vector unsigned char *) + &pkts[pos + 2]->pkt_len = pkt_mb2; + + /* E.1 extract op_own field. */ + op_own_tmp2 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + + /* C.1 load remaining CQE data and extract necessary fields. 
*/ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p1].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos].pkt_info; + cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p1].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos].csum); + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].rsvd3[9], 0LL}; + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask2); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask); + pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask); + + /* C.3 adjust CRC length. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned short)pkt_mb1 - + (vector unsigned short)crc_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned short)pkt_mb0 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned int)pkt_mb1 + + (vector unsigned int)flow_mark_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned int)pkt_mb0 + + (vector unsigned int)flow_mark_adj); + + /* E.1 extract op_own byte. */ + op_own_tmp1 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + op_own = (vector unsigned char) + vec_mergel((vector unsigned long)op_own_tmp1, + (vector unsigned long)op_own_tmp2); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 1]->pkt_len = pkt_mb1; + *(vector unsigned char *) + &pkts[pos]->pkt_len = pkt_mb0; + + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)owner_check); + if (ownership) + owner_mask = (vector unsigned char) + vec_xor((vector unsigned long)owner_mask, + (vector unsigned long)owner_check); + owner_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)owner_mask, + (vector unsigned int)owner_check); + owner_mask = (vector unsigned char) + vec_packs((vector unsigned int)owner_mask, + (vector unsigned int)zero); + + /* E.3 get mask for invalidated CQEs. */ + opcode = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)opcode_check); + invalid_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)opcode_check, + (vector unsigned int)opcode); + invalid_mask = (vector unsigned char) + vec_packs((vector unsigned int)invalid_mask, + (vector unsigned int)zero); + + /* E.4 mask out beyond boundary. */ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.5 merge invalid_mask with invalid owner. 
*/ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)owner_mask); + + /* F.1 find compressed CQE format. */ + comp_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)format_check); + comp_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)comp_mask, + (vector unsigned int)format_check); + comp_mask = (vector unsigned char) + vec_packs((vector unsigned int)comp_mask, + (vector unsigned int)zero); + + /* F.2 mask out invalid entries. */ + comp_mask = (vector unsigned char) + vec_andc((vector unsigned long)comp_mask, + (vector unsigned long)invalid_mask); + comp_idx = ((vector unsigned long)comp_mask)[0]; + + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP; + + /* E.6 mask out entries after the compressed CQE. */ + mask = (vector unsigned char)(vector unsigned long){ + (comp_idx * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.7 count non-compressed valid CQEs. */ + n = ((vector unsigned long)invalid_mask)[0]; + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + + /* D.2 get the final invalid mask. */ + mask = (vector unsigned char)(vector unsigned long){ + (n * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* D.3 check error in opcode. */ + opcode = (vector unsigned char) + vec_cmpeq((vector unsigned int)resp_err_check, + (vector unsigned int)opcode); + opcode = (vector unsigned char) + vec_packs((vector unsigned int)opcode, + (vector unsigned int)zero); + opcode = (vector unsigned char) + vec_andc((vector unsigned long)opcode, + (vector unsigned long)invalid_mask); + + /* D.4 mark if any error is set */ + *err |= ((vector unsigned long)opcode)[0]; + + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + uint64_t flag = rxq->flow_meta_mask; + int32_t offs = rxq->flow_meta_offset; + uint32_t metadata; + + /* This code is subject for futher optimization. */ + metadata = cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + metadata; + pkts[pos]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + metadata; + pkts[pos + 1]->ol_flags |= metadata ? 
flag : 0ULL; + metadata = cq[pos + 2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + metadata; + pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 3].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + metadata; + pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = vec_perm(op_own, zero, len_shuf_mask); + byte_cnt = (vector unsigned char) + vec_andc((vector unsigned long)byte_cnt, + (vector unsigned long)invalid_mask); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + rcvd_byte += ((vector unsigned long)byte_cnt)[0]; +#endif + + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) + return rcvd_pkt; + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = + rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + return rcvd_pkt; +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h new file mode 100644 index 000000000..ecafbf800 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -0,0 +1,780 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_ +#define RTE_PMD_MLX5_RXTX_VEC_NEON_H_ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <arm_neon.h> + +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include <mlx5_prm.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_rxtx_vec.h" +#include "mlx5_autoconf.h" + +#pragma GCC diagnostic ignored "-Wcast-qual" + +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. 
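+ *
+ * Scalar equivalent of the copy (illustrative sketch):
+ *   struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+ *   for (pos = 0; pos < n; ++pos)
+ *       pkts[pos] = elts[pos];
+ * The vector version below moves two pointers per 128-bit load/store and
+ * copies an odd trailing pointer separately.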
+ */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + uint64x2_t mbp; + + mbp = vld1q_u64((void *)&elts[pos]); + vst1q_u64((void *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info; + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const uint8x16_t mcqe_shuf_m1 = { + -1, -1, -1, -1, /* skip packet_type */ + 7, 6, -1, -1, /* pkt_len, bswap16 */ + 7, 6, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 3, 2, 1, 0 /* hash.rss, bswap32 */ + }; + const uint8x16_t mcqe_shuf_m2 = { + -1, -1, -1, -1, /* skip packet_type */ + 15, 14, -1, -1, /* pkt_len, bswap16 */ + 15, 14, /* data_len, bswap16 */ + -1, -1, /* skip vlan_tci */ + 11, 10, 9, 8 /* hash.rss, bswap32 */ + }; + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const uint64x2_t rearm = + vld1q_u64((void *)&t_pkt->rearm_data); + const uint32x4_t rxdf_mask = { + 0xffffffff, /* packet_type */ + 0, /* skip pkt_len */ + 0xffff0000, /* vlan_tci, skip data_len */ + 0, /* skip hash.rss */ + }; + const uint8x16_t rxdf = + vandq_u8(vld1q_u8((void *)&t_pkt->rx_descriptor_fields1), + vreinterpretq_u8_u32(rxdf_mask)); + const uint16x8_t crc_adj = { + 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + 0, 0 + }; + const uint32_t flow_tag = t_pkt->hash.fdir.hi; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; +#endif + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const uint8x8_t len_shuf_m = { + 7, 6, /* 1st mCQE */ + 15, 14, /* 2nd mCQE */ + 23, 22, /* 3rd mCQE */ + 31, 30 /* 4th mCQE */ + }; + + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + uint8_t *p = (void *)&mcq[pos % 8]; + uint8_t *e0 = (void *)&elts[pos]->rearm_data; + uint8_t *e1 = (void *)&elts[pos + 1]->rearm_data; + uint8_t *e2 = (void *)&elts[pos + 2]->rearm_data; + uint8_t *e3 = (void *)&elts[pos + 3]->rearm_data; + uint16x4_t byte_cnt; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint16x4_t invalid_mask = + vcreate_u16(mcqe_n - pos < MLX5_VPMD_DESCS_PER_LOOP ? 
+ -1UL << ((mcqe_n - pos) * + sizeof(uint16_t) * 8) : 0); +#endif + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + __asm__ volatile ( + /* A.1 load mCQEs into a 128bit register. */ + "ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t" + /* B.1 store rearm data to mbuf. */ + "st1 {%[rearm].2d}, [%[e0]] \n\t" + "add %[e0], %[e0], #16 \n\t" + "st1 {%[rearm].2d}, [%[e1]] \n\t" + "add %[e1], %[e1], #16 \n\t" + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + "tbl v18.16b, {v16.16b}, %[mcqe_shuf_m1].16b \n\t" + "tbl v19.16b, {v16.16b}, %[mcqe_shuf_m2].16b \n\t" + "sub v18.8h, v18.8h, %[crc_adj].8h \n\t" + "sub v19.8h, v19.8h, %[crc_adj].8h \n\t" + "orr v18.16b, v18.16b, %[rxdf].16b \n\t" + "orr v19.16b, v19.16b, %[rxdf].16b \n\t" + /* D.1 store rx_descriptor_fields1. */ + "st1 {v18.2d}, [%[e0]] \n\t" + "st1 {v19.2d}, [%[e1]] \n\t" + /* B.1 store rearm data to mbuf. */ + "st1 {%[rearm].2d}, [%[e2]] \n\t" + "add %[e2], %[e2], #16 \n\t" + "st1 {%[rearm].2d}, [%[e3]] \n\t" + "add %[e3], %[e3], #16 \n\t" + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + "tbl v18.16b, {v17.16b}, %[mcqe_shuf_m1].16b \n\t" + "tbl v19.16b, {v17.16b}, %[mcqe_shuf_m2].16b \n\t" + "sub v18.8h, v18.8h, %[crc_adj].8h \n\t" + "sub v19.8h, v19.8h, %[crc_adj].8h \n\t" + "orr v18.16b, v18.16b, %[rxdf].16b \n\t" + "orr v19.16b, v19.16b, %[rxdf].16b \n\t" + /* D.1 store rx_descriptor_fields1. */ + "st1 {v18.2d}, [%[e2]] \n\t" + "st1 {v19.2d}, [%[e3]] \n\t" +#ifdef MLX5_PMD_SOFT_COUNTERS + "tbl %[byte_cnt].8b, {v16.16b - v17.16b}, %[len_shuf_m].8b \n\t" +#endif + :[byte_cnt]"=&w"(byte_cnt) + :[mcq]"r"(p), + [rxdf]"w"(rxdf), + [rearm]"w"(rearm), + [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0), + [mcqe_shuf_m1]"w"(mcqe_shuf_m1), + [mcqe_shuf_m2]"w"(mcqe_shuf_m2), + [crc_adj]"w"(crc_adj), + [len_shuf_m]"w"(len_shuf_m) + :"memory", "v16", "v17", "v18", "v19"); +#ifdef MLX5_PMD_SOFT_COUNTERS + byte_cnt = vbic_u16(byte_cnt, invalid_mask); + rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0); +#endif + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)&(cq + pos)->pkt_info; + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. 
+ * @param ptype_info + * Array of four 4bytes packet type info extracted from the original + * completion descriptor. + * @param flow_tag + * Array of four 4bytes flow ID extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. + */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, + uint32x4_t ptype_info, uint32x4_t flow_tag, + uint16x4_t op_err, struct rte_mbuf **pkts) +{ + uint16x4_t ptype; + uint32x4_t pinfo, cv_flags; + uint32x4_t ol_flags = + vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP); + const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 }; + const uint8x16_t cv_flag_sel = { + 0, + (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + const uint32x4_t cv_mask = + vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED); + const uint64x2_t mbuf_init = vld1q_u64 + ((const uint64_t *)&rxq->mbuf_initializer); + uint64x2_t rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + if (rxq->mark) { + const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT); + const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR); + uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID); + uint32x4_t invalid_mask; + + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = vceqzq_u32(flow_tag); + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_flags, invalid_mask)); + /* Mask out invalid entries. */ + fdir_id_flags = vbicq_u32(fdir_id_flags, invalid_mask); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_id_flags, + vceqq_u32(flow_tag, ft_def))); + } + /* + * ptype_info has the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = vshrn_n_u32(ptype_info, 10); + /* Errored packets will have RTE_PTYPE_ALL_MASK. */ + ptype = vorr_u16(ptype, op_err); + pt_idx0 = vget_lane_u8(vreinterpret_u8_u16(ptype), 6); + pt_idx1 = vget_lane_u8(vreinterpret_u8_u16(ptype), 4); + pt_idx2 = vget_lane_u8(vreinterpret_u8_u16(ptype), 2); + pt_idx3 = vget_lane_u8(vreinterpret_u8_u16(ptype), 0); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + /* Fill flags for checksum and VLAN. */ + pinfo = vandq_u32(ptype_info, ptype_ol_mask); + pinfo = vreinterpretq_u32_u8( + vqtbl1q_u8(cv_flag_sel, vreinterpretq_u8_u32(pinfo))); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = vshlq_n_u32(pinfo, 9); + cv_flags = vorrq_u32(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = vshrq_n_u32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = vandq_u32(cv_flags, cv_mask); + /* Merge to ol_flags. 
*/ + ol_flags = vorrq_u32(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags, and store. */ + rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 3), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 2), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 1), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 0), + vreinterpretq_u32_u64(mbuf_init), 2)); + + vst1q_u64((void *)&pkts[0]->rearm_data, rearm0); + vst1q_u64((void *)&pkts[1]->rearm_data, rearm1); + vst1q_u64((void *)&pkts[2]->rearm_data, rearm2); + vst1q_u64((void *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1))); + const uint16x4_t owner_check = vcreate_u16(0x0001000100010001); + const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0); + const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c); + const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; +#endif + /* Mask to generate 16B length vector. */ + const uint8x8_t len_shuf_m = { + 52, 53, /* 4th CQE */ + 36, 37, /* 3rd CQE */ + 20, 21, /* 2nd CQE */ + 4, 5 /* 1st CQE */ + }; + /* Mask to extract 16B data from a 64B CQE. */ + const uint8x16_t cqe_shuf_m = { + 28, 29, /* hdr_type_etc */ + 0, /* pkt_info */ + -1, /* null */ + 47, 46, /* byte_cnt, bswap16 */ + 31, 30, /* vlan_info, bswap16 */ + 15, 14, 13, 12, /* rx_hash_res, bswap32 */ + 57, 58, 59, /* flow_tag */ + 63 /* op_own */ + }; + /* Mask to generate 16B data for mbuf. */ + const uint8x16_t mb_shuf_m = { + 4, 5, -1, -1, /* pkt_len */ + 4, 5, /* data_len */ + 6, 7, /* vlan_tci */ + 8, 9, 10, 11, /* hash.rss */ + 12, 13, 14, -1 /* hash.fdir.hi */ + }; + /* Mask to generate 16B owner vector. */ + const uint8x8_t owner_shuf_m = { + 63, -1, /* 4th CQE */ + 47, -1, /* 3rd CQE */ + 31, -1, /* 2nd CQE */ + 15, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having packet_type/ol_flags. */ + const uint8x16_t ptype_shuf_m = { + 48, 49, 50, -1, /* 4th CQE */ + 32, 33, 34, -1, /* 3rd CQE */ + 16, 17, 18, -1, /* 2nd CQE */ + 0, 1, 2, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having flow tags. 
*/ + const uint8x16_t ftag_shuf_m = { + 60, 61, 62, -1, /* 4th CQE */ + 44, 45, 46, -1, /* 3rd CQE */ + 28, 29, 30, -1, /* 2nd CQE */ + 12, 13, 14, -1 /* 1st CQE */ + }; + const uint16x8_t crc_adj = { + 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0 + }; + const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) }; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch_non_temporal(cq); + rte_prefetch_non_temporal(cq + 1); + rte_prefetch_non_temporal(cq + 2); + rte_prefetch_non_temporal(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + pkts += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * Note that vectors have reverse order - {v3, v2, v1, v0}, because + * there's no instruction to count trailing zeros. __builtin_clzl() is + * used instead. + * + * A. copy 4 mbuf pointers from elts ring to returing pkts. + * B. load 64B CQE and extract necessary fields + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint16_t hdr_type_etc; + * uint8_t pkt_info; + * uint8_t rsvd; + * uint16_t byte_cnt; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * uint8_t flow_tag[3]; + * uint8_t op_own; + * } c; + * C. fill in mbuf. + * D. get valid CQEs. + * E. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + uint16x4_t op_own; + uint16x4_t opcode, owner_mask, invalid_mask; + uint16x4_t comp_mask; + uint16x4_t mask; + uint16x4_t byte_cnt; + uint32x4_t ptype_info, flow_tag; + register uint64x2_t c0, c1, c2, c3; + uint8_t *p0, *p1, *p2, *p3; + uint8_t *e0 = (void *)&elts[pos]->pkt_len; + uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len; + uint8_t *e2 = (void *)&elts[pos + 2]->pkt_len; + uint8_t *e3 = (void *)&elts[pos + 3]->pkt_len; + void *elts_p = (void *)&elts[pos]; + void *pkts_p = (void *)&pkts[pos]; + + /* A.0 do not cross the end of CQ. */ + mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> ((pkts_n - pos) * + sizeof(uint16_t) * 8) : 0); + p0 = (void *)&cq[pos].pkt_info; + p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe); + p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe); + p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe); + /* B.0 (CQE 3) load a block having op_own. */ + c3 = vld1q_u64((uint64_t *)(p3 + 48)); + /* B.0 (CQE 2) load a block having op_own. */ + c2 = vld1q_u64((uint64_t *)(p2 + 48)); + /* B.0 (CQE 1) load a block having op_own. */ + c1 = vld1q_u64((uint64_t *)(p1 + 48)); + /* B.0 (CQE 0) load a block having op_own. */ + c0 = vld1q_u64((uint64_t *)(p0 + 48)); + /* Synchronize for loading the rest of blocks. */ + rte_cio_rmb(); + /* Prefetch next 4 CQEs. 
*/ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP; + rte_prefetch_non_temporal(&cq[next]); + rte_prefetch_non_temporal(&cq[next + 1]); + rte_prefetch_non_temporal(&cq[next + 2]); + rte_prefetch_non_temporal(&cq[next + 3]); + } + __asm__ volatile ( + /* B.1 (CQE 3) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t" + /* B.2 (CQE 3) move the block having op_own. */ + "mov v19.16b, %[c3].16b \n\t" + /* B.3 (CQE 3) extract 16B fields. */ + "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 2) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t" + /* B.4 (CQE 3) adjust CRC length. */ + "sub v23.8h, v23.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 3) generate final structure for mbuf. */ + "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 2) move the block having op_own. */ + "mov v19.16b, %[c2].16b \n\t" + /* B.3 (CQE 2) extract 16B fields. */ + "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 1) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t" + /* B.4 (CQE 2) adjust CRC length. */ + "sub v22.8h, v22.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 2) generate final structure for mbuf. */ + "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 1) move the block having op_own. */ + "mov v19.16b, %[c1].16b \n\t" + /* B.3 (CQE 1) extract 16B fields. */ + "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 0) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t" + /* B.4 (CQE 1) adjust CRC length. */ + "sub v21.8h, v21.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 1) generate final structure for mbuf. */ + "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 0) move the block having op_own. */ + "mov v19.16b, %[c0].16b \n\t" + /* A.1 load mbuf pointers. */ + "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t" + /* B.3 (CQE 0) extract 16B fields. */ + "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.4 (CQE 0) adjust CRC length. */ + "sub v20.8h, v20.8h, %[crc_adj].8h \n\t" + /* D.1 extract op_own byte. */ + "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t" + /* C.2 (CQE 3) adjust flow mark. */ + "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v15.2d}, [%[e3]] \n\t" + /* C.2 (CQE 2) adjust flow mark. */ + "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v14.2d}, [%[e2]] \n\t" + /* C.1 (CQE 0) generate final structure for mbuf. */ + "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t" + /* C.2 (CQE 1) adjust flow mark. */ + "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v13.2d}, [%[e1]] \n\t" +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Extract byte_cnt. */ + "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t" +#endif + /* Extract ptype_info. */ + "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t" + /* Extract flow_tag. */ + "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t" + /* A.2 copy mbuf pointers. */ + "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t" + /* C.2 (CQE 0) adjust flow mark. */ + "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. 
*/ + "st1 {v12.2d}, [%[e0]] \n\t" + :[op_own]"=&w"(op_own), + [byte_cnt]"=&w"(byte_cnt), + [ptype_info]"=&w"(ptype_info), + [flow_tag]"=&w"(flow_tag) + :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), + [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0), + [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0), + [elts_p]"r"(elts_p), + [pkts_p]"r"(pkts_p), + [cqe_shuf_m]"w"(cqe_shuf_m), + [mb_shuf_m]"w"(mb_shuf_m), + [owner_shuf_m]"w"(owner_shuf_m), + [len_shuf_m]"w"(len_shuf_m), + [ptype_shuf_m]"w"(ptype_shuf_m), + [ftag_shuf_m]"w"(ftag_shuf_m), + [crc_adj]"w"(crc_adj), + [flow_mark_adj]"w"(flow_mark_adj) + :"memory", + "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", + "v24", "v25"); + /* D.2 flip owner bit to mark CQEs from last round. */ + owner_mask = vand_u16(op_own, owner_check); + owner_mask = vceq_u16(owner_mask, ownership); + /* D.3 get mask for invalidated CQEs. */ + opcode = vand_u16(op_own, opcode_check); + invalid_mask = vceq_u16(opcode_check, opcode); + /* E.1 find compressed CQE format. */ + comp_mask = vand_u16(op_own, format_check); + comp_mask = vceq_u16(comp_mask, format_check); + /* D.4 mask out beyond boundary. */ + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.5 merge invalid_mask with invalid owner. */ + invalid_mask = vorr_u16(invalid_mask, owner_mask); + /* E.2 mask out invalid entries. */ + comp_mask = vbic_u16(comp_mask, invalid_mask); + /* E.3 get the first compressed CQE. */ + comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + comp_mask), 0)) / + (sizeof(uint16_t) * 8); + /* D.6 mask out entries after the compressed CQE. */ + mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> (comp_idx * sizeof(uint16_t) * 8) : + 0); + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.7 count non-compressed valid CQEs. */ + n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + invalid_mask), 0)) / (sizeof(uint16_t) * 8); + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> (n * sizeof(uint16_t) * 8) : 0); + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = vceq_u16(resp_err_check, opcode); + opcode = vbic_u16(opcode, invalid_mask); + /* D.4 mark if any error is set */ + *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0); + /* C.4 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag, + opcode, &elts[pos]); + if (rxq->hw_timestamp) { + elts[pos]->timestamp = + rte_be_to_cpu_64( + container_of(p0, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 1]->timestamp = + rte_be_to_cpu_64( + container_of(p1, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 2]->timestamp = + rte_be_to_cpu_64( + container_of(p2, struct mlx5_cqe, + pkt_info)->timestamp); + elts[pos + 3]->timestamp = + rte_be_to_cpu_64( + container_of(p3, struct mlx5_cqe, + pkt_info)->timestamp); + } + if (!!rxq->flow_meta_mask) { + /* This code is subject for futher optimization. 
*/
+			int32_t offs = rxq->flow_meta_offset;
+
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p0, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+				container_of(p1, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+				container_of(p2, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+				container_of(p3, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+				elts[pos]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+				elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+				elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+				elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+		return rcvd_pkt;
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq->cq_ci += nocmp_n;
+	rxq->rq_pi += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+			rxq->rq_pi += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+	rte_cio_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	return rcvd_pkt;
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
new file mode 100644
index 000000000..6847ae782
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -0,0 +1,731 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <smmintrin.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include <mlx5_prm.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
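+ *
+ * Note: the mbuf pointers are copied two at a time with 128-bit SSE
+ * loads/stores (_mm_loadu_si128/_mm_storeu_si128); a trailing odd pointer
+ * is copied as a scalar.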
+ */ +static inline void +rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + __m128i mbp; + + mbp = _mm_loadu_si128((__m128i *)&elts[pos]); + _mm_storeu_si128((__m128i *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + * + * @return + * Number of mini-CQEs successfully decompressed. + */ +static inline uint16_t +rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1); + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const __m128i shuf_mask1 = + _mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 6, 7, /* data_len, bswap16 */ + -1, -1, 6, 7, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + const __m128i shuf_mask2 = + _mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 14, 15, /* data_len, bswap16 */ + -1, -1, 14, 15, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * RTE_ETHER_CRC_LEN); + const __m128i rearm = + _mm_loadu_si128((__m128i *)&t_pkt->rearm_data); + const __m128i rxdf = + _mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, 0); + const uint32_t flow_tag = t_pkt->hash.fdir.hi; +#ifdef MLX5_PMD_SOFT_COUNTERS + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 14, 15, 6, 7, + 10, 11, 2, 3); +#endif + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + __m128i mcqe1, mcqe2; + __m128i rxdf1, rxdf2; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt, invalid_mask; +#endif + + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); + + /* A.1 load mCQEs into a 128bit register. */ + mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]); + mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. 
*/ + rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. */ + _mm_storeu_si128((__m128i *) + &elts[pos]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 1]->rx_descriptor_fields1, + rxdf2); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. */ + _mm_storeu_si128((__m128i *) + &elts[pos + 2]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 3]->rx_descriptor_fields1, + rxdf2); +#ifdef MLX5_PMD_SOFT_COUNTERS + invalid_mask = _mm_set_epi64x(0, + (mcqe_n - pos) * + sizeof(uint16_t) * 8); + invalid_mask = _mm_sll_epi64(ones, invalid_mask); + mcqe1 = _mm_srli_si128(mcqe1, 4); + byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc); + byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } + } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)(cq + pos); + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + rxq->cq_ci += mcqe_n; + return mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. + * @param cqes[4] + * Array of four 16bytes completions extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. 
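+ *
+ * The packet type is looked up in mlx5_ptype_table from the CQE type bits,
+ * and ol_flags (RSS hash, FDIR mark, checksum and VLAN indications) are
+ * built from the CQE fields, merged with rxq->mbuf_initializer and written
+ * to each mbuf's rearm_data.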
+ */ +static inline void +rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], + __m128i op_err, struct rte_mbuf **pkts) +{ + __m128i pinfo0, pinfo1; + __m128i pinfo, ptype; + __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH | + rxq->hw_timestamp * PKT_RX_TIMESTAMP); + __m128i cv_flags; + const __m128i zero = _mm_setzero_si128(); + const __m128i ptype_mask = + _mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06); + const __m128i ptype_ol_mask = + _mm_set_epi32(0x106, 0x106, 0x106, 0x106); + const __m128i pinfo_mask = + _mm_set_epi32(0x3, 0x3, 0x3, 0x3); + const __m128i cv_flag_sel = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | + PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED), + 0); + const __m128i cv_mask = + _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED); + const __m128i mbuf_init = + _mm_load_si128((__m128i *)&rxq->mbuf_initializer); + __m128i rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; + + /* Extract pkt_info field. */ + pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1); + /* Extract hdr_type_etc field. */ + pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]); + ptype = _mm_unpacklo_epi64(pinfo0, pinfo1); + if (rxq->mark) { + const __m128i pinfo_ft_mask = + _mm_set_epi32(0xffffff00, 0xffffff00, + 0xffffff00, 0xffffff00); + const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR); + __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID); + __m128i flow_tag, invalid_mask; + + flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask); + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = _mm_cmpeq_epi32(flow_tag, zero); + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128(invalid_mask, + fdir_flags)); + /* Mask out invalid entries. */ + fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128( + _mm_cmpeq_epi32(flow_tag, + pinfo_ft_mask), + fdir_id_flags)); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = _mm_and_si128(ptype, ptype_mask); + pinfo = _mm_and_si128(pinfo, pinfo_mask); + pinfo = _mm_slli_epi32(pinfo, 16); + /* Make pinfo has merged fields for ol_flags calculation. */ + pinfo = _mm_or_si128(ptype, pinfo); + ptype = _mm_srli_epi32(pinfo, 10); + ptype = _mm_packs_epi32(ptype, zero); + /* Errored packets will have RTE_PTYPE_ALL_MASK. 
*/ + op_err = _mm_srli_epi16(op_err, 8); + ptype = _mm_or_si128(ptype, op_err); + pt_idx0 = _mm_extract_epi8(ptype, 0); + pt_idx1 = _mm_extract_epi8(ptype, 2); + pt_idx2 = _mm_extract_epi8(ptype, 4); + pt_idx3 = _mm_extract_epi8(ptype, 6); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; + /* Fill flags for checksum and VLAN. */ + pinfo = _mm_and_si128(pinfo, ptype_ol_mask); + pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = _mm_slli_epi32(pinfo, 9); + cv_flags = _mm_or_si128(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = _mm_srli_epi32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = _mm_and_si128(cv_flags, cv_mask); + /* Merge to ol_flags. */ + ol_flags = _mm_or_si128(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags. */ + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30); + /* Write 8B rearm_data and 8B ol_flags. */ + _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0); + _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1); + _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2); + _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint64_t *err) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const __m128i owner_check = + _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); + const __m128i opcode_check = + _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); + const __m128i format_check = + _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); + const __m128i resp_err_check = + _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 12, 13, 8, 9, + 4, 5, 0, 1); +#endif + /* Mask to shuffle from extracted CQE to mbuf. 
*/ + const __m128i shuf_mask = + _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */ + 12, 13, 14, 15, /* rss, bswap32 */ + 10, 11, /* vlan_tci, bswap16 */ + 4, 5, /* data_len, bswap16 */ + -1, -1, /* zero out 2nd half of pkt_len */ + 4, 5 /* pkt_len, bswap16 */); + /* Mask to blend from the last Qword to the first DQword. */ + const __m128i blend_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, -1); + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, + rxq->crc_present * RTE_ETHER_CRC_LEN); + const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remained CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP]; + __m128i cqe_tmp1, cqe_tmp2; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + __m128i op_own, op_own_tmp1, op_own_tmp2; + __m128i opcode, owner_mask, invalid_mask; + __m128i comp_mask; + __m128i mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt; +#endif + __m128i mbp1, mbp2; + __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + /* A.0 do not cross the end of CQ. */ + mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + p = _mm_andnot_si128(mask, p); + /* A.1 load cqes. 
*/ + p3 = _mm_extract_epi16(p, 3); + cqes[3] = _mm_loadl_epi64((__m128i *) + &cq[pos + p3].sop_drop_qpn); + rte_compiler_barrier(); + p2 = _mm_extract_epi16(p, 2); + cqes[2] = _mm_loadl_epi64((__m128i *) + &cq[pos + p2].sop_drop_qpn); + rte_compiler_barrier(); + /* B.1 load mbuf pointers. */ + mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]); + mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]); + /* A.1 load a block having op_own. */ + p1 = _mm_extract_epi16(p, 1); + cqes[1] = _mm_loadl_epi64((__m128i *) + &cq[pos + p1].sop_drop_qpn); + rte_compiler_barrier(); + cqes[0] = _mm_loadl_epi64((__m128i *) + &cq[pos].sop_drop_qpn); + /* B.2 copy mbuf pointers. */ + _mm_storeu_si128((__m128i *)&pkts[pos], mbp1); + _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2); + rte_cio_rmb(); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]); + cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask); + pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj); + pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj); + pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3); + _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2); + /* E.1 extract op_own field. */ + op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]); + cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask); + pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj); + pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj); + pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj); + /* E.1 extract op_own byte. 
*/ + op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1); + _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0); + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = _mm_and_si128(op_own, owner_check); + if (ownership) + owner_mask = _mm_xor_si128(owner_mask, owner_check); + owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check); + owner_mask = _mm_packs_epi32(owner_mask, zero); + /* E.3 get mask for invalidated CQEs. */ + opcode = _mm_and_si128(op_own, opcode_check); + invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode); + invalid_mask = _mm_packs_epi32(invalid_mask, zero); + /* E.4 mask out beyond boundary. */ + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.5 merge invalid_mask with invalid owner. */ + invalid_mask = _mm_or_si128(invalid_mask, owner_mask); + /* F.1 find compressed CQE format. */ + comp_mask = _mm_and_si128(op_own, format_check); + comp_mask = _mm_cmpeq_epi32(comp_mask, format_check); + comp_mask = _mm_packs_epi32(comp_mask, zero); + /* F.2 mask out invalid entries. */ + comp_mask = _mm_andnot_si128(invalid_mask, comp_mask); + comp_idx = _mm_cvtsi128_si64(comp_mask); + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? + __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + /* E.6 mask out entries after the compressed CQE. */ + mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.7 count non-compressed valid CQEs. */ + n = _mm_cvtsi128_si64(invalid_mask); + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = _mm_cmpeq_epi32(resp_err_check, opcode); + opcode = _mm_packs_epi32(opcode, zero); + opcode = _mm_andnot_si128(invalid_mask, opcode); + /* D.4 mark if any error is set */ + *err |= _mm_cvtsi128_si64(opcode); + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + /* This code is subject for futher optimization. 
*/ + int32_t offs = rxq->flow_meta_offset; + + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + cq[pos + p1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + cq[pos + p2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + cq[pos + p3].flow_table_metadata; + if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) + pkts[pos]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) + pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) + pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) + pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) + return rcvd_pkt; + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], + &elts[nocmp_n]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + return rcvd_pkt; +} + +#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c new file mode 100644 index 000000000..a79896cb3 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_socket.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "rte_eal.h" +#include "mlx5_utils.h" +#include "mlx5.h" + +/* PMD socket service for tools. */ + +int server_socket; /* Unix socket for primary process. */ +struct rte_intr_handle server_intr_handle; /* Interrupt handler. */ + +static void +mlx5_pmd_make_path(struct sockaddr_un *addr, int pid) +{ + snprintf(addr->sun_path, sizeof(addr->sun_path), "/var/tmp/dpdk_%s_%d", + MLX5_DRIVER_NAME, pid); +} + +/** + * Handle server pmd socket interrupts. 
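+ *
+ * Accepts a connection on the PMD socket, receives a file descriptor as
+ * SCM_RIGHTS ancillary data together with a port identifier, dumps the
+ * flows of that port to the received descriptor through
+ * mlx5_flow_dev_dump(), and sends the result code back to the client.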
+ */ +static void +mlx5_pmd_socket_handle(void *cb __rte_unused) +{ + int conn_sock; + int ret = -1; + struct cmsghdr *cmsg = NULL; + int data; + char buf[CMSG_SPACE(sizeof(int))] = { 0 }; + struct iovec io = { + .iov_base = &data, + .iov_len = sizeof(data), + }; + struct msghdr msg = { + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + }; + uint16_t port_id; + int fd; + FILE *file = NULL; + struct rte_eth_dev *dev; + + /* Accept the connection from the client. */ + conn_sock = accept(server_socket, NULL, NULL); + if (conn_sock < 0) { + DRV_LOG(WARNING, "connection failed: %s", strerror(errno)); + return; + } + ret = recvmsg(conn_sock, &msg, MSG_WAITALL); + if (ret < 0) { + DRV_LOG(WARNING, "wrong message received: %s", + strerror(errno)); + goto error; + } + /* Receive file descriptor. */ + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS || + cmsg->cmsg_len < sizeof(int)) { + DRV_LOG(WARNING, "invalid file descriptor message"); + goto error; + } + memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd)); + file = fdopen(fd, "w"); + if (!file) { + DRV_LOG(WARNING, "Failed to open file"); + goto error; + } + /* Receive port number. */ + if (msg.msg_iovlen != 1 || msg.msg_iov->iov_len < sizeof(uint16_t)) { + DRV_LOG(WARNING, "wrong port number message"); + goto error; + } + memcpy(&port_id, msg.msg_iov->iov_base, sizeof(port_id)); + if (!rte_eth_dev_is_valid_port(port_id)) { + DRV_LOG(WARNING, "Invalid port %u", port_id); + goto error; + } + /* Dump flow. */ + dev = &rte_eth_devices[port_id]; + ret = mlx5_flow_dev_dump(dev, file, NULL); + /* Set-up the ancillary data and reply. */ + msg.msg_controllen = 0; + msg.msg_control = NULL; + msg.msg_iovlen = 1; + msg.msg_iov = &io; + data = -ret; + io.iov_len = sizeof(data); + io.iov_base = &data; + do { + ret = sendmsg(conn_sock, &msg, 0); + } while (ret < 0 && errno == EINTR); + if (ret < 0) + DRV_LOG(WARNING, "failed to send response %s", + strerror(errno)); +error: + if (conn_sock > 0) + close(conn_sock); + if (file) + fclose(file); +} + +/** + * Install interrupt handler. + * + * @param dev + * Pointer to Ethernet device. + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +mlx5_pmd_interrupt_handler_install(void) +{ + MLX5_ASSERT(server_socket); + server_intr_handle.fd = server_socket; + server_intr_handle.type = RTE_INTR_HANDLE_EXT; + return rte_intr_callback_register(&server_intr_handle, + mlx5_pmd_socket_handle, NULL); +} + +/** + * Uninstall interrupt handler. + */ +static void +mlx5_pmd_interrupt_handler_uninstall(void) +{ + if (server_socket) { + mlx5_intr_callback_unregister(&server_intr_handle, + mlx5_pmd_socket_handle, + NULL); + } + server_intr_handle.fd = 0; + server_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; +} + +/** + * Initialise the socket to communicate with the secondary process + * + * @param[in] dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, a negative value otherwise. + */ +int +mlx5_pmd_socket_init(void) +{ + struct sockaddr_un sun = { + .sun_family = AF_UNIX, + }; + int ret = -1; + int flags; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (server_socket) + return 0; + /* + * Initialize the socket to communicate with the secondary + * process. 
+ */ + ret = socket(AF_UNIX, SOCK_STREAM, 0); + if (ret < 0) { + DRV_LOG(WARNING, "Failed to open mlx5 socket: %s", + strerror(errno)); + goto error; + } + server_socket = ret; + flags = fcntl(server_socket, F_GETFL, 0); + if (flags == -1) + goto error; + ret = fcntl(server_socket, F_SETFL, flags | O_NONBLOCK); + if (ret < 0) + goto error; + mlx5_pmd_make_path(&sun, getpid()); + remove(sun.sun_path); + ret = bind(server_socket, (const struct sockaddr *)&sun, sizeof(sun)); + if (ret < 0) { + DRV_LOG(WARNING, + "cannot bind mlx5 socket: %s", strerror(errno)); + goto close; + } + ret = listen(server_socket, 0); + if (ret < 0) { + DRV_LOG(WARNING, "cannot listen on mlx5 socket: %s", + strerror(errno)); + goto close; + } + if (mlx5_pmd_interrupt_handler_install()) { + DRV_LOG(WARNING, "cannot register interrupt handler for mlx5 socket: %s", + strerror(errno)); + goto close; + } + return 0; +close: + remove(sun.sun_path); +error: + claim_zero(close(server_socket)); + server_socket = 0; + DRV_LOG(ERR, "Cannot initialize socket: %s", strerror(errno)); + return -errno; +} + +/** + * Un-Initialize the pmd socket + */ +RTE_FINI(mlx5_pmd_socket_uninit) +{ + if (!server_socket) + return; + mlx5_pmd_interrupt_handler_uninstall(); + claim_zero(close(server_socket)); + server_socket = 0; + MKSTR(path, "/var/tmp/dpdk_%s_%d", MLX5_DRIVER_NAME, getpid()); + claim_zero(remove(path)); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c new file mode 100644 index 000000000..b4ca6922a --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_stats.c @@ -0,0 +1,589 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <fcntl.h> +#include <inttypes.h> +#include <linux/sockios.h> +#include <linux/ethtool.h> +#include <stdint.h> +#include <stdio.h> +#include <unistd.h> + +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_malloc.h> + +#include <mlx5_common.h> + +#include "mlx5_defs.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" + + +static const struct mlx5_counter_ctrl mlx5_counters_init[] = { + { + .dpdk_name = "rx_port_unicast_bytes", + .ctr_name = "rx_vport_unicast_bytes", + }, + { + .dpdk_name = "rx_port_multicast_bytes", + .ctr_name = "rx_vport_multicast_bytes", + }, + { + .dpdk_name = "rx_port_broadcast_bytes", + .ctr_name = "rx_vport_broadcast_bytes", + }, + { + .dpdk_name = "rx_port_unicast_packets", + .ctr_name = "rx_vport_unicast_packets", + }, + { + .dpdk_name = "rx_port_multicast_packets", + .ctr_name = "rx_vport_multicast_packets", + }, + { + .dpdk_name = "rx_port_broadcast_packets", + .ctr_name = "rx_vport_broadcast_packets", + }, + { + .dpdk_name = "tx_port_unicast_bytes", + .ctr_name = "tx_vport_unicast_bytes", + }, + { + .dpdk_name = "tx_port_multicast_bytes", + .ctr_name = "tx_vport_multicast_bytes", + }, + { + .dpdk_name = "tx_port_broadcast_bytes", + .ctr_name = "tx_vport_broadcast_bytes", + }, + { + .dpdk_name = "tx_port_unicast_packets", + .ctr_name = "tx_vport_unicast_packets", + }, + { + .dpdk_name = "tx_port_multicast_packets", + .ctr_name = "tx_vport_multicast_packets", + }, + { + .dpdk_name = "tx_port_broadcast_packets", + .ctr_name = "tx_vport_broadcast_packets", + }, + { + .dpdk_name = "rx_wqe_err", + .ctr_name = "rx_wqe_err", + }, + { + .dpdk_name = "rx_crc_errors_phy", + .ctr_name = "rx_crc_errors_phy", + }, + { + .dpdk_name = "rx_in_range_len_errors_phy", + .ctr_name = "rx_in_range_len_errors_phy", + }, + { + .dpdk_name = 
"rx_symbol_err_phy", + .ctr_name = "rx_symbol_err_phy", + }, + { + .dpdk_name = "tx_errors_phy", + .ctr_name = "tx_errors_phy", + }, + { + .dpdk_name = "rx_out_of_buffer", + .ctr_name = "out_of_buffer", + .ib = 1, + }, + { + .dpdk_name = "tx_packets_phy", + .ctr_name = "tx_packets_phy", + }, + { + .dpdk_name = "rx_packets_phy", + .ctr_name = "rx_packets_phy", + }, + { + .dpdk_name = "tx_discards_phy", + .ctr_name = "tx_discards_phy", + }, + { + .dpdk_name = "rx_discards_phy", + .ctr_name = "rx_discards_phy", + }, + { + .dpdk_name = "tx_bytes_phy", + .ctr_name = "tx_bytes_phy", + }, + { + .dpdk_name = "rx_bytes_phy", + .ctr_name = "rx_bytes_phy", + }, + /* Representor only */ + { + .dpdk_name = "rx_packets", + .ctr_name = "vport_rx_packets", + }, + { + .dpdk_name = "rx_bytes", + .ctr_name = "vport_rx_bytes", + }, + { + .dpdk_name = "tx_packets", + .ctr_name = "vport_tx_packets", + }, + { + .dpdk_name = "tx_bytes", + .ctr_name = "vport_tx_bytes", + }, +}; + +static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); + +static inline int +mlx5_read_ib_stat(struct mlx5_priv *priv, const char *ctr_name, uint64_t *stat) +{ + int fd; + + if (priv->sh) { + MKSTR(path, "%s/ports/%d/hw_counters/%s", + priv->sh->ibdev_path, + priv->ibv_port, + ctr_name); + fd = open(path, O_RDONLY); + if (fd != -1) { + char buf[21] = {'\0'}; + ssize_t n = read(fd, buf, sizeof(buf)); + + close(fd); + if (n != -1) { + *stat = strtoull(buf, NULL, 10); + return 0; + } + } + } + *stat = 0; + return 1; +} + +/** + * Read device counters table. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] stats + * Counters table output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int i; + struct ifreq ifr; + unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t); + unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz]; + struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf; + int ret; + + et_stats->cmd = ETHTOOL_GSTATS; + et_stats->n_stats = xstats_ctrl->stats_n; + ifr.ifr_data = (caddr_t)et_stats; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u unable to read statistic values from device", + dev->data->port_id); + return ret; + } + for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) { + if (xstats_ctrl->info[i].ib) { + ret = mlx5_read_ib_stat(priv, + xstats_ctrl->info[i].ctr_name, + &stats[i]); + /* return last xstats counter if fail to read. */ + if (ret == 0) + xstats_ctrl->xstats[i] = stats[i]; + else + stats[i] = xstats_ctrl->xstats[i]; + } else { + stats[i] = (uint64_t) + et_stats->data[xstats_ctrl->dev_table_idx[i]]; + } + } + return 0; +} + +/** + * Query the number of statistics provided by ETHTOOL. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Number of statistics on success, negative errno value otherwise and + * rte_errno is set. 
+ */ +static int +mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) { + struct ethtool_drvinfo drvinfo; + struct ifreq ifr; + int ret; + + drvinfo.cmd = ETHTOOL_GDRVINFO; + ifr.ifr_data = (caddr_t)&drvinfo; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to query number of statistics", + dev->data->port_id); + return ret; + } + return drvinfo.n_stats; +} + +/** + * Init the structures to read device counters. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_stats_init(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + unsigned int i; + unsigned int j; + struct ifreq ifr; + struct ethtool_gstrings *strings = NULL; + unsigned int dev_stats_n; + unsigned int str_sz; + int ret; + + /* So that it won't aggregate for each init. */ + xstats_ctrl->mlx5_stats_n = 0; + ret = mlx5_ethtool_get_stats_n(dev); + if (ret < 0) { + DRV_LOG(WARNING, "port %u no extended statistics available", + dev->data->port_id); + return; + } + dev_stats_n = ret; + /* Allocate memory to grab stat names and values. */ + str_sz = dev_stats_n * ETH_GSTRING_LEN; + strings = (struct ethtool_gstrings *) + rte_malloc("xstats_strings", + str_sz + sizeof(struct ethtool_gstrings), 0); + if (!strings) { + DRV_LOG(WARNING, "port %u unable to allocate memory for xstats", + dev->data->port_id); + return; + } + strings->cmd = ETHTOOL_GSTRINGS; + strings->string_set = ETH_SS_STATS; + strings->len = dev_stats_n; + ifr.ifr_data = (caddr_t)strings; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to get statistic names", + dev->data->port_id); + goto free; + } + for (i = 0; i != dev_stats_n; ++i) { + const char *curr_string = (const char *) + &strings->data[i * ETH_GSTRING_LEN]; + + for (j = 0; j != xstats_n; ++j) { + if (!strcmp(mlx5_counters_init[j].ctr_name, + curr_string)) { + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->dev_table_idx[idx] = i; + xstats_ctrl->info[idx] = mlx5_counters_init[j]; + break; + } + } + } + /* Add IB counters. */ + for (i = 0; i != xstats_n; ++i) { + if (mlx5_counters_init[i].ib) { + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->info[idx] = mlx5_counters_init[i]; + xstats_ctrl->hw_stats[idx] = 0; + } + } + MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS); + xstats_ctrl->stats_n = dev_stats_n; + /* Copy to base at first time. */ + ret = mlx5_read_dev_counters(dev, xstats_ctrl->base); + if (ret) + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); + stats_ctrl->imissed = 0; +free: + rte_free(strings); +} + +/** + * DPDK callback to get extended device statistics. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] stats + * Pointer to rte extended stats table. + * @param n + * The size of the stats table. + * + * @return + * Number of extended stats on success and stats is filled, + * negative on error and rte_errno is set. 
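+ *
+ * Applications normally reach this through the generic ethdev API using
+ * the usual two-step pattern, e.g. (illustrative sketch with standard
+ * DPDK calls, not specific to this PMD):
+ *
+ *   int n = rte_eth_xstats_get(port_id, NULL, 0);
+ *   struct rte_eth_xstat *xs = calloc(n, sizeof(*xs));
+ *   struct rte_eth_xstat_name *names = calloc(n, sizeof(*names));
+ *   rte_eth_xstats_get_names(port_id, names, n);
+ *   rte_eth_xstats_get(port_id, xs, n);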
+ */ +int +mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + uint64_t counters[n]; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + uint16_t mlx5_stats_n = xstats_ctrl->mlx5_stats_n; + + if (n >= mlx5_stats_n && stats) { + int stats_n; + int ret; + + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) + return stats_n; + if (xstats_ctrl->stats_n != stats_n) + mlx5_stats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) + return ret; + for (i = 0; i != mlx5_stats_n; ++i) { + stats[i].id = i; + if (xstats_ctrl->info[i].ib) { + uint64_t wrap_n; + uint64_t hw_stat = xstats_ctrl->hw_stats[i]; + + stats[i].value = (counters[i] - + xstats_ctrl->base[i]) & + (uint64_t)UINT32_MAX; + wrap_n = hw_stat >> 32; + if (stats[i].value < + (hw_stat & (uint64_t)UINT32_MAX)) + wrap_n++; + stats[i].value |= (wrap_n) << 32; + xstats_ctrl->hw_stats[i] = stats[i].value; + } else { + stats[i].value = + (counters[i] - xstats_ctrl->base[i]); + } + } + } + return mlx5_stats_n; +} + +/** + * DPDK callback to get device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] stats + * Stats structure output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + struct rte_eth_stats tmp; + unsigned int i; + unsigned int idx; + uint64_t wrap_n; + int ret; + + memset(&tmp, 0, sizeof(tmp)); + /* Add software counters. */ + for (i = 0; (i != priv->rxqs_n); ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + idx = rxq->idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.q_ipackets[idx] += rxq->stats.ipackets; + tmp.q_ibytes[idx] += rxq->stats.ibytes; +#endif + tmp.q_errors[idx] += (rxq->stats.idropped + + rxq->stats.rx_nombuf); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.ipackets += rxq->stats.ipackets; + tmp.ibytes += rxq->stats.ibytes; +#endif + tmp.ierrors += rxq->stats.idropped; + tmp.rx_nombuf += rxq->stats.rx_nombuf; + } + for (i = 0; (i != priv->txqs_n); ++i) { + struct mlx5_txq_data *txq = (*priv->txqs)[i]; + + if (txq == NULL) + continue; + idx = txq->idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.q_opackets[idx] += txq->stats.opackets; + tmp.q_obytes[idx] += txq->stats.obytes; +#endif + } +#ifdef MLX5_PMD_SOFT_COUNTERS + tmp.opackets += txq->stats.opackets; + tmp.obytes += txq->stats.obytes; +#endif + tmp.oerrors += txq->stats.oerrors; + } + ret = mlx5_read_ib_stat(priv, "out_of_buffer", &tmp.imissed); + if (ret == 0) { + tmp.imissed = (tmp.imissed - stats_ctrl->imissed_base) & + (uint64_t)UINT32_MAX; + wrap_n = stats_ctrl->imissed >> 32; + if (tmp.imissed < (stats_ctrl->imissed & (uint64_t)UINT32_MAX)) + wrap_n++; + tmp.imissed |= (wrap_n) << 32; + stats_ctrl->imissed = tmp.imissed; + } else { + tmp.imissed = stats_ctrl->imissed; + } +#ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: retrieve and add hardware counters. */ +#endif + *stats = tmp; + return 0; +} + +/** + * DPDK callback to clear device statistics. + * + * @param dev + * Pointer to Ethernet device structure. 
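+ *
+ * Besides zeroing the per-queue software counters, the 32-bit wrap-around
+ * base kept for the "out_of_buffer" (imissed) IB counter is re-read so
+ * that subsequent reads start from zero.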
+ * + * @return + * always 0 on success and stats is reset + */ +int +mlx5_stats_reset(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + unsigned int i; + + for (i = 0; (i != priv->rxqs_n); ++i) { + if ((*priv->rxqs)[i] == NULL) + continue; + memset(&(*priv->rxqs)[i]->stats, 0, + sizeof(struct mlx5_rxq_stats)); + } + for (i = 0; (i != priv->txqs_n); ++i) { + if ((*priv->txqs)[i] == NULL) + continue; + memset(&(*priv->txqs)[i]->stats, 0, + sizeof(struct mlx5_txq_stats)); + } + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); + stats_ctrl->imissed = 0; +#ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: reset hardware counters. */ +#endif + + return 0; +} + +/** + * DPDK callback to clear device extended statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success and stats is reset, negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_xstats_reset(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + int stats_n; + unsigned int i; + unsigned int n = xstats_ctrl->mlx5_stats_n; + uint64_t counters[n]; + int ret; + + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) { + DRV_LOG(ERR, "port %u cannot get stats: %s", dev->data->port_id, + strerror(-stats_n)); + return stats_n; + } + if (xstats_ctrl->stats_n != stats_n) + mlx5_stats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) { + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; + } + for (i = 0; i != n; ++i) { + xstats_ctrl->base[i] = counters[i]; + xstats_ctrl->hw_stats[i] = 0; + } + + return 0; +} + +/** + * DPDK callback to retrieve names of extended device statistics + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] xstats_names + * Buffer to insert names into. + * @param n + * Number of names. + * + * @return + * Number of xstats names. + */ +int +mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, unsigned int n) +{ + unsigned int i; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int mlx5_xstats_n = xstats_ctrl->mlx5_stats_n; + + if (n >= mlx5_xstats_n && xstats_names) { + for (i = 0; i != mlx5_xstats_n; ++i) { + strncpy(xstats_names[i].name, + xstats_ctrl->info[i].dpdk_name, + RTE_ETH_XSTATS_NAME_SIZE); + xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0; + } + } + return mlx5_xstats_n; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c new file mode 100644 index 000000000..8106598ff --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_trigger.c @@ -0,0 +1,579 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <unistd.h> + +#include <rte_ether.h> +#include <rte_ethdev_driver.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> + +#include "mlx5.h" +#include "mlx5_mr.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" +#include "rte_pmd_mlx5.h" + +/** + * Stop traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. 
+ */ +static void +mlx5_txq_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->txqs_n; ++i) + mlx5_txq_release(dev, i); +} + +/** + * Start traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_txq_start(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret; + + for (i = 0; i != priv->txqs_n; ++i) { + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); + + if (!txq_ctrl) + continue; + if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) { + txq_ctrl->obj = mlx5_txq_obj_new + (dev, i, MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN); + } else { + txq_alloc_elts(txq_ctrl); + txq_ctrl->obj = mlx5_txq_obj_new + (dev, i, MLX5_TXQ_OBJ_TYPE_IBV); + } + if (!txq_ctrl->obj) { + rte_errno = ENOMEM; + goto error; + } + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_txq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Stop traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ +static void +mlx5_rxq_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) + mlx5_rxq_release(dev, i); +} + +/** + * Start traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_rxq_start(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + int ret = 0; + enum mlx5_rxq_obj_type obj_type = MLX5_RXQ_OBJ_TYPE_IBV; + struct mlx5_rxq_data *rxq = NULL; + + for (i = 0; i < priv->rxqs_n; ++i) { + rxq = (*priv->rxqs)[i]; + if (rxq && rxq->lro) { + obj_type = MLX5_RXQ_OBJ_TYPE_DEVX_RQ; + break; + } + } + /* Allocate/reuse/resize mempool for Multi-Packet RQ. */ + if (mlx5_mprq_alloc_mp(dev)) { + /* Should not release Rx queues but return immediately. */ + return -rte_errno; + } + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i); + struct rte_mempool *mp; + + if (!rxq_ctrl) + continue; + if (rxq_ctrl->type == MLX5_RXQ_TYPE_HAIRPIN) { + rxq_ctrl->obj = mlx5_rxq_obj_new + (dev, i, MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN); + if (!rxq_ctrl->obj) + goto error; + continue; + } + /* Pre-register Rx mempool. */ + mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp; + DRV_LOG(DEBUG, + "port %u Rx queue %u registering" + " mp %s having %u chunks", + dev->data->port_id, rxq_ctrl->rxq.idx, + mp->name, mp->nb_mem_chunks); + mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl, mp); + ret = rxq_alloc_elts(rxq_ctrl); + if (ret) + goto error; + rxq_ctrl->obj = mlx5_rxq_obj_new(dev, i, obj_type); + if (!rxq_ctrl->obj) + goto error; + if (obj_type == MLX5_RXQ_OBJ_TYPE_IBV) + rxq_ctrl->wqn = rxq_ctrl->obj->wq->wq_num; + else if (obj_type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) + rxq_ctrl->wqn = rxq_ctrl->obj->rq->id; + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_rxq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Binds Tx queues to Rx queues for hairpin. + * + * Binds Tx queues to the target Rx queues. 
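+ *
+ * The peer Rx queue of each hairpin Tx queue comes from the hairpin
+ * configuration passed at queue setup time. Illustrative application-side
+ * sketch using the generic ethdev hairpin API (queue indexes are
+ * placeholders; this PMD requires a single peer on the same port, see
+ * mlx5_tx_hairpin_queue_setup()):
+ *
+ *   struct rte_eth_hairpin_conf hp_conf = {
+ *           .peer_count = 1,
+ *           .peers[0] = { .port = port_id, .queue = rx_hp_queue },
+ *   };
+ *   rte_eth_tx_hairpin_queue_setup(port_id, tx_hp_queue, 512, &hp_conf);
+ *   hp_conf.peers[0].queue = tx_hp_queue;
+ *   rte_eth_rx_hairpin_queue_setup(port_id, rx_hp_queue, 512, &hp_conf);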
+ * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_hairpin_bind(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_devx_modify_sq_attr sq_attr = { 0 }; + struct mlx5_devx_modify_rq_attr rq_attr = { 0 }; + struct mlx5_txq_ctrl *txq_ctrl; + struct mlx5_rxq_ctrl *rxq_ctrl; + struct mlx5_devx_obj *sq; + struct mlx5_devx_obj *rq; + unsigned int i; + int ret = 0; + + for (i = 0; i != priv->txqs_n; ++i) { + txq_ctrl = mlx5_txq_get(dev, i); + if (!txq_ctrl) + continue; + if (txq_ctrl->type != MLX5_TXQ_TYPE_HAIRPIN) { + mlx5_txq_release(dev, i); + continue; + } + if (!txq_ctrl->obj) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u no txq object found: %d", + dev->data->port_id, i); + mlx5_txq_release(dev, i); + return -rte_errno; + } + sq = txq_ctrl->obj->sq; + rxq_ctrl = mlx5_rxq_get(dev, + txq_ctrl->hairpin_conf.peers[0].queue); + if (!rxq_ctrl) { + mlx5_txq_release(dev, i); + rte_errno = EINVAL; + DRV_LOG(ERR, "port %u no rxq object found: %d", + dev->data->port_id, + txq_ctrl->hairpin_conf.peers[0].queue); + return -rte_errno; + } + if (rxq_ctrl->type != MLX5_RXQ_TYPE_HAIRPIN || + rxq_ctrl->hairpin_conf.peers[0].queue != i) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u Tx queue %d can't be binded to " + "Rx queue %d", dev->data->port_id, + i, txq_ctrl->hairpin_conf.peers[0].queue); + goto error; + } + rq = rxq_ctrl->obj->rq; + if (!rq) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "port %u hairpin no matching rxq: %d", + dev->data->port_id, + txq_ctrl->hairpin_conf.peers[0].queue); + goto error; + } + sq_attr.state = MLX5_SQC_STATE_RDY; + sq_attr.sq_state = MLX5_SQC_STATE_RST; + sq_attr.hairpin_peer_rq = rq->id; + sq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id; + ret = mlx5_devx_cmd_modify_sq(sq, &sq_attr); + if (ret) + goto error; + rq_attr.state = MLX5_SQC_STATE_RDY; + rq_attr.rq_state = MLX5_SQC_STATE_RST; + rq_attr.hairpin_peer_sq = sq->id; + rq_attr.hairpin_peer_vhca = priv->config.hca_attr.vhca_id; + ret = mlx5_devx_cmd_modify_rq(rq, &rq_attr); + if (ret) + goto error; + mlx5_txq_release(dev, i); + mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue); + } + return 0; +error: + mlx5_txq_release(dev, i); + mlx5_rxq_release(dev, txq_ctrl->hairpin_conf.peers[0].queue); + return -rte_errno; +} + +/** + * DPDK callback to start the device. + * + * Simulate device start by attaching all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
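+ *
+ * This is reached through rte_eth_dev_start() after the usual ethdev
+ * configuration sequence, e.g. (illustrative sketch with generic DPDK
+ * calls; queue sizes and the "mp" mempool are placeholders):
+ *
+ *   rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
+ *   for (q = 0; q < nb_rxq; q++)
+ *           rte_eth_rx_queue_setup(port_id, q, 512, SOCKET_ID_ANY,
+ *                                  NULL, mp);
+ *   for (q = 0; q < nb_txq; q++)
+ *           rte_eth_tx_queue_setup(port_id, q, 512, SOCKET_ID_ANY, NULL);
+ *   rte_eth_dev_start(port_id);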
+ */ +int +mlx5_dev_start(struct rte_eth_dev *dev) +{ + int ret; + int fine_inline; + + DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id); + fine_inline = rte_mbuf_dynflag_lookup + (RTE_PMD_MLX5_FINE_GRANULARITY_INLINE, NULL); + if (fine_inline > 0) + rte_net_mlx5_dynf_inline_mask = 1UL << fine_inline; + else + rte_net_mlx5_dynf_inline_mask = 0; + if (dev->data->nb_rx_queues > 0) { + ret = mlx5_dev_configure_rss_reta(dev); + if (ret) { + DRV_LOG(ERR, "port %u reta config failed: %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; + } + } + ret = mlx5_txq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Tx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; + } + ret = mlx5_rxq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_txq_stop(dev); + return -rte_errno; + } + ret = mlx5_hairpin_bind(dev); + if (ret) { + DRV_LOG(ERR, "port %u hairpin binding failed: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_txq_stop(dev); + return -rte_errno; + } + /* Set started flag here for the following steps like control flow. */ + dev->data->dev_started = 1; + ret = mlx5_rx_intr_vec_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx interrupt vector creation failed", + dev->data->port_id); + goto error; + } + mlx5_stats_init(dev); + ret = mlx5_traffic_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u failed to set defaults flows", + dev->data->port_id); + goto error; + } + /* Set a mask and offset of dynamic metadata flows into Rx queues*/ + mlx5_flow_rxq_dynf_metadata_set(dev); + /* + * In non-cached mode, it only needs to start the default mreg copy + * action and no flow created by application exists anymore. + * But it is worth wrapping the interface for further usage. + */ + ret = mlx5_flow_start_default(dev); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to start default actions: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; + } + rte_wmb(); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + /* Enable datapath on secondary process. */ + mlx5_mp_req_start_rxtx(dev); + mlx5_dev_interrupt_handler_install(dev); + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + /* Rollback. */ + dev->data->dev_started = 0; + mlx5_flow_stop_default(dev); + mlx5_traffic_disable(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * DPDK callback to stop the device. + * + * Simulate device stop by detaching all configured flows. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx5_dev_stop(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + dev->data->dev_started = 0; + /* Prevent crashes when queues are still in use. */ + dev->rx_pkt_burst = removed_rx_burst; + dev->tx_pkt_burst = removed_tx_burst; + rte_wmb(); + /* Disable datapath on secondary process. */ + mlx5_mp_req_stop_rxtx(dev); + usleep(1000 * priv->rxqs_n); + DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id); + mlx5_flow_stop_default(dev); + /* Control flows for default traffic can be removed firstly. */ + mlx5_traffic_disable(dev); + /* All RX queue flags will be cleared in the flush interface. 
*/ + mlx5_flow_list_flush(dev, &priv->flows, true); + mlx5_rx_intr_vec_disable(dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); +} + +/** + * Enable traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_traffic_enable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct rte_flow_item_eth bcast = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + struct rte_flow_item_eth ipv6_multi_spec = { + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }; + struct rte_flow_item_eth ipv6_multi_mask = { + .dst.addr_bytes = "\xff\xff\x00\x00\x00\x00", + }; + struct rte_flow_item_eth unicast = { + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + }; + struct rte_flow_item_eth unicast_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + const unsigned int vlan_filter_n = priv->vlan_filter_n; + const struct rte_ether_addr cmp = { + .addr_bytes = "\x00\x00\x00\x00\x00\x00", + }; + unsigned int i; + unsigned int j; + int ret; + + /* + * Hairpin txq default flow should be created no matter if it is + * isolation mode. Or else all the packets to be sent will be sent + * out directly without the TX flow actions, e.g. encapsulation. + */ + for (i = 0; i != priv->txqs_n; ++i) { + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); + if (!txq_ctrl) + continue; + if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) { + ret = mlx5_ctrl_flow_source_queue(dev, i); + if (ret) { + mlx5_txq_release(dev, i); + goto error; + } + } + mlx5_txq_release(dev, i); + } + if (priv->config.dv_esw_en && !priv->config.vf) { + if (mlx5_flow_create_esw_table_zero_flow(dev)) + priv->fdb_def_rule = 1; + else + DRV_LOG(INFO, "port %u FDB default rule cannot be" + " configured - only Eswitch group 0 flows are" + " supported.", dev->data->port_id); + } + if (priv->isolated) + return 0; + if (dev->data->promiscuous) { + struct rte_flow_item_eth promisc = { + .dst.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .type = 0, + }; + + ret = mlx5_ctrl_flow(dev, &promisc, &promisc); + if (ret) + goto error; + } + if (dev->data->all_multicast) { + struct rte_flow_item_eth multicast = { + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", + .type = 0, + }; + + ret = mlx5_ctrl_flow(dev, &multicast, &multicast); + if (ret) + goto error; + } else { + /* Add broadcast/multicast flows. */ + for (i = 0; i != vlan_filter_n; ++i) { + uint16_t vlan = priv->vlan_filter[i]; + + struct rte_flow_item_vlan vlan_spec = { + .tci = rte_cpu_to_be_16(vlan), + }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; + + ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast, + &vlan_spec, &vlan_mask); + if (ret) + goto error; + ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec, + &ipv6_multi_mask, + &vlan_spec, &vlan_mask); + if (ret) + goto error; + } + if (!vlan_filter_n) { + ret = mlx5_ctrl_flow(dev, &bcast, &bcast); + if (ret) + goto error; + ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec, + &ipv6_multi_mask); + if (ret) + goto error; + } + } + /* Add MAC address flows. 
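+	 * One unicast control flow is created for every configured MAC
+	 * address; when VLAN filters are present, a flow is created per
+	 * (MAC address, VLAN) pair instead.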
*/ + for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) { + struct rte_ether_addr *mac = &dev->data->mac_addrs[i]; + + if (!memcmp(mac, &cmp, sizeof(*mac))) + continue; + memcpy(&unicast.dst.addr_bytes, + mac->addr_bytes, + RTE_ETHER_ADDR_LEN); + for (j = 0; j != vlan_filter_n; ++j) { + uint16_t vlan = priv->vlan_filter[j]; + + struct rte_flow_item_vlan vlan_spec = { + .tci = rte_cpu_to_be_16(vlan), + }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; + + ret = mlx5_ctrl_flow_vlan(dev, &unicast, + &unicast_mask, + &vlan_spec, + &vlan_mask); + if (ret) + goto error; + } + if (!vlan_filter_n) { + ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask); + if (ret) + goto error; + } + } + return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_list_flush(dev, &priv->ctrl_flows, false); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; +} + + +/** + * Disable traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + */ +void +mlx5_traffic_disable(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->ctrl_flows, false); +} + +/** + * Restart traffic flows configured by control plane + * + * @param dev + * Pointer to Ethernet device private data. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_traffic_restart(struct rte_eth_dev *dev) +{ + if (dev->data->dev_started) { + mlx5_traffic_disable(dev); + return mlx5_traffic_enable(dev); + } + return 0; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c new file mode 100644 index 000000000..a211fa91b --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_txq.c @@ -0,0 +1,1470 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <unistd.h> +#include <sys/mman.h> +#include <inttypes.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev_driver.h> +#include <rte_common.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_common.h> +#include <mlx5_common_mr.h> + +#include "mlx5_defs.h" +#include "mlx5_utils.h" +#include "mlx5.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" + +/** + * Allocate TX queue elements. + * + * @param txq_ctrl + * Pointer to TX queue structure. + */ +void +txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl) +{ + const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n; + unsigned int i; + + for (i = 0; (i != elts_n); ++i) + txq_ctrl->txq.elts[i] = NULL; + DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx, elts_n); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; +} + +/** + * Free TX queue elements. + * + * @param txq_ctrl + * Pointer to TX queue structure. 
+ */ +void +txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) +{ + const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n; + const uint16_t elts_m = elts_n - 1; + uint16_t elts_head = txq_ctrl->txq.elts_head; + uint16_t elts_tail = txq_ctrl->txq.elts_tail; + struct rte_mbuf *(*elts)[elts_n] = &txq_ctrl->txq.elts; + + DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->txq.idx); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; + + while (elts_tail != elts_head) { + struct rte_mbuf *elt = (*elts)[elts_tail & elts_m]; + + MLX5_ASSERT(elt != NULL); + rte_pktmbuf_free_seg(elt); +#ifdef RTE_LIBRTE_MLX5_DEBUG + /* Poisoning. */ + memset(&(*elts)[elts_tail & elts_m], + 0x77, + sizeof((*elts)[elts_tail & elts_m])); +#endif + ++elts_tail; + } +} + +/** + * Returns the per-port supported offloads. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * Supported Tx offloads. + */ +uint64_t +mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + uint64_t offloads = (DEV_TX_OFFLOAD_MULTI_SEGS | + DEV_TX_OFFLOAD_VLAN_INSERT); + struct mlx5_dev_config *config = &priv->config; + + if (config->hw_csum) + offloads |= (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM); + if (config->tso) + offloads |= DEV_TX_OFFLOAD_TCP_TSO; + if (config->swp) { + if (config->hw_csum) + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (config->tso) + offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); + } + if (config->tunnel_en) { + if (config->hw_csum) + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (config->tso) + offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_GENEVE_TNL_TSO); + } + return offloads; +} + +/** + * Tx queue presetup checks. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Tx queue index. + * @param desc + * Number of descriptors to configure in queue. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_tx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc) +{ + struct mlx5_priv *priv = dev->data->dev_private; + + if (desc <= MLX5_TX_COMP_THRESH) { + DRV_LOG(WARNING, + "port %u number of descriptors requested for Tx queue" + " %u must be higher than MLX5_TX_COMP_THRESH, using %u" + " instead of %u", + dev->data->port_id, idx, MLX5_TX_COMP_THRESH + 1, desc); + desc = MLX5_TX_COMP_THRESH + 1; + } + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Tx queue" + " %u to the next power of two (%d)", + dev->data->port_id, idx, desc); + } + DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors", + dev->data->port_id, idx, desc); + if (idx >= priv->txqs_n) { + DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->txqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_txq_releasable(dev, idx)) { + rte_errno = EBUSY; + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + mlx5_txq_release(dev, idx); + return 0; +} +/** + * DPDK callback to configure a TX queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. 
+ * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + int res; + + res = mlx5_tx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf); + if (!txq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); + (*priv->txqs)[idx] = &txq_ctrl->txq; + return 0; +} + +/** + * DPDK callback to configure a TX hairpin queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param[in] hairpin_conf + * The hairpin binding configuration. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + int res; + + res = mlx5_tx_queue_pre_setup(dev, idx, desc); + if (res) + return res; + if (hairpin_conf->peer_count != 1 || + hairpin_conf->peers[0].port != dev->data->port_id || + hairpin_conf->peers[0].queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u unable to setup hairpin queue index %u " + " invalid hairpind configuration", dev->data->port_id, + idx); + rte_errno = EINVAL; + return -rte_errno; + } + txq_ctrl = mlx5_txq_hairpin_new(dev, idx, desc, hairpin_conf); + if (!txq_ctrl) { + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; + } + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); + (*priv->txqs)[idx] = &txq_ctrl->txq; + return 0; +} + +/** + * DPDK callback to release a TX queue. + * + * @param dpdk_txq + * Generic TX queue pointer. + */ +void +mlx5_tx_queue_release(void *dpdk_txq) +{ + struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; + struct mlx5_txq_ctrl *txq_ctrl; + struct mlx5_priv *priv; + unsigned int i; + + if (txq == NULL) + return; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + priv = txq_ctrl->priv; + for (i = 0; (i != priv->txqs_n); ++i) + if ((*priv->txqs)[i] == txq) { + DRV_LOG(DEBUG, "port %u removing Tx queue %u from list", + PORT_ID(priv), txq->idx); + mlx5_txq_release(ETH_DEV(priv), i); + break; + } +} + +/** + * Configure the doorbell register non-cached attribute. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * @param page_size + * Systme page size + */ +static void +txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + off_t cmd; + + txq_ctrl->txq.db_heu = priv->config.dbnc == MLX5_TXDB_HEURISTIC; + txq_ctrl->txq.db_nc = 0; + /* Check the doorbell register mapping type. 
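+	 * The UAR mmap offset provided by rdma-core encodes the mapping
+	 * command in its upper bits; when it indicates non-cached (NC)
+	 * pages, the queue is flagged (db_nc) so the Tx burst doorbell
+	 * routine applies the write sequence suited to a non
+	 * write-combining mapping.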
*/ + cmd = txq_ctrl->uar_mmap_offset / page_size; + cmd >>= MLX5_UAR_MMAP_CMD_SHIFT; + cmd &= MLX5_UAR_MMAP_CMD_MASK; + if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD) + txq_ctrl->txq.db_nc = 1; +} + +/** + * Initialize Tx UAR registers for primary process. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv)); + const size_t page_size = sysconf(_SC_PAGESIZE); +#ifndef RTE_ARCH_64 + unsigned int lock_idx; +#endif + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return; + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); + MLX5_ASSERT(ppriv); + ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg; + txq_uar_ncattr_init(txq_ctrl, page_size); +#ifndef RTE_ARCH_64 + /* Assign an UAR lock according to UAR page number */ + lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & + MLX5_UAR_PAGE_NUM_MASK; + txq_ctrl->txq.uar_lock = &priv->uar_lock[lock_idx]; +#endif +} + +/** + * Remap UAR register of a Tx queue for secondary process. + * + * Remapped address is stored at the table in the process private structure of + * the device, indexed by queue index. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv)); + struct mlx5_txq_data *txq = &txq_ctrl->txq; + void *addr; + uintptr_t uar_va; + uintptr_t offset; + const size_t page_size = sysconf(_SC_PAGESIZE); + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return 0; + MLX5_ASSERT(ppriv); + /* + * As rdma-core, UARs are mapped in size of OS page + * size. Ref to libmlx5 function: mlx5_init_context() + */ + uar_va = (uintptr_t)txq_ctrl->bf_reg; + offset = uar_va & (page_size - 1); /* Offset in page. */ + addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, + txq_ctrl->uar_mmap_offset); + if (addr == MAP_FAILED) { + DRV_LOG(ERR, + "port %u mmap failed for BF reg of txq %u", + txq->port_id, txq->idx); + rte_errno = ENXIO; + return -rte_errno; + } + addr = RTE_PTR_ADD(addr, offset); + ppriv->uar_table[txq->idx] = addr; + txq_uar_ncattr_init(txq_ctrl, page_size); + return 0; +} + +/** + * Unmap UAR register of a Tx queue for secondary process. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv)); + const size_t page_size = sysconf(_SC_PAGESIZE); + void *addr; + + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + return; + addr = ppriv->uar_table[txq_ctrl->txq.idx]; + munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size); +} + +/** + * Initialize Tx UAR registers for secondary process. + * + * @param dev + * Pointer to Ethernet device. + * @param fd + * Verbs file descriptor to map UAR pages. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq; + struct mlx5_txq_ctrl *txq_ctrl; + unsigned int i; + int ret; + + MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY); + for (i = 0; i != priv->txqs_n; ++i) { + if (!(*priv->txqs)[i]) + continue; + txq = (*priv->txqs)[i]; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) + continue; + MLX5_ASSERT(txq->idx == (uint16_t)i); + ret = txq_uar_init_secondary(txq_ctrl, fd); + if (ret) + goto error; + } + return 0; +error: + /* Rollback. */ + do { + if (!(*priv->txqs)[i]) + continue; + txq = (*priv->txqs)[i]; + txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + txq_uar_uninit_secondary(txq_ctrl); + } while (i--); + return -rte_errno; +} + +/** + * Create the Tx hairpin queue object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array + * + * @return + * The hairpin DevX object initialised, NULL otherwise and rte_errno is set. + */ +static struct mlx5_txq_obj * +mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq_data, struct mlx5_txq_ctrl, txq); + struct mlx5_devx_create_sq_attr attr = { 0 }; + struct mlx5_txq_obj *tmpl = NULL; + int ret = 0; + uint32_t max_wq_data; + + MLX5_ASSERT(txq_data); + MLX5_ASSERT(!txq_ctrl->obj); + tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, + txq_ctrl->socket); + if (!tmpl) { + DRV_LOG(ERR, + "port %u Tx queue %u cannot allocate memory resources", + dev->data->port_id, txq_data->idx); + rte_errno = ENOMEM; + goto error; + } + tmpl->type = MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN; + tmpl->txq_ctrl = txq_ctrl; + attr.hairpin = 1; + attr.tis_lst_sz = 1; + max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz; + /* Jumbo frames > 9KB should be supported, and more packets. */ + if (priv->config.log_hp_size != (uint32_t)MLX5_ARG_UNSET) { + if (priv->config.log_hp_size > max_wq_data) { + DRV_LOG(ERR, "total data size %u power of 2 is " + "too large for hairpin", + priv->config.log_hp_size); + rte_errno = ERANGE; + return NULL; + } + attr.wq_attr.log_hairpin_data_sz = priv->config.log_hp_size; + } else { + attr.wq_attr.log_hairpin_data_sz = + (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ? + max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE; + } + /* Set the packets number to the maximum value for performance. */ + attr.wq_attr.log_hairpin_num_packets = + attr.wq_attr.log_hairpin_data_sz - + MLX5_HAIRPIN_QUEUE_STRIDE; + attr.tis_num = priv->sh->tis->id; + tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr); + if (!tmpl->sq) { + DRV_LOG(ERR, + "port %u tx hairpin queue %u can't create sq object", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + DRV_LOG(DEBUG, "port %u sxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->txqsobj, tmpl, next); + return tmpl; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl->tis) + mlx5_devx_cmd_destroy(tmpl->tis); + if (tmpl->sq) + mlx5_devx_cmd_destroy(tmpl->sq); + rte_errno = ret; /* Restore rte_errno. */ + return NULL; +} + +/** + * Create the Tx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array. 
+ * @param type + * Type of the Tx queue object to create. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_obj * +mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + enum mlx5_txq_obj_type type) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq_data, struct mlx5_txq_ctrl, txq); + struct mlx5_txq_obj tmpl; + struct mlx5_txq_obj *txq_obj = NULL; + union { + struct ibv_qp_init_attr_ex init; + struct ibv_cq_init_attr_ex cq; + struct ibv_qp_attr mod; + } attr; + unsigned int cqe_n; + struct mlx5dv_qp qp = { .comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET }; + struct mlx5dv_cq cq_info; + struct mlx5dv_obj obj; + const int desc = 1 << txq_data->elts_n; + int ret = 0; + + if (type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) + return mlx5_txq_obj_hairpin_new(dev, idx); +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* If using DevX, need additional mask to read tisn value. */ + if (priv->config.devx && !priv->sh->tdn) + qp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES; +#endif + MLX5_ASSERT(txq_data); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE; + priv->verbs_alloc_ctx.obj = txq_ctrl; + if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { + DRV_LOG(ERR, + "port %u MLX5_ENABLE_CQE_COMPRESSION must never be set", + dev->data->port_id); + rte_errno = EINVAL; + return NULL; + } + memset(&tmpl, 0, sizeof(struct mlx5_txq_obj)); + attr.cq = (struct ibv_cq_init_attr_ex){ + .comp_mask = 0, + }; + cqe_n = desc / MLX5_TX_COMP_THRESH + + 1 + MLX5_TX_COMP_THRESH_INLINE_DIV; + tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0); + if (tmpl.cq == NULL) { + DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.init = (struct ibv_qp_init_attr_ex){ + /* CQ to be associated with the send queue. */ + .send_cq = tmpl.cq, + /* CQ to be associated with the receive queue. */ + .recv_cq = tmpl.cq, + .cap = { + /* Max number of outstanding WRs. */ + .max_send_wr = + ((priv->sh->device_attr.orig_attr.max_qp_wr < + desc) ? + priv->sh->device_attr.orig_attr.max_qp_wr : + desc), + /* + * Max number of scatter/gather elements in a WR, + * must be 1 to prevent libmlx5 from trying to affect + * too much memory. TX gather is not impacted by the + * device_attr.max_sge limit and will still work + * properly. + */ + .max_send_sge = 1, + }, + .qp_type = IBV_QPT_RAW_PACKET, + /* + * Do *NOT* enable this, completions events are managed per + * Tx burst. + */ + .sq_sig_all = 0, + .pd = priv->sh->pd, + .comp_mask = IBV_QP_INIT_ATTR_PD, + }; + if (txq_data->inlen_send) + attr.init.cap.max_inline_data = txq_ctrl->max_inline_data; + if (txq_data->tso_en) { + attr.init.max_tso_header = txq_ctrl->max_tso_header; + attr.init.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER; + } + tmpl.qp = mlx5_glue->create_qp_ex(priv->sh->ctx, &attr.init); + if (tmpl.qp == NULL) { + DRV_LOG(ERR, "port %u Tx queue %u QP creation failure", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod = (struct ibv_qp_attr){ + /* Move the QP to this state. */ + .qp_state = IBV_QPS_INIT, + /* IB device port number. 
*/ + .port_num = (uint8_t)priv->ibv_port, + }; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, + (IBV_QP_STATE | IBV_QP_PORT)); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_INIT failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod = (struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR + }; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTR failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + attr.mod.qp_state = IBV_QPS_RTS; + ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); + if (ret) { + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTS failed", + dev->data->port_id, idx); + rte_errno = errno; + goto error; + } + txq_obj = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_obj), 0, + txq_ctrl->socket); + if (!txq_obj) { + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } + obj.cq.in = tmpl.cq; + obj.cq.out = &cq_info; + obj.qp.in = tmpl.qp; + obj.qp.out = &qp; + ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP); + if (ret != 0) { + rte_errno = errno; + goto error; + } + if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; + goto error; + } + txq_data->cqe_n = log2above(cq_info.cqe_cnt); + txq_data->cqe_s = 1 << txq_data->cqe_n; + txq_data->cqe_m = txq_data->cqe_s - 1; + txq_data->qp_num_8s = tmpl.qp->qp_num << 8; + txq_data->wqes = qp.sq.buf; + txq_data->wqe_n = log2above(qp.sq.wqe_cnt); + txq_data->wqe_s = 1 << txq_data->wqe_n; + txq_data->wqe_m = txq_data->wqe_s - 1; + txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s; + txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR]; + txq_data->cq_db = cq_info.dbrec; + txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf; + txq_data->cq_ci = 0; + txq_data->cq_pi = 0; + txq_data->wqe_ci = 0; + txq_data->wqe_pi = 0; + txq_data->wqe_comp = 0; + txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV; + txq_data->fcqs = rte_calloc_socket(__func__, + txq_data->cqe_s, + sizeof(*txq_data->fcqs), + RTE_CACHE_LINE_SIZE, + txq_ctrl->socket); + if (!txq_data->fcqs) { + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory (FCQ)", + dev->data->port_id, idx); + rte_errno = ENOMEM; + goto error; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* + * If using DevX need to query and store TIS transport domain value. + * This is done once per port. + * Will use this value on Rx, when creating matching TIR. 
+ */ + if (priv->config.devx && !priv->sh->tdn) { + ret = mlx5_devx_cmd_qp_query_tis_td(tmpl.qp, qp.tisn, + &priv->sh->tdn); + if (ret) { + DRV_LOG(ERR, "Fail to query port %u Tx queue %u QP TIS " + "transport domain", dev->data->port_id, idx); + rte_errno = EINVAL; + goto error; + } else { + DRV_LOG(DEBUG, "port %u Tx queue %u TIS number %d " + "transport domain %d", dev->data->port_id, + idx, qp.tisn, priv->sh->tdn); + } + } +#endif + txq_obj->qp = tmpl.qp; + txq_obj->cq = tmpl.cq; + rte_atomic32_inc(&txq_obj->refcnt); + txq_ctrl->bf_reg = qp.bf.reg; + if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { + txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset; + DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%"PRIx64, + dev->data->port_id, txq_ctrl->uar_mmap_offset); + } else { + DRV_LOG(ERR, + "port %u failed to retrieve UAR info, invalid" + " libmlx5.so", + dev->data->port_id); + rte_errno = EINVAL; + goto error; + } + txq_uar_init(txq_ctrl); + LIST_INSERT_HEAD(&priv->txqsobj, txq_obj, next); + txq_obj->txq_ctrl = txq_ctrl; + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + return txq_obj; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (tmpl.cq) + claim_zero(mlx5_glue->destroy_cq(tmpl.cq)); + if (tmpl.qp) + claim_zero(mlx5_glue->destroy_qp(tmpl.qp)); + if (txq_data && txq_data->fcqs) + rte_free(txq_data->fcqs); + if (txq_obj) + rte_free(txq_obj); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ + return NULL; +} + +/** + * Get an Tx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * Queue index in DPDK Tx queue array. + * + * @return + * The Verbs object if it exists. + */ +struct mlx5_txq_obj * +mlx5_txq_obj_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq_ctrl; + + if (idx >= priv->txqs_n) + return NULL; + if (!(*priv->txqs)[idx]) + return NULL; + txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + if (txq_ctrl->obj) + rte_atomic32_inc(&txq_ctrl->obj->refcnt); + return txq_ctrl->obj; +} + +/** + * Release an Tx verbs queue object. + * + * @param txq_obj + * Verbs Tx queue object. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_txq_obj_release(struct mlx5_txq_obj *txq_obj) +{ + MLX5_ASSERT(txq_obj); + if (rte_atomic32_dec_and_test(&txq_obj->refcnt)) { + if (txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) { + if (txq_obj->tis) + claim_zero(mlx5_devx_cmd_destroy(txq_obj->tis)); + } else { + claim_zero(mlx5_glue->destroy_qp(txq_obj->qp)); + claim_zero(mlx5_glue->destroy_cq(txq_obj->cq)); + if (txq_obj->txq_ctrl->txq.fcqs) + rte_free(txq_obj->txq_ctrl->txq.fcqs); + } + LIST_REMOVE(txq_obj, next); + rte_free(txq_obj); + return 0; + } + return 1; +} + +/** + * Verify the Verbs Tx queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_txq_obj_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + int ret = 0; + struct mlx5_txq_obj *txq_obj; + + LIST_FOREACH(txq_obj, &priv->txqsobj, next) { + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced", + dev->data->port_id, txq_obj->txq_ctrl->txq.idx); + ++ret; + } + return ret; +} + +/** + * Calculate the total number of WQEBB for Tx queue. + * + * Simplified version of calc_sq_size() in rdma-core. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. 
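+ *
+ * The per-descriptor WQE size (control, Ethernet and data segments plus
+ * any configured inline data) is multiplied by the descriptor count,
+ * rounded up to the next power of two and expressed in 64-byte WQE
+ * basic blocks (WQEBB).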
+ * + * @return + * The number of WQEBB. + */ +static int +txq_calc_wqebb_cnt(struct mlx5_txq_ctrl *txq_ctrl) +{ + unsigned int wqe_size; + const unsigned int desc = 1 << txq_ctrl->txq.elts_n; + + wqe_size = MLX5_WQE_CSEG_SIZE + + MLX5_WQE_ESEG_SIZE + + MLX5_WSEG_SIZE - + MLX5_ESEG_MIN_INLINE_SIZE + + txq_ctrl->max_inline_data; + return rte_align32pow2(wqe_size * desc) / MLX5_WQE_SIZE; +} + +/** + * Calculate the maximal inline data size for Tx queue. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * + * @return + * The maximal inline data size. + */ +static unsigned int +txq_calc_inline_max(struct mlx5_txq_ctrl *txq_ctrl) +{ + const unsigned int desc = 1 << txq_ctrl->txq.elts_n; + struct mlx5_priv *priv = txq_ctrl->priv; + unsigned int wqe_size; + + wqe_size = priv->sh->device_attr.orig_attr.max_qp_wr / desc; + if (!wqe_size) + return 0; + /* + * This calculation is derived from tthe source of + * mlx5_calc_send_wqe() in rdma_core library. + */ + wqe_size = wqe_size * MLX5_WQE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WSEG_SIZE - + MLX5_WSEG_SIZE + + MLX5_DSEG_MIN_INLINE_SIZE; + return wqe_size; +} + +/** + * Set Tx queue parameters from device configuration. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + */ +static void +txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_dev_config *config = &priv->config; + unsigned int inlen_send; /* Inline data for ordinary SEND.*/ + unsigned int inlen_empw; /* Inline data for enhanced MPW. */ + unsigned int inlen_mode; /* Minimal required Inline data. */ + unsigned int txqs_inline; /* Min Tx queues to enable inline. */ + uint64_t dev_txoff = priv->dev_data->dev_conf.txmode.offloads; + bool tso = txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); + bool vlan_inline; + unsigned int temp; + + if (config->txqs_inline == MLX5_ARG_UNSET) + txqs_inline = +#if defined(RTE_ARCH_ARM64) + (priv->pci_dev->id.device_id == + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) ? + MLX5_INLINE_MAX_TXQS_BLUEFIELD : +#endif + MLX5_INLINE_MAX_TXQS; + else + txqs_inline = (unsigned int)config->txqs_inline; + inlen_send = (config->txq_inline_max == MLX5_ARG_UNSET) ? + MLX5_SEND_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_max; + inlen_empw = (config->txq_inline_mpw == MLX5_ARG_UNSET) ? + MLX5_EMPW_DEF_INLINE_LEN : + (unsigned int)config->txq_inline_mpw; + inlen_mode = (config->txq_inline_min == MLX5_ARG_UNSET) ? + 0 : (unsigned int)config->txq_inline_min; + if (config->mps != MLX5_MPW_ENHANCED && config->mps != MLX5_MPW) + inlen_empw = 0; + /* + * If there is requested minimal amount of data to inline + * we MUST enable inlining. This is a case for ConnectX-4 + * which usually requires L2 inlined for correct operating + * and ConnectX-4 Lx which requires L2-L4 inlined to + * support E-Switch Flows. + */ + if (inlen_mode) { + if (inlen_mode <= MLX5_ESEG_MIN_INLINE_SIZE) { + /* + * Optimize minimal inlining for single + * segment packets to fill one WQEBB + * without gaps. 
+ */ + temp = MLX5_ESEG_MIN_INLINE_SIZE; + } else { + temp = inlen_mode - MLX5_ESEG_MIN_INLINE_SIZE; + temp = RTE_ALIGN(temp, MLX5_WSEG_SIZE) + + MLX5_ESEG_MIN_INLINE_SIZE; + temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN); + } + if (temp != inlen_mode) { + DRV_LOG(INFO, + "port %u minimal required inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_mode, temp); + inlen_mode = temp; + } + } + /* + * If port is configured to support VLAN insertion and device + * does not support this feature by HW (for NICs before ConnectX-5 + * or in case of wqe_vlan_insert flag is not set) we must enable + * data inline on all queues because it is supported by single + * tx_burst routine. + */ + txq_ctrl->txq.vlan_en = config->hw_vlan_insert; + vlan_inline = (dev_txoff & DEV_TX_OFFLOAD_VLAN_INSERT) && + !config->hw_vlan_insert; + /* + * If there are few Tx queues it is prioritized + * to save CPU cycles and disable data inlining at all. + */ + if (inlen_send && priv->txqs_n >= txqs_inline) { + /* + * The data sent with ordinal MLX5_OPCODE_SEND + * may be inlined in Ethernet Segment, align the + * length accordingly to fit entire WQEBBs. + */ + temp = RTE_MAX(inlen_send, + MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE); + temp -= MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE; + temp = RTE_ALIGN(temp, MLX5_WQE_SIZE); + temp += MLX5_ESEG_MIN_INLINE_SIZE + MLX5_WQE_DSEG_SIZE; + temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + temp = RTE_MIN(temp, MLX5_SEND_MAX_INLINE_LEN); + temp = RTE_MAX(temp, inlen_mode); + if (temp != inlen_send) { + DRV_LOG(INFO, + "port %u ordinary send inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_send, temp); + inlen_send = temp; + } + /* + * Not aligned to cache lines, but to WQEs. + * First bytes of data (initial alignment) + * is going to be copied explicitly at the + * beginning of inlining buffer in Ethernet + * Segment. + */ + MLX5_ASSERT(inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(inlen_send <= MLX5_WQE_SIZE_MAX + + MLX5_ESEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE * 2); + } else if (inlen_mode) { + /* + * If minimal inlining is requested we must + * enable inlining in general, despite the + * number of configured queues. Ignore the + * txq_inline_max devarg, this is not + * full-featured inline. + */ + inlen_send = inlen_mode; + inlen_empw = 0; + } else if (vlan_inline) { + /* + * Hardware does not report offload for + * VLAN insertion, we must enable data inline + * to implement feature by software. + */ + inlen_send = MLX5_ESEG_MIN_INLINE_SIZE; + inlen_empw = 0; + } else { + inlen_send = 0; + inlen_empw = 0; + } + txq_ctrl->txq.inlen_send = inlen_send; + txq_ctrl->txq.inlen_mode = inlen_mode; + txq_ctrl->txq.inlen_empw = 0; + if (inlen_send && inlen_empw && priv->txqs_n >= txqs_inline) { + /* + * The data sent with MLX5_OPCODE_ENHANCED_MPSW + * may be inlined in Data Segment, align the + * length accordingly to fit entire WQEBBs. 
+ */ + temp = RTE_MAX(inlen_empw, + MLX5_WQE_SIZE + MLX5_DSEG_MIN_INLINE_SIZE); + temp -= MLX5_DSEG_MIN_INLINE_SIZE; + temp = RTE_ALIGN(temp, MLX5_WQE_SIZE); + temp += MLX5_DSEG_MIN_INLINE_SIZE; + temp = RTE_MIN(temp, MLX5_WQE_SIZE_MAX + + MLX5_DSEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE); + temp = RTE_MIN(temp, MLX5_EMPW_MAX_INLINE_LEN); + if (temp != inlen_empw) { + DRV_LOG(INFO, + "port %u enhanced empw inline setting" + " aligned from %u to %u", + PORT_ID(priv), inlen_empw, temp); + inlen_empw = temp; + } + MLX5_ASSERT(inlen_empw >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(inlen_empw <= MLX5_WQE_SIZE_MAX + + MLX5_DSEG_MIN_INLINE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE - + MLX5_WQE_DSEG_SIZE); + txq_ctrl->txq.inlen_empw = inlen_empw; + } + txq_ctrl->max_inline_data = RTE_MAX(inlen_send, inlen_empw); + if (tso) { + txq_ctrl->max_tso_header = MLX5_MAX_TSO_HEADER; + txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->max_inline_data, + MLX5_MAX_TSO_HEADER); + txq_ctrl->txq.tso_en = 1; + } + txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp; + txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) & + txq_ctrl->txq.offloads) && config->swp; +} + +/** + * Adjust Tx queue data inline parameters for large queue sizes. + * The data inline feature requires multiple WQEs to fit the packets, + * and if the large amount of Tx descriptors is requested by application + * the total WQE amount may exceed the hardware capabilities. If the + * default inline setting are used we can try to adjust these ones and + * meet the hardware requirements and not exceed the queue size. + * + * @param txq_ctrl + * Pointer to Tx queue control structure. + * + * @return + * Zero on success, otherwise the parameters can not be adjusted. + */ +static int +txq_adjust_params(struct mlx5_txq_ctrl *txq_ctrl) +{ + struct mlx5_priv *priv = txq_ctrl->priv; + struct mlx5_dev_config *config = &priv->config; + unsigned int max_inline; + + max_inline = txq_calc_inline_max(txq_ctrl); + if (!txq_ctrl->txq.inlen_send) { + /* + * Inline data feature is not engaged at all. + * There is nothing to adjust. + */ + return 0; + } + if (txq_ctrl->max_inline_data <= max_inline) { + /* + * The requested inline data length does not + * exceed queue capabilities. 
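+		 * For reference, max_inline comes from txq_calc_inline_max():
+		 * with illustrative numbers max_qp_wr = 32768 and 1024
+		 * descriptors, each descriptor may span up to 32 WQEBBs,
+		 * giving roughly 32 * 64 - 4 * 16 + 12 = 1996 bytes of
+		 * inline room.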
+ */ + return 0; + } + if (txq_ctrl->txq.inlen_mode > max_inline) { + DRV_LOG(ERR, + "minimal data inline requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_mode, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_send > max_inline && + config->txq_inline_max != MLX5_ARG_UNSET && + config->txq_inline_max > (int)max_inline) { + DRV_LOG(ERR, + "txq_inline_max requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_send, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_empw > max_inline && + config->txq_inline_mpw != MLX5_ARG_UNSET && + config->txq_inline_mpw > (int)max_inline) { + DRV_LOG(ERR, + "txq_inline_mpw requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + txq_ctrl->txq.inlen_empw, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.tso_en && max_inline < MLX5_MAX_TSO_HEADER) { + DRV_LOG(ERR, + "tso header inline requirements (%u) are not" + " satisfied (%u) on port %u, try the smaller" + " Tx queue size (%d)", + MLX5_MAX_TSO_HEADER, max_inline, + priv->dev_data->port_id, + priv->sh->device_attr.orig_attr.max_qp_wr); + goto error; + } + if (txq_ctrl->txq.inlen_send > max_inline) { + DRV_LOG(WARNING, + "adjust txq_inline_max (%u->%u)" + " due to large Tx queue on port %u", + txq_ctrl->txq.inlen_send, max_inline, + priv->dev_data->port_id); + txq_ctrl->txq.inlen_send = max_inline; + } + if (txq_ctrl->txq.inlen_empw > max_inline) { + DRV_LOG(WARNING, + "adjust txq_inline_mpw (%u->%u)" + "due to large Tx queue on port %u", + txq_ctrl->txq.inlen_empw, max_inline, + priv->dev_data->port_id); + txq_ctrl->txq.inlen_empw = max_inline; + } + txq_ctrl->max_inline_data = RTE_MAX(txq_ctrl->txq.inlen_send, + txq_ctrl->txq.inlen_empw); + MLX5_ASSERT(txq_ctrl->max_inline_data <= max_inline); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= max_inline); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_send); + MLX5_ASSERT(txq_ctrl->txq.inlen_mode <= txq_ctrl->txq.inlen_empw || + !txq_ctrl->txq.inlen_empw); + return 0; +error: + rte_errno = ENOMEM; + return -ENOMEM; +} + +/** + * Create a DPDK Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_ctrl * +mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *tmpl; + + tmpl = rte_calloc_socket("TXQ", 1, + sizeof(*tmpl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + if (mlx5_mr_btree_init(&tmpl->txq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + /* Save pointer of global generation number to check memory event. 
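+	 * The Tx data path compares its cached generation against this shared
+	 * counter; a mismatch indicates the global MR cache changed (e.g.
+	 * memory was freed) and the per-queue MR lookup cache is refreshed.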
*/ + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->share_cache.dev_gen; + MLX5_ASSERT(desc > MLX5_TX_COMP_THRESH); + tmpl->txq.offloads = conf->offloads | + dev->data->dev_conf.txmode.offloads; + tmpl->priv = priv; + tmpl->socket = socket; + tmpl->txq.elts_n = log2above(desc); + tmpl->txq.elts_s = desc; + tmpl->txq.elts_m = desc - 1; + tmpl->txq.port_id = dev->data->port_id; + tmpl->txq.idx = idx; + txq_set_params(tmpl); + if (txq_adjust_params(tmpl)) + goto error; + if (txq_calc_wqebb_cnt(tmpl) > + priv->sh->device_attr.orig_attr.max_qp_wr) { + DRV_LOG(ERR, + "port %u Tx WQEBB count (%d) exceeds the limit (%d)," + " try smaller queue size", + dev->data->port_id, txq_calc_wqebb_cnt(tmpl), + priv->sh->device_attr.orig_attr.max_qp_wr); + rte_errno = ENOMEM; + goto error; + } + rte_atomic32_inc(&tmpl->refcnt); + tmpl->type = MLX5_TXQ_TYPE_STANDARD; + LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); + return tmpl; +error: + rte_free(tmpl); + return NULL; +} + +/** + * Create a DPDK Tx hairpin queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param hairpin_conf + * The hairpin configuration. + * + * @return + * A DPDK queue object on success, NULL otherwise and rte_errno is set. + */ +struct mlx5_txq_ctrl * +mlx5_txq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + const struct rte_eth_hairpin_conf *hairpin_conf) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *tmpl; + + tmpl = rte_calloc_socket("TXQ", 1, + sizeof(*tmpl), 0, SOCKET_ID_ANY); + if (!tmpl) { + rte_errno = ENOMEM; + return NULL; + } + tmpl->priv = priv; + tmpl->socket = SOCKET_ID_ANY; + tmpl->txq.elts_n = log2above(desc); + tmpl->txq.port_id = dev->data->port_id; + tmpl->txq.idx = idx; + tmpl->hairpin_conf = *hairpin_conf; + tmpl->type = MLX5_TXQ_TYPE_HAIRPIN; + rte_atomic32_inc(&tmpl->refcnt); + LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); + return tmpl; +} + +/** + * Get a Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * A pointer to the queue if it exists. + */ +struct mlx5_txq_ctrl * +mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *ctrl = NULL; + + if ((*priv->txqs)[idx]) { + ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, + txq); + mlx5_txq_obj_get(dev, idx); + rte_atomic32_inc(&ctrl->refcnt); + } + return ctrl; +} + +/** + * Release a Tx queue. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +int +mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq; + + if (!(*priv->txqs)[idx]) + return 0; + txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + if (txq->obj && !mlx5_txq_obj_release(txq->obj)) + txq->obj = NULL; + if (rte_atomic32_dec_and_test(&txq->refcnt)) { + txq_free_elts(txq); + mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh); + LIST_REMOVE(txq, next); + rte_free(txq); + (*priv->txqs)[idx] = NULL; + return 0; + } + return 1; +} + +/** + * Verify if the queue can be released. + * + * @param dev + * Pointer to Ethernet device. + * @param idx + * TX queue index. + * + * @return + * 1 if the queue can be released. 
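+ *   (only the control structure itself still holds a reference),
+ *   0 if it is still in use, -1 if the queue does not exist.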
+ */ +int +mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq; + + if (!(*priv->txqs)[idx]) + return -1; + txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); + return (rte_atomic32_read(&txq->refcnt) == 1); +} + +/** + * Verify the Tx Queue list is empty + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The number of object not released. + */ +int +mlx5_txq_verify(struct rte_eth_dev *dev) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_txq_ctrl *txq_ctrl; + int ret = 0; + + LIST_FOREACH(txq_ctrl, &priv->txqsctrl, next) { + DRV_LOG(DEBUG, "port %u Tx queue %u still referenced", + dev->data->port_id, txq_ctrl->txq.idx); + ++ret; + } + return ret; +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c new file mode 100644 index 000000000..d29fbcbc8 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.c @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2019 Mellanox Technologies, Ltd + */ + +#include <rte_malloc.h> +#include <rte_hash_crc.h> + +#include "mlx5_utils.h" + +struct mlx5_hlist * +mlx5_hlist_create(const char *name, uint32_t size) +{ + struct mlx5_hlist *h; + uint32_t act_size; + uint32_t alloc_size; + + if (!size) + return NULL; + /* Align to the next power of 2, 32bits integer is enough now. */ + if (!rte_is_power_of_2(size)) { + act_size = rte_align32pow2(size); + DRV_LOG(WARNING, "Size 0x%" PRIX32 " is not power of 2, will " + "be aligned to 0x%" PRIX32 ".\n", size, act_size); + } else { + act_size = size; + } + alloc_size = sizeof(struct mlx5_hlist) + + sizeof(struct mlx5_hlist_head) * act_size; + /* Using zmalloc, then no need to initialize the heads. */ + h = rte_zmalloc(name, alloc_size, RTE_CACHE_LINE_SIZE); + if (!h) { + DRV_LOG(ERR, "No memory for hash list %s creation\n", + name ? name : "None"); + return NULL; + } + if (name) + snprintf(h->name, MLX5_HLIST_NAMESIZE, "%s", name); + h->table_sz = act_size; + h->mask = act_size - 1; + DRV_LOG(DEBUG, "Hash list with %s size 0x%" PRIX32 " is created.\n", + h->name, act_size); + return h; +} + +struct mlx5_hlist_entry * +mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key) +{ + uint32_t idx; + struct mlx5_hlist_head *first; + struct mlx5_hlist_entry *node; + + MLX5_ASSERT(h); + idx = rte_hash_crc_8byte(key, 0) & h->mask; + first = &h->heads[idx]; + LIST_FOREACH(node, first, next) { + if (node->key == key) + return node; + } + return NULL; +} + +int +mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry) +{ + uint32_t idx; + struct mlx5_hlist_head *first; + struct mlx5_hlist_entry *node; + + MLX5_ASSERT(h && entry); + idx = rte_hash_crc_8byte(entry->key, 0) & h->mask; + first = &h->heads[idx]; + /* No need to reuse the lookup function. */ + LIST_FOREACH(node, first, next) { + if (node->key == entry->key) + return -EEXIST; + } + LIST_INSERT_HEAD(first, entry, next); + return 0; +} + +void +mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused, + struct mlx5_hlist_entry *entry) +{ + MLX5_ASSERT(entry && entry->next.le_prev); + LIST_REMOVE(entry, next); + /* Set to NULL to get rid of removing action for more than once. 
*/ + entry->next.le_prev = NULL; +} + +void +mlx5_hlist_destroy(struct mlx5_hlist *h, + mlx5_hlist_destroy_callback_fn cb, void *ctx) +{ + uint32_t idx; + struct mlx5_hlist_entry *entry; + + MLX5_ASSERT(h); + for (idx = 0; idx < h->table_sz; ++idx) { + /* no LIST_FOREACH_SAFE, using while instead */ + while (!LIST_EMPTY(&h->heads[idx])) { + entry = LIST_FIRST(&h->heads[idx]); + LIST_REMOVE(entry, next); + /* + * The owner of whole element which contains data entry + * is the user, so it's the user's duty to do the clean + * up and the free work because someone may not put the + * hlist entry at the beginning(suggested to locate at + * the beginning). Or else the default free function + * will be used. + */ + if (cb) + cb(entry, ctx); + else + rte_free(entry); + } + } + rte_free(h); +} + +static inline void +mlx5_ipool_lock(struct mlx5_indexed_pool *pool) +{ + if (pool->cfg.need_lock) + rte_spinlock_lock(&pool->lock); +} + +static inline void +mlx5_ipool_unlock(struct mlx5_indexed_pool *pool) +{ + if (pool->cfg.need_lock) + rte_spinlock_unlock(&pool->lock); +} + +static inline uint32_t +mlx5_trunk_idx_get(struct mlx5_indexed_pool *pool, uint32_t entry_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + uint32_t trunk_idx = 0; + uint32_t i; + + if (!cfg->grow_trunk) + return entry_idx / cfg->trunk_size; + if (entry_idx >= pool->grow_tbl[cfg->grow_trunk - 1]) { + trunk_idx = (entry_idx - pool->grow_tbl[cfg->grow_trunk - 1]) / + (cfg->trunk_size << (cfg->grow_shift * + cfg->grow_trunk)) + cfg->grow_trunk; + } else { + for (i = 0; i < cfg->grow_trunk; i++) { + if (entry_idx < pool->grow_tbl[i]) + break; + } + trunk_idx = i; + } + return trunk_idx; +} + +static inline uint32_t +mlx5_trunk_size_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + + return cfg->trunk_size << (cfg->grow_shift * + (trunk_idx > cfg->grow_trunk ? cfg->grow_trunk : trunk_idx)); +} + +static inline uint32_t +mlx5_trunk_idx_offset_get(struct mlx5_indexed_pool *pool, uint32_t trunk_idx) +{ + struct mlx5_indexed_pool_config *cfg = &pool->cfg; + uint32_t offset = 0; + + if (!trunk_idx) + return 0; + if (!cfg->grow_trunk) + return cfg->trunk_size * trunk_idx; + if (trunk_idx < cfg->grow_trunk) + offset = pool->grow_tbl[trunk_idx - 1]; + else + offset = pool->grow_tbl[cfg->grow_trunk - 1] + + (cfg->trunk_size << (cfg->grow_shift * + cfg->grow_trunk)) * (trunk_idx - cfg->grow_trunk); + return offset; +} + +struct mlx5_indexed_pool * +mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg) +{ + struct mlx5_indexed_pool *pool; + uint32_t i; + + if (!cfg || !cfg->size || (!cfg->malloc ^ !cfg->free) || + (cfg->trunk_size && ((cfg->trunk_size & (cfg->trunk_size - 1)) || + ((__builtin_ffs(cfg->trunk_size) + TRUNK_IDX_BITS) > 32)))) + return NULL; + pool = rte_zmalloc("mlx5_ipool", sizeof(*pool) + cfg->grow_trunk * + sizeof(pool->grow_tbl[0]), RTE_CACHE_LINE_SIZE); + if (!pool) + return NULL; + pool->cfg = *cfg; + if (!pool->cfg.trunk_size) + pool->cfg.trunk_size = MLX5_IPOOL_DEFAULT_TRUNK_SIZE; + if (!cfg->malloc && !cfg->free) { + pool->cfg.malloc = rte_malloc_socket; + pool->cfg.free = rte_free; + } + pool->free_list = TRUNK_INVALID; + if (pool->cfg.need_lock) + rte_spinlock_init(&pool->lock); + /* + * Initialize the dynamic grow trunk size lookup table to have a quick + * lookup for the trunk entry index offset. 
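+	 * Illustrative example: with trunk_size = 64, grow_shift = 1 and
+	 * grow_trunk = 3 the table becomes {64, 64 + 128 = 192,
+	 * 192 + 256 = 448}, i.e. the cumulative entry count covered by
+	 * the growing trunks.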
+ */ + for (i = 0; i < cfg->grow_trunk; i++) { + pool->grow_tbl[i] = cfg->trunk_size << (cfg->grow_shift * i); + if (i > 0) + pool->grow_tbl[i] += pool->grow_tbl[i - 1]; + } + return pool; +} + +static int +mlx5_ipool_grow(struct mlx5_indexed_pool *pool) +{ + struct mlx5_indexed_trunk *trunk; + struct mlx5_indexed_trunk **trunk_tmp; + struct mlx5_indexed_trunk **p; + size_t trunk_size = 0; + size_t data_size; + size_t bmp_size; + uint32_t idx; + + if (pool->n_trunk_valid == TRUNK_MAX_IDX) + return -ENOMEM; + if (pool->n_trunk_valid == pool->n_trunk) { + /* No free trunk flags, expand trunk list. */ + int n_grow = pool->n_trunk_valid ? pool->n_trunk : + RTE_CACHE_LINE_SIZE / sizeof(void *); + + p = pool->cfg.malloc(pool->cfg.type, + (pool->n_trunk_valid + n_grow) * + sizeof(struct mlx5_indexed_trunk *), + RTE_CACHE_LINE_SIZE, rte_socket_id()); + if (!p) + return -ENOMEM; + if (pool->trunks) + memcpy(p, pool->trunks, pool->n_trunk_valid * + sizeof(struct mlx5_indexed_trunk *)); + memset(RTE_PTR_ADD(p, pool->n_trunk_valid * sizeof(void *)), 0, + n_grow * sizeof(void *)); + trunk_tmp = pool->trunks; + pool->trunks = p; + if (trunk_tmp) + pool->cfg.free(trunk_tmp); + pool->n_trunk += n_grow; + } + if (!pool->cfg.release_mem_en) { + idx = pool->n_trunk_valid; + } else { + /* Find the first available slot in trunk list */ + for (idx = 0; idx < pool->n_trunk; idx++) + if (pool->trunks[idx] == NULL) + break; + } + trunk_size += sizeof(*trunk); + data_size = mlx5_trunk_size_get(pool, idx); + bmp_size = rte_bitmap_get_memory_footprint(data_size); + /* rte_bitmap requires memory cacheline aligned. */ + trunk_size += RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size); + trunk_size += bmp_size; + trunk = pool->cfg.malloc(pool->cfg.type, trunk_size, + RTE_CACHE_LINE_SIZE, rte_socket_id()); + if (!trunk) + return -ENOMEM; + pool->trunks[idx] = trunk; + trunk->idx = idx; + trunk->free = data_size; + trunk->prev = TRUNK_INVALID; + trunk->next = TRUNK_INVALID; + MLX5_ASSERT(pool->free_list == TRUNK_INVALID); + pool->free_list = idx; + /* Mark all entries as available. */ + trunk->bmp = rte_bitmap_init_with_all_set(data_size, &trunk->data + [RTE_CACHE_LINE_ROUNDUP(data_size * pool->cfg.size)], + bmp_size); + MLX5_ASSERT(trunk->bmp); + pool->n_trunk_valid++; +#ifdef POOL_DEBUG + pool->trunk_new++; + pool->trunk_avail++; +#endif + return 0; +} + +void * +mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx) +{ + struct mlx5_indexed_trunk *trunk; + uint64_t slab = 0; + uint32_t iidx = 0; + void *p; + + mlx5_ipool_lock(pool); + if (pool->free_list == TRUNK_INVALID) { + /* If no available trunks, grow new. */ + if (mlx5_ipool_grow(pool)) { + mlx5_ipool_unlock(pool); + return NULL; + } + } + MLX5_ASSERT(pool->free_list != TRUNK_INVALID); + trunk = pool->trunks[pool->free_list]; + MLX5_ASSERT(trunk->free); + if (!rte_bitmap_scan(trunk->bmp, &iidx, &slab)) { + mlx5_ipool_unlock(pool); + return NULL; + } + MLX5_ASSERT(slab); + iidx += __builtin_ctzll(slab); + MLX5_ASSERT(iidx != UINT32_MAX); + MLX5_ASSERT(iidx < mlx5_trunk_size_get(pool, trunk->idx)); + rte_bitmap_clear(trunk->bmp, iidx); + p = &trunk->data[iidx * pool->cfg.size]; + iidx += mlx5_trunk_idx_offset_get(pool, trunk->idx); + iidx += 1; /* non-zero index. */ + trunk->free--; +#ifdef POOL_DEBUG + pool->n_entry++; +#endif + if (!trunk->free) { + /* Full trunk will be removed from free list in imalloc. 
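+		 * At this point the trunk is the head of the free list
+		 * (entries are only handed out from the head), so unlinking
+		 * it simply advances pool->free_list to the next trunk.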
*/ + MLX5_ASSERT(pool->free_list == trunk->idx); + pool->free_list = trunk->next; + if (trunk->next != TRUNK_INVALID) + pool->trunks[trunk->next]->prev = TRUNK_INVALID; + trunk->prev = TRUNK_INVALID; + trunk->next = TRUNK_INVALID; +#ifdef POOL_DEBUG + pool->trunk_empty++; + pool->trunk_avail--; +#endif + } + *idx = iidx; + mlx5_ipool_unlock(pool); + return p; +} + +void * +mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx) +{ + void *entry = mlx5_ipool_malloc(pool, idx); + + if (entry) + memset(entry, 0, pool->cfg.size); + return entry; +} + +void +mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx) +{ + struct mlx5_indexed_trunk *trunk; + uint32_t trunk_idx; + uint32_t entry_idx; + + if (!idx) + return; + idx -= 1; + mlx5_ipool_lock(pool); + trunk_idx = mlx5_trunk_idx_get(pool, idx); + if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) || + (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk)) + goto out; + trunk = pool->trunks[trunk_idx]; + if (!trunk) + goto out; + entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx); + if (trunk_idx != trunk->idx || + rte_bitmap_get(trunk->bmp, entry_idx)) + goto out; + rte_bitmap_set(trunk->bmp, entry_idx); + trunk->free++; + if (pool->cfg.release_mem_en && trunk->free == mlx5_trunk_size_get + (pool, trunk->idx)) { + if (pool->free_list == trunk->idx) + pool->free_list = trunk->next; + if (trunk->next != TRUNK_INVALID) + pool->trunks[trunk->next]->prev = trunk->prev; + if (trunk->prev != TRUNK_INVALID) + pool->trunks[trunk->prev]->next = trunk->next; + pool->cfg.free(trunk); + pool->trunks[trunk_idx] = NULL; + pool->n_trunk_valid--; +#ifdef POOL_DEBUG + pool->trunk_avail--; + pool->trunk_free++; +#endif + if (pool->n_trunk_valid == 0) { + pool->cfg.free(pool->trunks); + pool->trunks = NULL; + pool->n_trunk = 0; + } + } else if (trunk->free == 1) { + /* Put into free trunk list head. 
*/ + MLX5_ASSERT(pool->free_list != trunk->idx); + trunk->next = pool->free_list; + trunk->prev = TRUNK_INVALID; + if (pool->free_list != TRUNK_INVALID) + pool->trunks[pool->free_list]->prev = trunk->idx; + pool->free_list = trunk->idx; +#ifdef POOL_DEBUG + pool->trunk_empty--; + pool->trunk_avail++; +#endif + } +#ifdef POOL_DEBUG + pool->n_entry--; +#endif +out: + mlx5_ipool_unlock(pool); +} + +void * +mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx) +{ + struct mlx5_indexed_trunk *trunk; + void *p = NULL; + uint32_t trunk_idx; + uint32_t entry_idx; + + if (!idx) + return NULL; + idx -= 1; + mlx5_ipool_lock(pool); + trunk_idx = mlx5_trunk_idx_get(pool, idx); + if ((!pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk_valid) || + (pool->cfg.release_mem_en && trunk_idx >= pool->n_trunk)) + goto out; + trunk = pool->trunks[trunk_idx]; + if (!trunk) + goto out; + entry_idx = idx - mlx5_trunk_idx_offset_get(pool, trunk->idx); + if (trunk_idx != trunk->idx || + rte_bitmap_get(trunk->bmp, entry_idx)) + goto out; + p = &trunk->data[entry_idx * pool->cfg.size]; +out: + mlx5_ipool_unlock(pool); + return p; +} + +int +mlx5_ipool_destroy(struct mlx5_indexed_pool *pool) +{ + struct mlx5_indexed_trunk **trunks; + uint32_t i; + + MLX5_ASSERT(pool); + mlx5_ipool_lock(pool); + trunks = pool->trunks; + for (i = 0; i < pool->n_trunk; i++) { + if (trunks[i]) + pool->cfg.free(trunks[i]); + } + if (!pool->trunks) + pool->cfg.free(pool->trunks); + mlx5_ipool_unlock(pool); + rte_free(pool); + return 0; +} + +void +mlx5_ipool_dump(struct mlx5_indexed_pool *pool) +{ + printf("Pool %s entry size %u, trunks %u, %d entry per trunk, " + "total: %d\n", + pool->cfg.type, pool->cfg.size, pool->n_trunk_valid, + pool->cfg.trunk_size, pool->n_trunk_valid); +#ifdef POOL_DEBUG + printf("Pool %s entry %u, trunk alloc %u, empty: %u, " + "available %u free %u\n", + pool->cfg.type, pool->n_entry, pool->trunk_new, + pool->trunk_empty, pool->trunk_avail, pool->trunk_free); +#endif +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h new file mode 100644 index 000000000..f4ec15170 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_utils.h @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_UTILS_H_ +#define RTE_PMD_MLX5_UTILS_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <limits.h> +#include <errno.h> + +#include <rte_spinlock.h> +#include <rte_memory.h> +#include <rte_bitmap.h> + +#include <mlx5_common.h> + +#include "mlx5_defs.h" + + +/* Convert a bit number to the corresponding 64-bit mask */ +#define MLX5_BITSHIFT(v) (UINT64_C(1) << (v)) + +/* Save and restore errno around argument evaluation. */ +#define ERRNO_SAFE(x) ((errno = (int []){ errno, ((x), 0) }[0])) + +extern int mlx5_logtype; + +/* Generic printf()-like logging macro with automatic line feed. */ +#define DRV_LOG(level, ...) \ + PMD_DRV_LOG_(level, mlx5_logtype, MLX5_DRIVER_NAME, \ + __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \ + PMD_DRV_LOG_CPAREN) + +#define INFO(...) DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) DRV_LOG(ERR, __VA_ARGS__) + +/* Convenience macros for accessing mbuf fields. 
*/ +#define NEXT(m) ((m)->next) +#define DATA_LEN(m) ((m)->data_len) +#define PKT_LEN(m) ((m)->pkt_len) +#define DATA_OFF(m) ((m)->data_off) +#define SET_DATA_OFF(m, o) ((m)->data_off = (o)) +#define NB_SEGS(m) ((m)->nb_segs) +#define PORT(m) ((m)->port) + +/* Transpose flags. Useful to convert IBV to DPDK flags. */ +#define TRANSPOSE(val, from, to) \ + (((from) >= (to)) ? \ + (((val) & (from)) / ((from) / (to))) : \ + (((val) & (from)) * ((to) / (from)))) + +/* + * The indexed memory entry index is made up of trunk index and offset of + * the entry in the trunk. Since the entry index is 32 bits, in case user + * prefers to have small trunks, user can change the macro below to a big + * number which helps the pool contains more trunks with lots of entries + * allocated. + */ +#define TRUNK_IDX_BITS 16 +#define TRUNK_MAX_IDX ((1 << TRUNK_IDX_BITS) - 1) +#define TRUNK_INVALID TRUNK_MAX_IDX +#define MLX5_IPOOL_DEFAULT_TRUNK_SIZE (1 << (28 - TRUNK_IDX_BITS)) +#ifdef RTE_LIBRTE_MLX5_DEBUG +#define POOL_DEBUG 1 +#endif + +struct mlx5_indexed_pool_config { + uint32_t size; /* Pool entry size. */ + uint32_t trunk_size:22; + /* + * Trunk entry number. Must be power of 2. It can be increased + * if trunk_grow enable. The trunk entry number increases with + * left shift grow_shift. Trunks with index are after grow_trunk + * will keep the entry number same with the last grow trunk. + */ + uint32_t grow_trunk:4; + /* + * Trunks with entry number increase in the pool. Set it to 0 + * to make the pool works as trunk entry fixed pool. It works + * only if grow_shift is not 0. + */ + uint32_t grow_shift:4; + /* + * Trunk entry number increase shift value, stop after grow_trunk. + * It works only if grow_trunk is not 0. + */ + uint32_t need_lock:1; + /* Lock is needed for multiple thread usage. */ + uint32_t release_mem_en:1; /* Rlease trunk when it is free. */ + const char *type; /* Memory allocate type name. */ + void *(*malloc)(const char *type, size_t size, unsigned int align, + int socket); + /* User defined memory allocator. */ + void (*free)(void *addr); /* User defined memory release. */ +}; + +struct mlx5_indexed_trunk { + uint32_t idx; /* Trunk id. */ + uint32_t prev; /* Previous free trunk in free list. */ + uint32_t next; /* Next free trunk in free list. */ + uint32_t free; /* Free entries available */ + struct rte_bitmap *bmp; + uint8_t data[] __rte_cache_aligned; /* Entry data start. */ +}; + +struct mlx5_indexed_pool { + struct mlx5_indexed_pool_config cfg; /* Indexed pool configuration. */ + rte_spinlock_t lock; /* Pool lock for multiple thread usage. */ + uint32_t n_trunk_valid; /* Trunks allocated. */ + uint32_t n_trunk; /* Trunk pointer array size. */ + /* Dim of trunk pointer array. */ + struct mlx5_indexed_trunk **trunks; + uint32_t free_list; /* Index to first free trunk. */ +#ifdef POOL_DEBUG + uint32_t n_entry; + uint32_t trunk_new; + uint32_t trunk_avail; + uint32_t trunk_empty; + uint32_t trunk_free; +#endif + uint32_t grow_tbl[]; /* Save the index offset for the grow trunks. */ +}; + +/** + * Return logarithm of the nearest power of two above input value. + * + * @param v + * Input value. + * + * @return + * Logarithm of the nearest power of two above input value. + */ +static inline unsigned int +log2above(unsigned int v) +{ + unsigned int l; + unsigned int r; + + for (l = 0, r = 0; (v >> 1); ++l, v >>= 1) + r |= (v & 1); + return l + r; +} + +/** Maximum size of string for naming the hlist table. 
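+ * Longer names passed to mlx5_hlist_create() are truncated by snprintf().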
+ */
+#define MLX5_HLIST_NAMESIZE			32
+
+/**
+ * Structure of the entry in the hash list; the user should define their own
+ * struct that contains this in order to store the data. The 'key' is 64 bits
+ * right now and it is the user's responsibility to guarantee there is no
+ * collision.
+ */
+struct mlx5_hlist_entry {
+	LIST_ENTRY(mlx5_hlist_entry) next; /* entry pointers in the list. */
+	uint64_t key; /* user defined 'key', could be the hash signature. */
+};
+
+/** Structure for hash head. */
+LIST_HEAD(mlx5_hlist_head, mlx5_hlist_entry);
+
+/** Type of function that is used to handle the data before freeing. */
+typedef void (*mlx5_hlist_destroy_callback_fn)(void *p, void *ctx);
+
+/** Hash list table structure. */
+struct mlx5_hlist {
+	char name[MLX5_HLIST_NAMESIZE]; /**< Name of the hash list. */
+	/**< number of heads, need to be power of 2. */
+	uint32_t table_sz;
+	/**< mask to get the index of the list heads. */
+	uint32_t mask;
+	struct mlx5_hlist_head heads[];	/**< list head arrays. */
+};
+
+/**
+ * Create a hash list table. The user can specify the list heads array size
+ * of the table; the size should be a power of 2 in order to get better
+ * distribution for the entries. Each entry is a part of the whole data element
+ * and the caller should be responsible for the data element's allocation and
+ * cleanup / free. Key of each entry will be calculated with CRC in order to
+ * generate a little fairer distribution.
+ *
+ * @param name
+ *   Name of the hash list (optional).
+ * @param size
+ *   Heads array size of the hash list.
+ *
+ * @return
+ *   Pointer of the hash list table created, NULL on failure.
+ */
+struct mlx5_hlist *mlx5_hlist_create(const char *name, uint32_t size);
+
+/**
+ * Search an entry matching the key.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param key
+ *   Key for the searching entry.
+ *
+ * @return
+ *   Pointer of the hlist entry if found, NULL otherwise.
+ */
+struct mlx5_hlist_entry *mlx5_hlist_lookup(struct mlx5_hlist *h, uint64_t key);
+
+/**
+ * Insert an entry into the hash list table; the entry is only part of the
+ * whole data element, and a 64-bit key is used for matching. User should
+ * construct the key or give a calculated hash signature and guarantee there
+ * is no collision.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param entry
+ *   Entry to be inserted into the hash list table.
+ *
+ * @return
+ *   - zero for success.
+ *   - -EEXIST if the entry is already inserted.
+ */
+int mlx5_hlist_insert(struct mlx5_hlist *h, struct mlx5_hlist_entry *entry);
+
+/**
+ * Remove an entry from the hash list table. User should guarantee the validity
+ * of the entry.
+ *
+ * @param h
+ *   Pointer to the hash list table. (not used)
+ * @param entry
+ *   Entry to be removed from the hash list table.
+ */
+void mlx5_hlist_remove(struct mlx5_hlist *h __rte_unused,
+		       struct mlx5_hlist_entry *entry);
+
+/**
+ * Destroy the hash list table; all the entries already inserted into the lists
+ * will be handled by the callback function provided by the user (including
+ * free if needed) before the table is freed.
+ *
+ * @param h
+ *   Pointer to the hash list table.
+ * @param cb
+ *   Callback function for each inserted entry when destroying the hash list.
+ * @param ctx
+ *   Common context parameter used by callback function for each entry.
+ */
+void mlx5_hlist_destroy(struct mlx5_hlist *h,
+			mlx5_hlist_destroy_callback_fn cb, void *ctx);
+
+/**
+ * This function allocates a non-initialized memory entry from the pool.
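+ *
+ * Minimal usage sketch of the indexed pool API (illustrative only:
+ * "struct my_entry" and the configuration values are hypothetical,
+ * error handling is omitted):
+ *
+ *     struct mlx5_indexed_pool_config cfg = {
+ *             .size = sizeof(struct my_entry),
+ *             .trunk_size = 64,
+ *             .need_lock = 1,
+ *             .type = "my_entry_ipool",
+ *     };
+ *     struct mlx5_indexed_pool *pool = mlx5_ipool_create(&cfg);
+ *     uint32_t idx;
+ *     struct my_entry *entry = mlx5_ipool_zmalloc(pool, &idx);
+ *     ...
+ *     mlx5_ipool_free(pool, idx);
+ *     mlx5_ipool_destroy(pool);
+ *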
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from a memory trunk, no specific alignment
+ * is guaranteed.
+ *
+ * @param pool
+ *   Pointer to indexed memory entry pool.
+ *   No initialization required.
+ * @param[out] idx
+ *   Pointer to memory to save allocated index.
+ *   Memory index is always a positive value.
+ * @return
+ *   - Pointer to the allocated memory entry.
+ *   - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_malloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function allocates a zero-initialized memory entry from the pool.
+ * In NUMA systems, the memory entry allocated resides on the same
+ * NUMA socket as the core that calls this function.
+ *
+ * Memory entry is allocated from a memory trunk, no specific alignment
+ * is guaranteed.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ *   No initialization required.
+ * @param[out] idx
+ *   Pointer to memory to save allocated index.
+ *   Memory index is always a positive value.
+ * @return
+ *   - Pointer to the allocated memory entry.
+ *   - NULL on error. Not enough memory, or invalid arguments.
+ */
+void *mlx5_ipool_zmalloc(struct mlx5_indexed_pool *pool, uint32_t *idx);
+
+/**
+ * This function frees an indexed memory entry back to the pool.
+ * Caller has to make sure that the index is allocated from the same pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @param idx
+ *   Allocated memory entry index.
+ */
+void mlx5_ipool_free(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function returns the pointer of an indexed memory entry from its index.
+ * Caller has to make sure that the index is valid, and allocated
+ * from the same pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @param idx
+ *   Allocated memory index.
+ * @return
+ *   - Pointer to indexed memory entry.
+ */
+void *mlx5_ipool_get(struct mlx5_indexed_pool *pool, uint32_t idx);
+
+/**
+ * This function creates indexed memory pool.
+ * Caller has to fill the configuration structure accordingly.
+ *
+ * @param cfg
+ *   Pointer to indexed memory pool configuration.
+ * @return
+ *   - Pointer to the created pool.
+ *   - NULL on error. Not enough memory, or invalid configuration.
+ */
+struct mlx5_indexed_pool *
+mlx5_ipool_create(struct mlx5_indexed_pool_config *cfg);
+
+/**
+ * This function releases all resources of the pool.
+ * Caller has to make sure that all indexes and memories allocated
+ * from this pool are not referenced anymore.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ * @return
+ *   - non-zero value on error.
+ *   - 0 on success.
+ */
+int mlx5_ipool_destroy(struct mlx5_indexed_pool *pool);
+
+/**
+ * This function dumps debug info of the pool.
+ *
+ * @param pool
+ *   Pointer to indexed memory pool.
+ */
+void mlx5_ipool_dump(struct mlx5_indexed_pool *pool);
+
+/*
+ * Macros for linked list based on indexed memory.
+ * Example data structure:
+ * struct Foo {
+ *	ILIST_ENTRY(uint16_t) next;
+ *	...
+ * }
+ *
+ */
+#define ILIST_ENTRY(type)						\
+struct {								\
+	type prev; /* Index of previous element. */			\
+	type next; /* Index of next element.
*/ \ +} + +#define ILIST_INSERT(pool, head, idx, elem, field) \ + do { \ + typeof(elem) peer; \ + MLX5_ASSERT((elem) && (idx)); \ + (elem)->field.next = *(head); \ + (elem)->field.prev = 0; \ + if (*(head)) { \ + (peer) = mlx5_ipool_get(pool, *(head)); \ + if (peer) \ + (peer)->field.prev = (idx); \ + } \ + *(head) = (idx); \ + } while (0) + +#define ILIST_REMOVE(pool, head, idx, elem, field) \ + do { \ + typeof(elem) peer; \ + MLX5_ASSERT(elem); \ + MLX5_ASSERT(head); \ + if ((elem)->field.prev) { \ + (peer) = mlx5_ipool_get \ + (pool, (elem)->field.prev); \ + if (peer) \ + (peer)->field.next = (elem)->field.next;\ + } \ + if ((elem)->field.next) { \ + (peer) = mlx5_ipool_get \ + (pool, (elem)->field.next); \ + if (peer) \ + (peer)->field.prev = (elem)->field.prev;\ + } \ + if (*(head) == (idx)) \ + *(head) = (elem)->field.next; \ + } while (0) + +#define ILIST_FOREACH(pool, head, idx, elem, field) \ + for ((idx) = (head), (elem) = \ + (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \ + idx = (elem)->field.next, (elem) = \ + (idx) ? mlx5_ipool_get(pool, idx) : NULL) + +/* Single index list. */ +#define SILIST_ENTRY(type) \ +struct { \ + type next; /* Index of next element. */ \ +} + +#define SILIST_INSERT(head, idx, elem, field) \ + do { \ + MLX5_ASSERT((elem) && (idx)); \ + (elem)->field.next = *(head); \ + *(head) = (idx); \ + } while (0) + +#define SILIST_FOREACH(pool, head, idx, elem, field) \ + for ((idx) = (head), (elem) = \ + (idx) ? mlx5_ipool_get(pool, (idx)) : NULL; (elem); \ + idx = (elem)->field.next, (elem) = \ + (idx) ? mlx5_ipool_get(pool, idx) : NULL) + +#endif /* RTE_PMD_MLX5_UTILS_H_ */ diff --git a/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c new file mode 100644 index 000000000..f65e416da --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/mlx5_vlan.c @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2015 6WIND S.A. + * Copyright 2015 Mellanox Technologies, Ltd + */ + +#include <stddef.h> +#include <errno.h> +#include <stdint.h> +#include <unistd.h> + + +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx5dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev_driver.h> +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_hypervisor.h> + +#include <mlx5_glue.h> +#include <mlx5_devx_cmds.h> +#include <mlx5_nl.h> + +#include "mlx5.h" +#include "mlx5_autoconf.h" +#include "mlx5_rxtx.h" +#include "mlx5_utils.h" + +/** + * DPDK callback to configure a VLAN filter. + * + * @param dev + * Pointer to Ethernet device structure. + * @param vlan_id + * VLAN ID to filter. + * @param on + * Toggle filter. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + DRV_LOG(DEBUG, "port %u %s VLAN filter ID %" PRIu16, + dev->data->port_id, (on ? "enable" : "disable"), vlan_id); + MLX5_ASSERT(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter)); + for (i = 0; (i != priv->vlan_filter_n); ++i) + if (priv->vlan_filter[i] == vlan_id) + break; + /* Check if there's room for another VLAN filter. 
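+	 * The filter table is kept dense: new IDs are appended at the tail
+	 * and removal shifts the remaining entries down (memmove below), so
+	 * the first vlan_filter_n slots always hold the active filters.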
*/ + if (i == RTE_DIM(priv->vlan_filter)) { + rte_errno = ENOMEM; + return -rte_errno; + } + if (i < priv->vlan_filter_n) { + MLX5_ASSERT(priv->vlan_filter_n != 0); + /* Enabling an existing VLAN filter has no effect. */ + if (on) + goto out; + /* Remove VLAN filter from list. */ + --priv->vlan_filter_n; + memmove(&priv->vlan_filter[i], + &priv->vlan_filter[i + 1], + sizeof(priv->vlan_filter[i]) * + (priv->vlan_filter_n - i)); + priv->vlan_filter[priv->vlan_filter_n] = 0; + } else { + MLX5_ASSERT(i == priv->vlan_filter_n); + /* Disabling an unknown VLAN filter has no effect. */ + if (!on) + goto out; + /* Add new VLAN filter. */ + priv->vlan_filter[priv->vlan_filter_n] = vlan_id; + ++priv->vlan_filter_n; + } +out: + if (dev->data->dev_started) + return mlx5_traffic_restart(dev); + return 0; +} + +/** + * Callback to set/reset VLAN stripping for a specific queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param queue + * RX queue index. + * @param on + * Enable/disable VLAN stripping. + */ +void +mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[queue]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct ibv_wq_attr mod; + uint16_t vlan_offloads = + (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) | + 0; + int ret = 0; + + /* Validate hw support */ + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return; + } + /* Validate queue number */ + if (queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u VLAN stripping, invalid queue number %d", + dev->data->port_id, queue); + return; + } + DRV_LOG(DEBUG, "port %u set VLAN offloads 0x%x for port %uqueue %d", + dev->data->port_id, vlan_offloads, rxq->port_id, queue); + if (!rxq_ctrl->obj) { + /* Update related bits in RX queue. */ + rxq->vlan_strip = !!on; + return; + } + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { + mod = (struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_FLAGS, + .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING, + .flags = vlan_offloads, + }; + ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); + } else if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) { + struct mlx5_devx_modify_rq_attr rq_attr; + + memset(&rq_attr, 0, sizeof(rq_attr)); + rq_attr.rq_state = MLX5_RQC_STATE_RDY; + rq_attr.state = MLX5_RQC_STATE_RDY; + rq_attr.vsd = (on ? 0 : 1); + rq_attr.modify_bitmask = MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD; + ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr); + } + if (ret) { + DRV_LOG(ERR, "port %u failed to modify object %d stripping " + "mode: %s", dev->data->port_id, + rxq_ctrl->obj->type, strerror(rte_errno)); + return; + } + /* Update related bits in RX queue. */ + rxq->vlan_strip = !!on; +} + +/** + * Callback to set/reset VLAN offloads for a port. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mask + * VLAN offload bit mask. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
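+ *   Only ETH_VLAN_STRIP_MASK is handled here; if VLAN stripping is not
+ *   supported by the hardware an error is logged and 0 is returned.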
+ */ +int +mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) +{ + struct mlx5_priv *priv = dev->data->dev_private; + unsigned int i; + + if (mask & ETH_VLAN_STRIP_MASK) { + int hw_vlan_strip = !!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_STRIP); + + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return 0; + } + /* Run on every RX queue and set/reset VLAN stripping. */ + for (i = 0; (i != priv->rxqs_n); i++) + mlx5_vlan_strip_queue_set(dev, i, hw_vlan_strip); + } + return 0; +} + +/* + * Release VLAN network device, created for VM workaround. + * + * @param[in] dev + * Ethernet device object, Netlink context provider. + * @param[in] vlan + * Object representing the network device to release. + */ +void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vlan) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context; + struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0]; + + MLX5_ASSERT(vlan->created); + MLX5_ASSERT(priv->vmwa_context); + if (!vlan->created || !vmwa) + return; + vlan->created = 0; + MLX5_ASSERT(vlan_dev[vlan->tag].refcnt); + if (--vlan_dev[vlan->tag].refcnt == 0 && + vlan_dev[vlan->tag].ifindex) { + mlx5_nl_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex); + vlan_dev[vlan->tag].ifindex = 0; + } +} + +/** + * Acquire VLAN interface with specified tag for VM workaround. + * + * @param[in] dev + * Ethernet device object, Netlink context provider. + * @param[in] vlan + * Object representing the network device to acquire. + */ +void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev, + struct mlx5_vf_vlan *vlan) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_nl_vlan_vmwa_context *vmwa = priv->vmwa_context; + struct mlx5_nl_vlan_dev *vlan_dev = &vmwa->vlan_dev[0]; + + MLX5_ASSERT(!vlan->created); + MLX5_ASSERT(priv->vmwa_context); + if (vlan->created || !vmwa) + return; + if (vlan_dev[vlan->tag].refcnt == 0) { + MLX5_ASSERT(!vlan_dev[vlan->tag].ifindex); + vlan_dev[vlan->tag].ifindex = + mlx5_nl_vlan_vmwa_create(vmwa, vmwa->vf_ifindex, + vlan->tag); + } + if (vlan_dev[vlan->tag].ifindex) { + vlan_dev[vlan->tag].refcnt++; + vlan->created = 1; + } +} + +/* + * Create per ethernet device VLAN VM workaround context + */ +struct mlx5_nl_vlan_vmwa_context * +mlx5_vlan_vmwa_init(struct rte_eth_dev *dev, uint32_t ifindex) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + struct mlx5_nl_vlan_vmwa_context *vmwa; + enum rte_hypervisor hv_type; + + /* Do not engage workaround over PF. */ + if (!config->vf) + return NULL; + /* Check whether there is desired virtual environment */ + hv_type = rte_hypervisor_get(); + switch (hv_type) { + case RTE_HYPERVISOR_UNKNOWN: + case RTE_HYPERVISOR_VMWARE: + /* + * The "white list" of configurations + * to engage the workaround. + */ + break; + default: + /* + * The configuration is not found in the "white list". + * We should not engage the VLAN workaround. 
+ */ + return NULL; + } + vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t)); + if (!vmwa) { + DRV_LOG(WARNING, + "Can not allocate memory" + " for VLAN workaround context"); + return NULL; + } + vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE); + if (vmwa->nl_socket < 0) { + DRV_LOG(WARNING, + "Can not create Netlink socket" + " for VLAN workaround context"); + rte_free(vmwa); + return NULL; + } + vmwa->vf_ifindex = ifindex; + /* Cleanup for existing VLAN devices. */ + return vmwa; +} + +/* + * Destroy per ethernet device VLAN VM workaround context + */ +void mlx5_vlan_vmwa_exit(struct mlx5_nl_vlan_vmwa_context *vmwa) +{ + unsigned int i; + + /* Delete all remaining VLAN devices. */ + for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) { + if (vmwa->vlan_dev[i].ifindex) + mlx5_nl_vlan_vmwa_delete(vmwa, + vmwa->vlan_dev[i].ifindex); + } + if (vmwa->nl_socket >= 0) + close(vmwa->nl_socket); + rte_free(vmwa); +} diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h new file mode 100644 index 000000000..8c6922835 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2020 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_PRIVATE_MLX5_H_ +#define RTE_PMD_PRIVATE_MLX5_H_ + +/** + * @file + * MLX5 public header. + * + * This interface provides the ability to support private PMD + * dynamic flags. + */ + +#define RTE_PMD_MLX5_FINE_GRANULARITY_INLINE "mlx5_fine_granularity_inline" + +/** + * Returns the dynamic flags name, that are supported. + * + * @param[out] names + * Array that is used to return the supported dynamic flags names. + * @param[in] n + * The number of elements in the names array. + * + * @return + * The number of dynamic flags that were copied if not negative. + * Otherwise: + * - ENOMEM - not enough entries in the array + * - EINVAL - invalid array entry + */ +__rte_experimental +int rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n); + +#endif diff --git a/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map new file mode 100644 index 000000000..c8b1031b0 --- /dev/null +++ b/src/spdk/dpdk/drivers/net/mlx5/rte_pmd_mlx5_version.map @@ -0,0 +1,10 @@ +DPDK_20.0 { + local: *; +}; + +EXPERIMENTAL { + global: + + # added in 20.02 + rte_pmd_mlx5_get_dyn_flag_names; +}; |