Adding upstream version 14.2.21. (refs: upstream/14.2.21, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/dpdk/drivers/net/mlx4
parent: Initial commit. (diff)
download: ceph-upstream.tar.xz
ceph-upstream.zip
19 files changed, 9316 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/net/mlx4/Makefile b/src/spdk/dpdk/drivers/net/mlx4/Makefile
new file mode 100644
index 00000000..92e93225
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/Makefile
@@ -0,0 +1,128 @@
+#   SPDX-License-Identifier: BSD-3-Clause
+#   Copyright 2012 6WIND S.A.
+#   Copyright 2012 Mellanox Technologies, Ltd
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# Library name, plus name and version of the optional rdma-core glue plug-in.
+LIB = librte_pmd_mlx4.a
+LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION)
+LIB_GLUE_BASE = librte_pmd_mlx4_glue.so
+LIB_GLUE_VERSION = 18.02.0
+
+# Sources (mlx4_glue.c is only compiled in when dlopen() deps are disabled).
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_ethdev.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
+ifneq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c
+endif
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
+
+ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
+INSTALL-$(CONFIG_RTE_LIBRTE_MLX4_PMD)-lib += $(LIB_GLUE)
+endif
+
+# Basic CFLAGS.
+CFLAGS += -O3
+CFLAGS += -std=c11 -Wall -Wextra
+CFLAGS += -g
+CFLAGS += -I.
+CFLAGS += -D_BSD_SOURCE
+CFLAGS += -D_DEFAULT_SOURCE
+CFLAGS += -D_XOPEN_SOURCE=600
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
+CFLAGS += -DMLX4_GLUE='"$(LIB_GLUE)"'
+CFLAGS += -DMLX4_GLUE_VERSION='"$(LIB_GLUE_VERSION)"'
+CFLAGS_mlx4_glue.o += -fPIC
+LDLIBS += -ldl
+else
+LDLIBS += -libverbs -lmlx4
+endif
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_pci
+
+# A few warnings cannot be avoided in external headers.
+CFLAGS += -Wno-error=cast-qual
+
+EXPORT_MAP := rte_pmd_mlx4_version.map
+LIBABIVER := 1
+
+# DEBUG which is usually provided on the command-line may enable
+# CONFIG_RTE_LIBRTE_MLX4_DEBUG.
+ifeq ($(DEBUG),1)
+CONFIG_RTE_LIBRTE_MLX4_DEBUG := y
+endif
+
+# Debug-dependent CFLAGS: pedantic build with assertions, or NDEBUG.
+ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DEBUG),y)
+CFLAGS += -pedantic -UNDEBUG -DPEDANTIC
+else
+CFLAGS += -DNDEBUG -UPEDANTIC
+endif
+
+include $(RTE_SDK)/mk/rte.lib.mk
+
+# Generate and clean-up mlx4_autoconf.h.
+
+export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS
+export AUTO_CONFIG_CFLAGS = -Wno-error
+
+ifndef V
+AUTOCONF_OUTPUT := >/dev/null
+endif
+
+mlx4_autoconf.h.new: FORCE
+
+mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
+	$Q $(RM) -f -- '$@'
+	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
+
+# Create mlx4_autoconf.h or update it in case it differs from the new one.
+
+mlx4_autoconf.h: mlx4_autoconf.h.new
+	$Q [ -f '$@' ] && \
+		cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \
+		mv '$<' '$@'
+
+$(SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD):.c=.o): mlx4_autoconf.h
+
+# Generate dependency plug-in for rdma-core when the PMD must not be linked
+# directly, so that applications do not inherit this dependency.
+
+ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
+
+$(LIB): $(LIB_GLUE)
+
+ifeq ($(LINK_USING_CC),1)
+GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS))
+else
+GLUE_LDFLAGS := $(LDFLAGS)
+endif
+$(LIB_GLUE): mlx4_glue.o
+	$Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \
+		-Wl,-h,$(LIB_GLUE) \
+		-shared -o $@ $< -libverbs -lmlx4
+
+mlx4_glue.o: mlx4_autoconf.h
+
+endif
+
+clean_mlx4: FORCE
+	$Q rm -f -- mlx4_autoconf.h mlx4_autoconf.h.new
+	$Q rm -f -- mlx4_glue.o $(LIB_GLUE_BASE)*
+
+clean: clean_mlx4
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4.c
new file mode 100644
index 00000000..defc0d4b
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4.c
@@ -0,0 +1,1013 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2012 6WIND S.A.
+ * Copyright 2012 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * mlx4 driver initialization.
+ */
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_config.h>
+#include <rte_dev.h>
+#include <rte_errno.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_pci.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_interrupts.h>
+#include <rte_kvargs.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+
+#include "mlx4.h"
+#include "mlx4_glue.h"
+#include "mlx4_flow.h"
+#include "mlx4_mr.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+struct mlx4_dev_list mlx4_mem_event_cb_list =
+	LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+
+rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+
+/** Configuration structure for device arguments. */
+struct mlx4_conf {
+	struct {
+		uint32_t present; /**< Bit-field for existing ports. */
+		uint32_t enabled; /**< Bit-field for user-enabled ports. */
+	} ports;
+};
+
+/* NULL-terminated list of device argument keys accepted by this PMD. */
+const char *pmd_mlx4_init_params[] = {
+	MLX4_PMD_PORT_KVARG,
+	NULL,
+};
+
+static void mlx4_dev_stop(struct rte_eth_dev *dev);
+
+/**
+ * DPDK callback for Ethernet device configuration.
+ *
+ * Synchronizes internal flow rules, then installs the interrupt handler.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_dev_configure(struct rte_eth_dev *dev)
+{
+	struct priv *const priv = dev->data->dev_private;
+	struct rte_flow_error flow_err;
+	int status = mlx4_flow_sync(priv, &flow_err);
+
+	/* Interrupt installation is skipped when flow rules cannot be set. */
+	if (status != 0) {
+		ERROR("cannot set up internal flow rules (code %d, \"%s\"),"
+		      " flow error type %d, cause %p, message: %s",
+		      -status, strerror(-status), flow_err.type, flow_err.cause,
+		      flow_err.message ? flow_err.message : "(unspecified)");
+		return status;
+	}
+	status = mlx4_intr_install(priv);
+	if (status != 0)
+		ERROR("%p: interrupt handler installation failed",
+		      (void *)dev);
+	return status;
+}
+
+/**
+ * DPDK callback to start the device (no-op if it is already started).
+ *
+ * Simulate device start by initializing common RSS resources and attaching
+ * all configured flows. Any failure is rolled back through mlx4_dev_stop().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_dev_start(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+	int ret;
+
+	if (priv->started)
+		return 0;
+	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
+	priv->started = 1; /* Lets mlx4_dev_stop() undo on error. */
+	ret = mlx4_rss_init(priv);
+	if (ret) {
+		ERROR("%p: cannot initialize RSS resources: %s",
+		      (void *)dev, strerror(-ret));
+		goto err;
+	}
+#ifndef NDEBUG
+	mlx4_mr_dump_dev(dev);
+#endif
+	ret = mlx4_rxq_intr_enable(priv);
+	if (ret) {
+		ERROR("%p: interrupt handler installation failed",
+		     (void *)dev);
+		goto err;
+	}
+	ret = mlx4_flow_sync(priv, &error);
+	if (ret) {
+		ERROR("%p: cannot attach flow rules (code %d, \"%s\"),"
+		      " flow error type %d, cause %p, message: %s",
+		      (void *)dev,
+		      -ret, strerror(-ret), error.type, error.cause,
+		      error.message ? error.message : "(unspecified)");
+		goto err;
+	}
+	rte_wmb();
+	dev->tx_pkt_burst = mlx4_tx_burst;
+	dev->rx_pkt_burst = mlx4_rx_burst;
+	return 0;
+err:
+	mlx4_dev_stop(dev);
+	return ret;
+}
+
+/**
+ * DPDK callback to stop the device (no-op if it is not started).
+ *
+ * Simulate device stop by detaching all configured flows.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mlx4_dev_stop(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+
+	if (!priv->started)
+		return;
+	DEBUG("%p: detaching flows from all RX queues", (void *)dev);
+	priv->started = 0;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	rte_wmb(); /* Burst functions are swapped before resources go away. */
+	mlx4_flow_sync(priv, NULL);
+	mlx4_rxq_intr_disable(priv);
+	mlx4_rss_deinit(priv);
+}
+
+/**
+ * DPDK callback to close the device.
+ *
+ * Destroy all queues and objects, free memory, then zero the private data.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mlx4_dev_close(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	unsigned int i;
+
+	DEBUG("%p: closing device \"%s\"",
+	      (void *)dev,
+	      ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
+	rte_wmb();
+	mlx4_flow_clean(priv);
+	mlx4_rss_deinit(priv);
+	for (i = 0; i != dev->data->nb_rx_queues; ++i)
+		mlx4_rx_queue_release(dev->data->rx_queues[i]);
+	for (i = 0; i != dev->data->nb_tx_queues; ++i)
+		mlx4_tx_queue_release(dev->data->tx_queues[i]);
+	mlx4_mr_release(dev);
+	if (priv->pd != NULL) {
+		assert(priv->ctx != NULL);
+		claim_zero(mlx4_glue->dealloc_pd(priv->pd));
+		claim_zero(mlx4_glue->close_device(priv->ctx));
+	} else
+		assert(priv->ctx == NULL);
+	mlx4_intr_uninstall(priv);
+	memset(priv, 0, sizeof(*priv));
+}
+
+static const struct eth_dev_ops mlx4_dev_ops = { /* ethdev callbacks */
+	.dev_configure = mlx4_dev_configure,
+	.dev_start = mlx4_dev_start,
+	.dev_stop = mlx4_dev_stop,
+	.dev_set_link_down = mlx4_dev_set_link_down,
+	.dev_set_link_up = mlx4_dev_set_link_up,
+	.dev_close = mlx4_dev_close,
+	.link_update = mlx4_link_update,
+	.promiscuous_enable = mlx4_promiscuous_enable,
+	.promiscuous_disable = mlx4_promiscuous_disable,
+	.allmulticast_enable = mlx4_allmulticast_enable,
+	.allmulticast_disable = mlx4_allmulticast_disable,
+	.mac_addr_remove = mlx4_mac_addr_remove,
+	.mac_addr_add = mlx4_mac_addr_add,
+	.mac_addr_set = mlx4_mac_addr_set,
+	.stats_get = mlx4_stats_get,
+	.stats_reset = mlx4_stats_reset,
+	.dev_infos_get = mlx4_dev_infos_get,
+	.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
+	.vlan_filter_set = mlx4_vlan_filter_set,
+	.rx_queue_setup = mlx4_rx_queue_setup,
+	.tx_queue_setup = mlx4_tx_queue_setup,
+	.rx_queue_release = mlx4_rx_queue_release,
+	.tx_queue_release = mlx4_tx_queue_release,
+	.flow_ctrl_get = mlx4_flow_ctrl_get,
+	.flow_ctrl_set = mlx4_flow_ctrl_set,
+	.mtu_set = mlx4_mtu_set,
+	.filter_ctrl = mlx4_filter_ctrl,
+	.rx_queue_intr_enable = mlx4_rx_intr_enable,
+	.rx_queue_intr_disable = mlx4_rx_intr_disable,
+	.is_removed = mlx4_is_removed,
+};
+
+/**
+ * Get PCI information from struct ibv_device.
+ *
+ * @param device
+ *   Pointer to Verbs device structure.
+ * @param[out] pci_addr
+ *   PCI bus address output buffer, only valid when 0 is returned.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
+			    struct rte_pci_addr *pci_addr)
+{
+	FILE *file;
+	char line[32];
+	int ret = -ENOENT; /* Until a PCI_SLOT_NAME entry is parsed. */
+	MKSTR(path, "%s/device/uevent", device->ibdev_path);
+
+	file = fopen(path, "rb");
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	while (fgets(line, sizeof(line), file) == line) {
+		size_t len = strlen(line);
+		int c;
+
+		/* Truncate long lines. */
+		if (len == (sizeof(line) - 1))
+			while (line[(len - 1)] != '\n') {
+				c = fgetc(file);
+				if (c == EOF)
+					break;
+				line[(len - 1)] = c;
+			}
+		/* Extract information. */
+		if (sscanf(line, "PCI_SLOT_NAME="
+			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
+			   &pci_addr->domain, &pci_addr->bus,
+			   &pci_addr->devid, &pci_addr->function) == 4) {
+			ret = 0;
+			break;
+		}
+	}
+	fclose(file);
+	if (ret)
+		rte_errno = -ret;
+	return ret;
+}
+
+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ *   Key argument to verify.
+ * @param[in] val
+ *   Value associated with key, must be a complete integer literal.
+ * @param[in, out] conf
+ *   Shared configuration data.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf)
+{
+	uint32_t ports = rte_log2_u32(conf->ports.present + 1);
+	unsigned long tmp;
+	char *end;
+
+	errno = 0;
+	tmp = strtoul(val, &end, 0);
+	if (errno || end == val || *end != '\0') {
+		rte_errno = errno ? errno : EINVAL;
+		WARN("%s: \"%s\" is not a valid integer", key, val);
+		return -rte_errno;
+	}
+	if (strcmp(MLX4_PMD_PORT_KVARG, key) != 0) {
+		rte_errno = EINVAL;
+		WARN("%s: unknown parameter", key);
+		return -rte_errno;
+	}
+	if (tmp >= ports) {
+		rte_errno = EINVAL;
+		ERROR("port index %lu outside range [0,%" PRIu32 ")",
+		      tmp, ports);
+		return -rte_errno;
+	}
+	if (!(conf->ports.present & (UINT32_C(1) << tmp))) {
+		rte_errno = EINVAL;
+		ERROR("invalid port index %lu", tmp);
+		return -rte_errno;
+	}
+	conf->ports.enabled |= UINT32_C(1) << tmp; /* No signed overflow. */
+	return 0;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param devargs
+ *   Device arguments structure.
+ * @param[in, out] conf
+ *   Shared configuration data, updated through mlx4_arg_parse().
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
+{
+	struct rte_kvargs *kvlist;
+	int ret = 0;
+	int i;
+
+	if (devargs == NULL)
+		return 0;
+	kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params);
+	if (kvlist == NULL) {
+		rte_errno = EINVAL;
+		ERROR("failed to parse kvargs");
+		return -rte_errno;
+	}
+	/*
+	 * Process parameters. rte_kvargs_process() already visits every
+	 * occurrence of a key, hence a single call per supported key.
+	 */
+	for (i = 0; pmd_mlx4_init_params[i]; ++i) {
+		ret = rte_kvargs_process(kvlist,
+					 pmd_mlx4_init_params[i],
+					 (int (*)(const char *,
+						  const char *,
+						  void *))
+					 mlx4_arg_parse,
+					 conf);
+		if (ret != 0)
+			break;
+	}
+	rte_kvargs_free(kvlist);
+	return ret;
+}
+
+/**
+ * Interpret RSS capabilities reported by device.
+ *
+ * This function returns the set of usable Verbs RSS hash fields, kernel
+ * quirks taken into account.
+ *
+ * @param ctx
+ *   Verbs context.
+ * @param pd
+ *   Verbs protection domain used for the temporary probe objects.
+ * @param device_attr_ex
+ *   Extended device attributes to interpret.
+ *
+ * @return
+ *   Usable RSS hash fields mask in Verbs format.
+ */
+static uint64_t
+mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
+		struct ibv_device_attr_ex *device_attr_ex)
+{
+	uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask;
+	struct ibv_cq *cq = NULL;
+	struct ibv_wq *wq = NULL;
+	struct ibv_rwq_ind_table *ind = NULL;
+	struct ibv_qp *qp = NULL;
+
+	if (!hw_rss_sup) {
+		WARN("no RSS capabilities reported; disabling support for UDP"
+		     " RSS and inner VXLAN RSS");
+		return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
+			IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
+			IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
+	}
+	if (!(hw_rss_sup & IBV_RX_HASH_INNER)) /* Nothing to probe. */
+		return hw_rss_sup;
+	/*
+	 * Although reported as supported, missing code in some Linux
+	 * versions (v4.15, v4.16) prevents the creation of hash QPs with
+	 * inner capability.
+	 *
+	 * There is no choice but to attempt to instantiate a temporary RSS
+	 * context in order to confirm its support.
+	 */
+	cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0);
+	wq = cq ? mlx4_glue->create_wq
+		(ctx,
+		 &(struct ibv_wq_init_attr){
+			.wq_type = IBV_WQT_RQ,
+			.max_wr = 1,
+			.max_sge = 1,
+			.pd = pd,
+			.cq = cq,
+		 }) : NULL;
+	ind = wq ? mlx4_glue->create_rwq_ind_table
+		(ctx,
+		 &(struct ibv_rwq_ind_table_init_attr){
+			.log_ind_tbl_size = 0,
+			.ind_tbl = &wq,
+			.comp_mask = 0,
+		 }) : NULL;
+	qp = ind ? mlx4_glue->create_qp_ex
+		(ctx,
+		 &(struct ibv_qp_init_attr_ex){
+			.comp_mask =
+				(IBV_QP_INIT_ATTR_PD |
+				 IBV_QP_INIT_ATTR_RX_HASH |
+				 IBV_QP_INIT_ATTR_IND_TABLE),
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.pd = pd,
+			.rwq_ind_tbl = ind,
+			.rx_hash_conf = {
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+				.rx_hash_key = mlx4_rss_hash_key_default,
+				.rx_hash_fields_mask = hw_rss_sup,
+			},
+		 }) : NULL;
+	if (!qp) {
+		WARN("disabling unusable inner RSS capability due to kernel"
+		     " quirk");
+		hw_rss_sup &= ~IBV_RX_HASH_INNER;
+	} else {
+		claim_zero(mlx4_glue->destroy_qp(qp));
+	}
+	if (ind) /* Tear down in reverse creation order. */
+		claim_zero(mlx4_glue->destroy_rwq_ind_table(ind));
+	if (wq)
+		claim_zero(mlx4_glue->destroy_wq(wq));
+	if (cq)
+		claim_zero(mlx4_glue->destroy_cq(cq));
+	return hw_rss_sup;
+}
+
+static struct rte_pci_driver mlx4_driver;
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function creates an Ethernet device for each port of a given
+ * PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx4_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **list;
+	struct ibv_device *ibv_dev;
+	int err = 0;
+	struct ibv_context *attr_ctx = NULL;
+	struct ibv_device_attr device_attr;
+	struct ibv_device_attr_ex device_attr_ex;
+	struct mlx4_conf conf = {
+		.ports.present = 0,
+	};
+	unsigned int vf;
+	int i;
+
+	(void)pci_drv;
+	assert(pci_drv == &mlx4_driver);
+	list = mlx4_glue->get_device_list(&i);
+	if (list == NULL) {
+		rte_errno = errno;
+		assert(rte_errno);
+		if (rte_errno == ENOSYS)
+			ERROR("cannot list devices, is ib_uverbs loaded?");
+		return -rte_errno;
+	}
+	assert(i >= 0);
+	/*
+	 * For each listed device, check related sysfs entry against
+	 * the provided PCI ID.
+	 */
+	while (i != 0) {
+		struct rte_pci_addr pci_addr;
+
+		--i;
+		DEBUG("checking device \"%s\"", list[i]->name);
+		if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr))
+			continue;
+		if ((pci_dev->addr.domain != pci_addr.domain) ||
+		    (pci_dev->addr.bus != pci_addr.bus) ||
+		    (pci_dev->addr.devid != pci_addr.devid) ||
+		    (pci_dev->addr.function != pci_addr.function))
+			continue;
+		vf = (pci_dev->id.device_id ==
+		      PCI_DEVICE_ID_MELLANOX_CONNECTX3VF);
+		INFO("PCI information matches, using device \"%s\" (VF: %s)",
+		     list[i]->name, (vf ? "true" : "false"));
+		attr_ctx = mlx4_glue->open_device(list[i]);
+		err = errno;
+		break;
+	}
+	if (attr_ctx == NULL) {
+		mlx4_glue->free_device_list(list);
+		switch (err) {
+		case 0:
+			rte_errno = ENODEV;
+			ERROR("cannot access device, is mlx4_ib loaded?");
+			return -rte_errno;
+		case EINVAL:
+			rte_errno = EINVAL;
+			ERROR("cannot use device, are drivers up to date?");
+			return -rte_errno;
+		}
+		assert(err > 0);
+		rte_errno = err;
+		return -rte_errno;
+	}
+	ibv_dev = list[i];
+	DEBUG("device opened");
+	if (mlx4_glue->query_device(attr_ctx, &device_attr)) {
+		err = ENODEV;
+		goto error;
+	}
+	INFO("%u port(s) detected", device_attr.phys_port_cnt);
+	conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
+	if (mlx4_args(pci_dev->device.devargs, &conf)) {
+		ERROR("failed to process device arguments");
+		err = EINVAL;
+		goto error;
+	}
+	/* Use all ports when none are defined */
+	if (!conf.ports.enabled)
+		conf.ports.enabled = conf.ports.present;
+	/* Retrieve extended device attributes. */
+	if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) {
+		err = ENODEV;
+		goto error;
+	}
+	assert(device_attr.max_sge >= MLX4_MAX_SGE);
+	for (i = 0; i < device_attr.phys_port_cnt; i++) {
+		uint32_t port = i + 1; /* ports are indexed from one */
+		struct ibv_context *ctx = NULL;
+		struct ibv_port_attr port_attr;
+		struct ibv_pd *pd = NULL;
+		struct priv *priv = NULL;
+		struct rte_eth_dev *eth_dev = NULL;
+		struct ether_addr mac;
+
+		/* If port is not enabled, skip. */
+		if (!(conf.ports.enabled & (1 << i)))
+			continue;
+		DEBUG("using port %u", port);
+		ctx = mlx4_glue->open_device(ibv_dev);
+		if (ctx == NULL) {
+			err = ENODEV;
+			goto port_error;
+		}
+		/* Check port status. */
+		err = mlx4_glue->query_port(ctx, port, &port_attr);
+		if (err) {
+			err = ENODEV;
+			ERROR("port query failed: %s", strerror(err));
+			goto port_error;
+		}
+		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			err = ENOTSUP;
+			ERROR("port %d is not configured in Ethernet mode",
+			      port);
+			goto port_error;
+		}
+		if (port_attr.state != IBV_PORT_ACTIVE)
+			DEBUG("port %d is not active: \"%s\" (%d)",
+			      port, mlx4_glue->port_state_str(port_attr.state),
+			      port_attr.state);
+		/* Make asynchronous FD non-blocking to handle interrupts. */
+		err = mlx4_fd_set_non_blocking(ctx->async_fd);
+		if (err) {
+			ERROR("cannot make asynchronous FD non-blocking: %s",
+			      strerror(err));
+			goto port_error;
+		}
+		/* Allocate protection domain. */
+		pd = mlx4_glue->alloc_pd(ctx);
+		if (pd == NULL) {
+			err = ENOMEM;
+			ERROR("PD allocation failure");
+			goto port_error;
+		}
+		/* Zeroed, cache-aligned private data (from rte_ethdev.c). */
+		priv = rte_zmalloc("ethdev private structure",
+				   sizeof(*priv),
+				   RTE_CACHE_LINE_SIZE);
+		if (priv == NULL) {
+			err = ENOMEM;
+			ERROR("priv allocation failure");
+			goto port_error;
+		}
+		priv->ctx = ctx;
+		priv->device_attr = device_attr;
+		priv->port = port;
+		priv->pd = pd;
+		priv->mtu = ETHER_MTU;
+		priv->vf = vf;
+		priv->hw_csum =	!!(device_attr.device_cap_flags &
+				   IBV_DEVICE_RAW_IP_CSUM);
+		DEBUG("checksum offloading is %ssupported",
+		      (priv->hw_csum ? "" : "not "));
+		/* Only ConnectX-3 Pro supports tunneling. */
+		priv->hw_csum_l2tun =
+			priv->hw_csum &&
+			(device_attr.vendor_part_id ==
+			 PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
+		DEBUG("L2 tunnel checksum offloads are %ssupported",
+		      priv->hw_csum_l2tun ? "" : "not ");
+		priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd,
+						   &device_attr_ex);
+		DEBUG("supported RSS hash fields mask: %016" PRIx64,
+		      priv->hw_rss_sup);
+		priv->hw_rss_max_qps =
+			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+		DEBUG("MAX RSS queues %d", priv->hw_rss_max_qps);
+		priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+					IBV_RAW_PACKET_CAP_SCATTER_FCS);
+		DEBUG("FCS stripping toggling is %ssupported",
+		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
+		/* Configure the first MAC address by default. */
+		err = mlx4_get_mac(priv, &mac.addr_bytes);
+		if (err) {
+			ERROR("cannot get MAC address, is mlx4_en loaded?"
+			      " (error: %s)", strerror(err));
+			goto port_error;
+		}
+		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
+		     priv->port,
+		     mac.addr_bytes[0], mac.addr_bytes[1],
+		     mac.addr_bytes[2], mac.addr_bytes[3],
+		     mac.addr_bytes[4], mac.addr_bytes[5]);
+		/* Register MAC address. */
+		priv->mac[0] = mac;
+#ifndef NDEBUG
+		{
+			char ifname[IF_NAMESIZE];
+
+			if (mlx4_get_ifname(priv, &ifname) == 0)
+				DEBUG("port %u ifname is \"%s\"",
+				      priv->port, ifname);
+			else
+				DEBUG("port %u ifname is unknown", priv->port);
+		}
+#endif
+		/* Get actual MTU if possible. */
+		mlx4_mtu_get(priv, &priv->mtu);
+		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
+		/* Name ethdev "<device> port <N>" (from rte_ethdev.c). */
+		{
+			char name[RTE_ETH_NAME_MAX_LEN];
+
+			snprintf(name, sizeof(name), "%s port %u",
+				 mlx4_glue->get_device_name(ibv_dev), port);
+			eth_dev = rte_eth_dev_allocate(name);
+		}
+		if (eth_dev == NULL) {
+			err = ENOMEM;
+			ERROR("can not allocate rte ethdev");
+			goto port_error;
+		}
+		eth_dev->data->dev_private = priv;
+		eth_dev->data->mac_addrs = priv->mac;
+		eth_dev->device = &pci_dev->device;
+		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device->driver = &mlx4_driver.driver;
+		/* Initialize local interrupt handle for current port. */
+		priv->intr_handle = (struct rte_intr_handle){
+			.fd = -1,
+			.type = RTE_INTR_HANDLE_EXT,
+		};
+		/*
+		 * Override ethdev interrupt handle pointer with private
+		 * handle instead of that of the parent PCI device used by
+		 * default. This prevents it from being shared between all
+		 * ports of the same PCI device since each of them is
+		 * associated its own Verbs context.
+		 *
+		 * Rx interrupts in particular require this as the PMD has
+		 * no control over the registration of queue interrupts
+		 * besides setting up eth_dev->intr_handle, the rest is
+		 * handled by rte_intr_rx_ctl().
+		 */
+		eth_dev->intr_handle = &priv->intr_handle;
+		priv->dev = eth_dev;
+		eth_dev->dev_ops = &mlx4_dev_ops;
+		/* Bring Ethernet device up. */
+		DEBUG("forcing Ethernet interface up");
+		mlx4_dev_set_link_up(priv->dev);
+		/* Update link status once if waiting for LSC. */
+		if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
+			mlx4_link_update(eth_dev, 0);
+		/*
+		 * Once the device is added to the list of memory event
+		 * callback, its global MR cache table cannot be expanded
+		 * on the fly because of deadlock. If it overflows, lookup
+		 * should be done by searching MR list linearly, which is slow.
+		 */
+		err = mlx4_mr_btree_init(&priv->mr.cache,
+					 MLX4_MR_BTREE_CACHE_N * 2,
+					 eth_dev->device->numa_node);
+		if (err) {
+			/* rte_errno already set, then overwritten from err. */
+			goto port_error;
+		}
+		/* Add device to memory callback list. */
+		rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+		LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
+		rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+		rte_eth_dev_probing_finish(eth_dev);
+		continue;
+port_error:
+		rte_free(priv);
+		if (pd)
+			claim_zero(mlx4_glue->dealloc_pd(pd));
+		if (ctx)
+			claim_zero(mlx4_glue->close_device(ctx));
+		if (eth_dev)
+			rte_eth_dev_release_port(eth_dev);
+		break;
+	}
+	/*
+	 * XXX if something went wrong in the loop above, there is a resource
+	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
+	 * long as the dpdk does not provide a way to deallocate a ethdev and a
+	 * way to enumerate the registered ethdevs to free the previous ones.
+	 */
+error:
+	if (attr_ctx)
+		claim_zero(mlx4_glue->close_device(attr_ctx));
+	if (list)
+		mlx4_glue->free_device_list(list);
+	if (err)
+		rte_errno = err;
+	return -err;
+}
+
+static const struct rte_pci_id mlx4_pci_id_map[] = {
+	{
+		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+			       PCI_DEVICE_ID_MELLANOX_CONNECTX3)
+	},
+	{
+		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+			       PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
+	},
+	{
+		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+			       PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
+	},
+	{
+		.vendor_id = 0 /* Last entry of the table. */
+	}
+};
+
+static struct rte_pci_driver mlx4_driver = {
+	.driver = {
+		.name = MLX4_DRIVER_NAME
+	},
+	.id_table = mlx4_pci_id_map,
+	.probe = mlx4_pci_probe,
+	.drv_flags = RTE_PCI_DRV_INTR_LSC |
+		     RTE_PCI_DRV_INTR_RMV,
+};
+
+#ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
+
+/**
+ * Suffix RTE_EAL_PMD_PATH with "-glue".
+ *
+ * @param[out] buf
+ *   Output buffer, should be large enough otherwise NULL is returned.
+ * @param size
+ *   Size of @p buf.
+ *
+ * @return
+ *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
+ */
+static char *
+mlx4_glue_path(char *buf, size_t size)
+{
+	static const char *const bad[] = { "/", ".", "..", NULL };
+	const char *const path = RTE_EAL_PMD_PATH;
+	size_t end = strlen(path);
+	size_t start;
+	int n;
+
+	/* Skip trailing slashes, then locate the last component. */
+	for (; end && path[end - 1] == '/'; --end)
+		;
+	start = end;
+	while (start && path[start - 1] != '/')
+		--start;
+	for (n = 0; bad[n]; ++n)
+		if (strncmp(path + start, bad[n], (int)(end - start)) == 0)
+			goto error;
+	n = snprintf(buf, size, "%.*s-glue", (int)end, path);
+	if (n == -1 || (size_t)n >= size)
+		goto error;
+	return buf;
+error:
+	ERROR("unable to append \"-glue\" to last component of"
+	      " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
+	      " please re-configure DPDK");
+	return NULL;
+}
+
+/**
+ * Initialization routine for run-time dependency on rdma-core (glue dlopen).
+ */
+static int
+mlx4_glue_init(void)
+{
+	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
+	const char *path[] = {
+		/*
+		 * A basic security check is necessary before trusting
+		 * MLX4_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
+		 */
+		(geteuid() == getuid() && getegid() == getgid() ?
+		 getenv("MLX4_GLUE_PATH") : NULL),
+		/*
+		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
+		 * variant, otherwise let dlopen() look up libraries on its
+		 * own.
+		 */
+		(*RTE_EAL_PMD_PATH ?
+		 mlx4_glue_path(glue_path, sizeof(glue_path)) : ""),
+	};
+	unsigned int i = 0;
+	void *handle = NULL;
+	void **sym;
+	const char *dlmsg;
+
+	while (!handle && i != RTE_DIM(path)) {
+		const char *end;
+		size_t len;
+		int ret;
+
+		if (!path[i]) {
+			++i;
+			continue;
+		}
+		end = strpbrk(path[i], ":;");
+		if (!end)
+			end = path[i] + strlen(path[i]);
+		len = end - path[i];
+		ret = 0;
+		do {
+			char name[ret + 1]; /* Sized by the previous pass. */
+
+			ret = snprintf(name, sizeof(name), "%.*s%s" MLX4_GLUE,
+				       (int)len, path[i],
+				       (!len || *(end - 1) == '/') ? "" : "/");
+			if (ret == -1)
+				break;
+			if (sizeof(name) != (size_t)ret + 1)
+				continue; /* Retry with the exact size. */
+			DEBUG("looking for rdma-core glue as \"%s\"", name);
+			handle = dlopen(name, RTLD_LAZY);
+			break;
+		} while (1);
+		path[i] = end + 1; /* Next entry of the current list. */
+		if (!*end)
+			++i;
+	}
+	if (!handle) {
+		rte_errno = EINVAL;
+		dlmsg = dlerror();
+		if (dlmsg)
+			WARN("cannot load glue library: %s", dlmsg);
+		goto glue_error;
+	}
+	sym = dlsym(handle, "mlx4_glue");
+	if (!sym || !*sym) {
+		rte_errno = EINVAL;
+		dlmsg = dlerror();
+		if (dlmsg)
+			ERROR("cannot resolve glue symbol: %s", dlmsg);
+		goto glue_error;
+	}
+	mlx4_glue = *sym;
+	return 0;
+glue_error:
+	if (handle)
+		dlclose(handle);
+	WARN("cannot initialize PMD due to missing run-time"
+	     " dependency on rdma-core libraries (libibverbs,"
+	     " libmlx4)");
+	return -rte_errno;
+}
+
+#endif
+
+/**
+ * Driver initialization routine, declared through RTE_INIT().
+ */
+RTE_INIT(rte_mlx4_pmd_init)
+{
+	/*
+	 * MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we
+	 * want to get success errno value in case of calling them
+	 * when the device was removed.
+	 */
+	setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1);
+	/*
+	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
+	 * huge pages. Calling ibv_fork_init() during init allows
+	 * applications to use fork() safely for purposes other than
+	 * using this PMD, which is not supported in forked processes.
+	 */
+	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+#ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
+	if (mlx4_glue_init())
+		return; /* The PCI driver is left unregistered. */
+	assert(mlx4_glue);
+#endif
+#ifndef NDEBUG
+	/* Glue structure must not contain any NULL pointers. */
+	{
+		unsigned int i;
+
+		for (i = 0; i != sizeof(*mlx4_glue) / sizeof(void *); ++i)
+			assert(((const void *const *)mlx4_glue)[i]);
+	}
+#endif
+	if (strcmp(mlx4_glue->version, MLX4_GLUE_VERSION)) {
+		ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required",
+		      mlx4_glue->version, MLX4_GLUE_VERSION);
+		return; /* The PCI driver is left unregistered. */
+	}
+	mlx4_glue->fork_init();
+	rte_pci_register(&mlx4_driver);
+	rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+					mlx4_mr_mem_event_cb, NULL);
+}
+
+RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mlx4,
+	"* ib_uverbs & mlx4_en & mlx4_core & mlx4_ib");
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4.h
new file mode 100644
index 00000000..e6fb934f
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2012 6WIND S.A.
+ * Copyright 2012 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX4_H_
+#define RTE_PMD_MLX4_H_
+
+#include <net/if.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev_driver.h>
+#include <rte_ether.h>
+#include <rte_interrupts.h>
+#include <rte_mempool.h>
+#include <rte_rwlock.h>
+
+#include "mlx4_mr.h"
+
+#ifndef IBV_RX_HASH_INNER
+/** This is not necessarily defined by supported RDMA core versions. */
+#define IBV_RX_HASH_INNER (1ull << 31)
+#endif /* IBV_RX_HASH_INNER */
+
+/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
+#define MLX4_MAX_MAC_ADDRESSES 128
+
+/** Request send completion once in every 64 sends, might be less. */
+#define MLX4_PMD_TX_PER_COMP_REQ 64
+
+/** Maximum size for inline data. */
+#define MLX4_PMD_MAX_INLINE 0
+
+/** Fixed RSS hash key size in bytes. Cannot be modified. */
+#define MLX4_RSS_HASH_KEY_SIZE 40
+
+/** Interrupt alarm timeout value in microseconds. */
+#define MLX4_INTR_ALARM_TIMEOUT 100000
+
+/** Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
+/** Name of the "port" device parameter. */
+#define MLX4_PMD_PORT_KVARG "port"
+
+/** PCI vendor ID for Mellanox. */
+enum {
+	PCI_VENDOR_ID_MELLANOX = 0x15b3,
+};
+
+/** PCI device IDs for ConnectX-3, ConnectX-3 VF and ConnectX-3 Pro. */
+enum {
+	PCI_DEVICE_ID_MELLANOX_CONNECTX3 = 0x1003,
+	PCI_DEVICE_ID_MELLANOX_CONNECTX3VF = 0x1004,
+	PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
+};
+
+/** Driver name reported to lower layers and used in log output. */
+#define MLX4_DRIVER_NAME "net_mlx4"
+
+/* Forward declarations for types only used through pointers in this file. */
+struct mlx4_drop;
+struct mlx4_rss;
+struct rxq;
+struct txq;
+struct rte_flow;
+
+/* List head types for private structures and memory regions. */
+LIST_HEAD(mlx4_dev_list, priv);
+LIST_HEAD(mlx4_mr_list, mlx4_mr);
+
+/** Private data structure. */
+struct priv {
+	LIST_ENTRY(priv) mem_event_cb; /**< Called by memory event callback. */
+	struct rte_eth_dev *dev; /**< Ethernet device. */
+	struct ibv_context *ctx; /**< Verbs context. */
+	struct ibv_device_attr device_attr; /**< Device properties. */
+	struct ibv_pd *pd; /**< Protection Domain. */
+	/* Port configuration, state and capability flags. */
+	uint16_t mtu; /**< Configured MTU. */
+	uint8_t port; /**< Physical port number. */
+	uint32_t started:1; /**< Device started, flows enabled. */
+	uint32_t vf:1; /**< This is a VF device. */
+	uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
+	uint32_t isolated:1; /**< Toggle isolated mode. */
+	uint32_t rss_init:1; /**< Common RSS context is initialized. */
+	uint32_t hw_csum:1; /**< Checksum offload is supported. */
+	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
+	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
+	uint32_t hw_rss_max_qps; /**< Max Rx Queues supported by RSS. */
+	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
+	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
+	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
+	/** Memory region (MR) bookkeeping. */
+	struct {
+		uint32_t dev_gen; /**< Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /**< MR Lock. */
+		struct mlx4_mr_btree cache; /**< Global MR cache table. */
+		struct mlx4_mr_list mr_list; /**< Registered MR list. */
+		struct mlx4_mr_list mr_free_list; /**< Freed MR list. */
+	} mr;
+	LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
+	LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
+	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
+	/**< Configured MAC addresses. Unused entries are zeroed. */
+};
+
+/* mlx4_ethdev.c */
+
+/* Helpers querying the netdevice associated with a private structure. */
+int mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]);
+int mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]);
+int mlx4_mtu_get(struct priv *priv, uint16_t *mtu);
+/* DPDK callbacks operating on an Ethernet device structure. */
+int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
+int mlx4_dev_set_link_down(struct rte_eth_dev *dev);
+int mlx4_dev_set_link_up(struct rte_eth_dev *dev);
+void mlx4_promiscuous_enable(struct rte_eth_dev *dev);
+void mlx4_promiscuous_disable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_enable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
+void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
+int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+		      uint32_t index, uint32_t vmdq);
+int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
+int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
+int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
+void mlx4_stats_reset(struct rte_eth_dev *dev);
+void mlx4_dev_infos_get(struct rte_eth_dev *dev,
+			struct rte_eth_dev_info *info);
+int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
+int mlx4_flow_ctrl_get(struct rte_eth_dev *dev,
+		       struct rte_eth_fc_conf *fc_conf);
+int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
+		       struct rte_eth_fc_conf *fc_conf);
+const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+/* Returns 1 when the device was removed, 0 otherwise. */
+int mlx4_is_removed(struct rte_eth_dev *dev);
+
+/* mlx4_intr.c */
+
+/* Interrupt management entry points; definitions live in mlx4_intr.c. */
+int mlx4_intr_uninstall(struct priv *priv);
+int mlx4_intr_install(struct priv *priv);
+int mlx4_rxq_intr_enable(struct priv *priv);
+void mlx4_rxq_intr_disable(struct priv *priv);
+int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
+int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
+
+#endif /* RTE_PMD_MLX4_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_ethdev.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_ethdev.c
new file mode 100644
index 00000000..30deb3ef
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_ethdev.c
@@ -0,0 +1,883 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Miscellaneous control operations for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_bus_pci.h>
+#include <rte_errno.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_pci.h>
+#include <rte_string_fns.h>
+
+#include "mlx4.h"
+#include "mlx4_flow.h"
+#include "mlx4_glue.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Get interface name from private structure.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
+{
+	DIR *dir;
+	struct dirent *dent;
+	/* 0: compare the "dev_port" sysfs attribute, 1: compare "dev_id". */
+	unsigned int dev_type = 0;
+	/* Value read from the previous entry, ~0u when there is none yet. */
+	unsigned int dev_port_prev = ~0u;
+	/* Candidate interface name, empty as long as nothing matched. */
+	char match[IF_NAMESIZE] = "";
+
+	{
+		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
+
+		dir = opendir(path);
+		if (dir == NULL) {
+			rte_errno = errno;
+			return -rte_errno;
+		}
+	}
+	/* Each directory entry is a candidate network interface name. */
+	while ((dent = readdir(dir)) != NULL) {
+		char *name = dent->d_name;
+		FILE *file;
+		unsigned int dev_port;
+		int r;
+
+		/* Skip the "." and ".." entries. */
+		if ((name[0] == '.') &&
+		    ((name[1] == '\0') ||
+		     ((name[1] == '.') && (name[2] == '\0'))))
+			continue;
+
+		MKSTR(path, "%s/device/net/%s/%s",
+		      priv->ctx->device->ibdev_path, name,
+		      (dev_type ? "dev_id" : "dev_port"));
+
+		file = fopen(path, "rb");
+		if (file == NULL) {
+			/* Errors other than ENOENT only skip this entry. */
+			if (errno != ENOENT)
+				continue;
+			/*
+			 * Switch to dev_id when dev_port does not exist as
+			 * is the case with Linux kernel versions < 3.15.
+			 */
+try_dev_id:
+			/* Discard any candidate found so far. */
+			match[0] = '\0';
+			/* Already comparing dev_id: give up with no match. */
+			if (dev_type)
+				break;
+			/* Restart the scan from the first entry using dev_id. */
+			dev_type = 1;
+			dev_port_prev = ~0u;
+			rewinddir(dir);
+			continue;
+		}
+		/* dev_id is parsed as hexadecimal, dev_port as decimal. */
+		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
+		fclose(file);
+		if (r != 1)
+			continue;
+		/*
+		 * Switch to dev_id when dev_port returns the same value for
+		 * all ports. May happen when using a MOFED release older than
+		 * 3.0 with a Linux kernel >= 3.15.
+		 */
+		if (dev_port == dev_port_prev)
+			goto try_dev_id;
+		dev_port_prev = dev_port;
+		/*
+		 * The value is compared against the port number minus one.
+		 * The scan goes on afterward, hence the last matching entry
+		 * is the one retained.
+		 */
+		if (dev_port == (priv->port - 1u))
+			strlcpy(match, name, sizeof(match));
+	}
+	closedir(dir);
+	if (match[0] == '\0') {
+		rte_errno = ENODEV;
+		return -rte_errno;
+	}
+	strncpy(*ifname, match, sizeof(*ifname));
+	return 0;
+}
+
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[out] ifr
+ *   Interface request structure output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
+{
+	int rc;
+	int fd;
+
+	/* Any datagram socket will do as a handle for the ioctl(). */
+	fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (fd == -1) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/* The request targets the netdevice whose name is stored in ifr. */
+	rc = mlx4_get_ifname(priv, &ifr->ifr_name);
+	if (rc == 0) {
+		if (ioctl(fd, req, ifr) == -1) {
+			rte_errno = errno;
+			rc = -rte_errno;
+		}
+	}
+	close(fd);
+	return rc;
+}
+
+/**
+ * Get MAC address by querying netdevice.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[out] mac
+ *   MAC address output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
+{
+	struct ifreq hwaddr_req;
+	int rc;
+
+	rc = mlx4_ifreq(priv, SIOCGIFHWADDR, &hwaddr_req);
+	if (rc != 0)
+		return rc;
+	/* On success the address is read back from ifr_hwaddr. */
+	memcpy(mac, hwaddr_req.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+	return 0;
+}
+
+/**
+ * Get device MTU.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] mtu
+ *   MTU value output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_get(struct priv *priv, uint16_t *mtu)
+{
+	struct ifreq mtu_req;
+	int rc;
+
+	rc = mlx4_ifreq(priv, SIOCGIFMTU, &mtu_req);
+	/* The output buffer is only written on success. */
+	if (rc == 0)
+		*mtu = mtu_req.ifr_mtu;
+	return rc;
+}
+
+/**
+ * DPDK callback to change the MTU.
+ *
+ * The value is applied to the associated netdevice through SIOCSIFMTU and,
+ * on success, stored in the private structure.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mtu
+ *   MTU value to set.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct ifreq mtu_req = {
+		.ifr_mtu = mtu,
+	};
+	struct priv *priv = dev->data->dev_private;
+	int rc;
+
+	rc = mlx4_ifreq(priv, SIOCSIFMTU, &mtu_req);
+	if (rc == 0)
+		priv->mtu = mtu;
+	return rc;
+}
+
+/**
+ * Set device flags.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param keep
+ *   Bitmask for flags that must remain untouched.
+ * @param flags
+ *   Bitmask for flags to modify.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
+{
+	struct ifreq flags_req;
+	int rc;
+
+	/* Read the current flags first, then write the merged result back. */
+	rc = mlx4_ifreq(priv, SIOCGIFFLAGS, &flags_req);
+	if (rc != 0)
+		return rc;
+	/* Bits set in keep come from the device, the others from flags. */
+	flags_req.ifr_flags = (flags_req.ifr_flags & keep) | (flags & ~keep);
+	return mlx4_ifreq(priv, SIOCSIFFLAGS, &flags_req);
+}
+
+/**
+ * Change the link state (UP / DOWN).
+ *
+ * @param priv
+ *   Pointer to Ethernet device private data.
+ * @param up
+ *   Nonzero for link up, otherwise link down.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_dev_set_link(struct priv *priv, int up)
+{
+	/*
+	 * Every flag but IFF_UP is kept as is. IFF_UP itself gets set when
+	 * bringing the link up and cleared otherwise.
+	 */
+	const unsigned int flags = up ? IFF_UP : ~IFF_UP;
+
+	return mlx4_set_flags(priv, ~IFF_UP, flags);
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_down(struct rte_eth_dev *dev)
+{
+	/* Zero requests link down. */
+	return mlx4_dev_set_link(dev->data->dev_private, 0);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_up(struct rte_eth_dev *dev)
+{
+	/* Nonzero requests link up. */
+	return mlx4_dev_set_link(dev->data->dev_private, 1);
+}
+
+/**
+ * Supported Rx mode toggles.
+ *
+ * Even and odd values respectively stand for off and on, which lets
+ * mlx4_rxmode_toggle() derive the requested state from the lowest bit.
+ */
+enum rxmode_toggle {
+	RXMODE_TOGGLE_PROMISC_OFF, /**< Disable promiscuous mode. */
+	RXMODE_TOGGLE_PROMISC_ON, /**< Enable promiscuous mode. */
+	RXMODE_TOGGLE_ALLMULTI_OFF, /**< Disable all multicast mode. */
+	RXMODE_TOGGLE_ALLMULTI_ON, /**< Enable all multicast mode. */
+};
+
+/**
+ * Helper function to toggle promiscuous and all multicast modes.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param toggle
+ *   Toggle to set.
+ */
+static void
+mlx4_rxmode_toggle(struct rte_eth_dev *dev, enum rxmode_toggle toggle)
+{
+	struct priv *priv = dev->data->dev_private;
+	/*
+	 * Placeholder in case toggle holds a value outside the enumeration,
+	 * so that the error message below never reads an uninitialized
+	 * pointer.
+	 */
+	const char *mode = "unknown";
+	struct rte_flow_error error;
+
+	/* The lowest bit of toggle tells whether the mode is switched on. */
+	switch (toggle) {
+	case RXMODE_TOGGLE_PROMISC_OFF:
+	case RXMODE_TOGGLE_PROMISC_ON:
+		mode = "promiscuous";
+		dev->data->promiscuous = toggle & 1;
+		break;
+	case RXMODE_TOGGLE_ALLMULTI_OFF:
+	case RXMODE_TOGGLE_ALLMULTI_ON:
+		mode = "all multicast";
+		dev->data->all_multicast = toggle & 1;
+		break;
+	}
+	/* Flow rules are synchronized with the updated device data. */
+	if (!mlx4_flow_sync(priv, &error))
+		return;
+	/* This function returns void: failure is only logged. */
+	ERROR("cannot toggle %s mode (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      mode, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to enable promiscuous mode.
+ *
+ * Wrapper around mlx4_rxmode_toggle(). As this callback returns void, a
+ * failure is only reported through the log.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_ON);
+}
+
+/**
+ * DPDK callback to disable promiscuous mode.
+ *
+ * Wrapper around mlx4_rxmode_toggle(). As this callback returns void, a
+ * failure is only reported through the log.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_OFF);
+}
+
+/**
+ * DPDK callback to enable all multicast mode.
+ *
+ * Wrapper around mlx4_rxmode_toggle(). As this callback returns void, a
+ * failure is only reported through the log.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_enable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_ON);
+}
+
+/**
+ * DPDK callback to disable all multicast mode.
+ *
+ * Wrapper around mlx4_rxmode_toggle(). As this callback returns void, a
+ * failure is only reported through the log.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_disable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_OFF);
+}
+
+/**
+ * DPDK callback to remove a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param index
+ *   MAC address index.
+ */
+void
+mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+
+	/* Reject indices outside the MAC address table. */
+	if (index >= RTE_DIM(priv->mac)) {
+		rte_errno = EINVAL;
+		return;
+	}
+	/* Unused entries are zeroed. */
+	memset(&priv->mac[index], 0, sizeof(priv->mac[index]));
+	if (!mlx4_flow_sync(priv, &error))
+		return;
+	/* index is unsigned, hence %u instead of %d. */
+	ERROR("failed to synchronize flow rules after removing MAC address"
+	      " at index %u (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      index, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to add a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac_addr
+ *   MAC address to register.
+ * @param index
+ *   MAC address index.
+ * @param vmdq
+ *   VMDq pool index to associate address with (ignored).
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+		  uint32_t index, uint32_t vmdq)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+	int ret;
+
+	(void)vmdq;
+	/* Reject indices outside the MAC address table. */
+	if (index >= RTE_DIM(priv->mac)) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	/* Store the address, then refresh flow rules accordingly. */
+	memcpy(&priv->mac[index], mac_addr, sizeof(priv->mac[index]));
+	ret = mlx4_flow_sync(priv, &error);
+	if (!ret)
+		return 0;
+	/* index is unsigned, hence %u instead of %d. */
+	ERROR("failed to synchronize flow rules after adding MAC address"
+	      " at index %u (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      index, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+	return ret;
+}
+
+/**
+ * DPDK callback to configure a VLAN filter.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param vlan_id
+ *   VLAN ID to filter.
+ * @param on
+ *   Toggle filter.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+	/* VLAN IDs are tracked in a bitmap made of 64-bit words. */
+	const unsigned int word = vlan_id / 64;
+	const uint64_t bit = UINT64_C(1) << (vlan_id % 64);
+	int ret;
+
+	if (word >= RTE_DIM(dev->data->vlan_filter_conf.ids)) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	if (on)
+		dev->data->vlan_filter_conf.ids[word] |= bit;
+	else
+		dev->data->vlan_filter_conf.ids[word] &= ~bit;
+	/* Flow rules are synchronized with the updated bitmap. */
+	ret = mlx4_flow_sync(priv, &error);
+	if (ret == 0)
+		return 0;
+	ERROR("failed to synchronize flow rules after %s VLAN filter on ID %u"
+	      " (code %d, \"%s\"), "
+	      " flow error type %d, cause %p, message: %s",
+	      on ? "enabling" : "disabling", vlan_id,
+	      rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+	return ret;
+}
+
+/**
+ * DPDK callback to set the primary MAC address.
+ *
+ * The primary address is the one stored at index 0; this is delegated to
+ * mlx4_mac_addr_add().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac_addr
+ *   MAC address to register.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
+{
+	return mlx4_mac_addr_add(dev, mac_addr, 0, 0);
+}
+
+/**
+ * DPDK callback to get information about the device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] info
+ *   Info structure output buffer.
+ */
+void
+mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+	struct priv *priv = dev->data->dev_private;
+	char ifname[IF_NAMESIZE];
+	unsigned int queues;
+
+	/* FIXME: we should ask the device for these values. */
+	info->min_rx_bufsize = 32;
+	info->max_rx_pktlen = 65536;
+	/*
+	 * One CQ is needed per QP, hence the number of usable queues is
+	 * bounded by the smaller of both device limits.
+	 */
+	if (priv->device_attr.max_cq > priv->device_attr.max_qp)
+		queues = priv->device_attr.max_qp;
+	else
+		queues = priv->device_attr.max_cq;
+	/* max_rx_queues is uint16_t: cap the value at 65535. */
+	if (queues >= 65535)
+		queues = 65535;
+	info->max_rx_queues = queues;
+	info->max_tx_queues = queues;
+	info->max_mac_addrs = RTE_DIM(priv->mac);
+	info->tx_offload_capa = mlx4_get_tx_port_offloads(priv);
+	/* Port-level Rx capabilities include the per-queue ones. */
+	info->rx_queue_offload_capa = mlx4_get_rx_queue_offloads(priv);
+	info->rx_offload_capa = (mlx4_get_rx_port_offloads(priv) |
+				 info->rx_queue_offload_capa);
+	/* if_index is left untouched when the interface name is unknown. */
+	if (mlx4_get_ifname(priv, &ifname) == 0)
+		info->if_index = if_nametoindex(ifname);
+	info->hash_key_size = MLX4_RSS_HASH_KEY_SIZE;
+	info->speed_capa = (ETH_LINK_SPEED_1G |
+			    ETH_LINK_SPEED_10G |
+			    ETH_LINK_SPEED_20G |
+			    ETH_LINK_SPEED_40G |
+			    ETH_LINK_SPEED_56G);
+	info->flow_type_rss_offloads = mlx4_conv_rss_types(priv, 0, 1);
+}
+
+/**
+ * DPDK callback to get device statistics.
+ *
+ * Software counters of all configured Rx and Tx queues are summed up in a
+ * temporary structure which is then copied to the output buffer.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] stats
+ *   Stats structure output buffer.
+ *
+ * @return
+ *   Always 0.
+ */
+int
+mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct rte_eth_stats acc;
+	unsigned int q;
+
+	memset(&acc, 0, sizeof(acc));
+	/* Rx software counters. */
+	for (q = 0; q != dev->data->nb_rx_queues; ++q) {
+		struct rxq *rxq = dev->data->rx_queues[q];
+		unsigned int slot;
+
+		if (rxq == NULL)
+			continue;
+		acc.ipackets += rxq->stats.ipackets;
+		acc.ibytes += rxq->stats.ibytes;
+		acc.ierrors += rxq->stats.idropped;
+		acc.rx_nombuf += rxq->stats.rx_nombuf;
+		/* Per-queue counters only exist for a limited number of slots. */
+		slot = rxq->stats.idx;
+		if (slot >= RTE_ETHDEV_QUEUE_STAT_CNTRS)
+			continue;
+		acc.q_ipackets[slot] += rxq->stats.ipackets;
+		acc.q_ibytes[slot] += rxq->stats.ibytes;
+		acc.q_errors[slot] += (rxq->stats.idropped +
+				       rxq->stats.rx_nombuf);
+	}
+	/* Tx software counters. */
+	for (q = 0; q != dev->data->nb_tx_queues; ++q) {
+		struct txq *txq = dev->data->tx_queues[q];
+		unsigned int slot;
+
+		if (txq == NULL)
+			continue;
+		acc.opackets += txq->stats.opackets;
+		acc.obytes += txq->stats.obytes;
+		acc.oerrors += txq->stats.odropped;
+		slot = txq->stats.idx;
+		if (slot >= RTE_ETHDEV_QUEUE_STAT_CNTRS)
+			continue;
+		acc.q_opackets[slot] += txq->stats.opackets;
+		acc.q_obytes[slot] += txq->stats.obytes;
+		acc.q_errors[slot] += txq->stats.odropped;
+	}
+	*stats = acc;
+	return 0;
+}
+
+/**
+ * DPDK callback to clear device statistics.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_stats_reset(struct rte_eth_dev *dev)
+{
+	unsigned int q;
+
+	/* Counters are zeroed, only the statistics index is preserved. */
+	for (q = 0; q != dev->data->nb_rx_queues; ++q) {
+		struct rxq *rxq = dev->data->rx_queues[q];
+
+		if (rxq == NULL)
+			continue;
+		rxq->stats = (struct mlx4_rxq_stats){
+			.idx = rxq->stats.idx,
+		};
+	}
+	for (q = 0; q != dev->data->nb_tx_queues; ++q) {
+		struct txq *txq = dev->data->tx_queues[q];
+
+		if (txq == NULL)
+			continue;
+		txq->stats = (struct mlx4_txq_stats){
+			.idx = txq->stats.idx,
+		};
+	}
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ *   Wait for request completion (ignored).
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+	const struct priv *priv = dev->data->dev_private;
+	struct ethtool_cmd ecmd = {
+		.cmd = ETHTOOL_GSET,
+	};
+	struct rte_eth_link link;
+	struct ifreq ifr;
+	int speed;
+
+	(void)wait_to_complete;
+	if (priv == NULL) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	/* Link status is derived from the netdevice flags. */
+	if (mlx4_ifreq(priv, SIOCGIFFLAGS, &ifr) != 0) {
+		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(rte_errno));
+		return -rte_errno;
+	}
+	memset(&link, 0, sizeof(link));
+	link.link_status = ((ifr.ifr_flags & IFF_UP) &&
+			    (ifr.ifr_flags & IFF_RUNNING));
+	/* Speed and duplex are retrieved through ethtool. */
+	ifr.ifr_data = (void *)&ecmd;
+	if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr) != 0) {
+		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
+		     strerror(rte_errno));
+		return -rte_errno;
+	}
+	speed = ethtool_cmd_speed(&ecmd);
+	/* -1 is reported as ETH_SPEED_NUM_NONE. */
+	link.link_speed = (speed == -1) ? ETH_SPEED_NUM_NONE : speed;
+	if (ecmd.duplex == DUPLEX_HALF)
+		link.link_duplex = ETH_LINK_HALF_DUPLEX;
+	else
+		link.link_duplex = ETH_LINK_FULL_DUPLEX;
+	link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+			      ETH_LINK_SPEED_FIXED);
+	/* The device structure is only updated once everything succeeded. */
+	dev->data->dev_link = link;
+	return 0;
+}
+
+/**
+ * DPDK callback to get flow control status.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ *   Flow control output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_get(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct ethtool_pauseparam pause = {
+		.cmd = ETHTOOL_GPAUSEPARAM,
+	};
+	struct ifreq ifr;
+	int ret = 0;
+
+	ifr.ifr_data = (void *)&pause;
+	if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+		/* Positive errno value, negated on the way out. */
+		ret = rte_errno;
+		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
+		     " failed: %s",
+		     strerror(rte_errno));
+	} else {
+		fc_conf->autoneg = pause.autoneg;
+		/* Translate the Rx/Tx pause pair into a single mode. */
+		if (!pause.rx_pause)
+			fc_conf->mode = (pause.tx_pause ?
+					 RTE_FC_TX_PAUSE : RTE_FC_NONE);
+		else
+			fc_conf->mode = (pause.tx_pause ?
+					 RTE_FC_FULL : RTE_FC_RX_PAUSE);
+	}
+	assert(ret >= 0);
+	return -ret;
+}
+
+/**
+ * DPDK callback to modify flow control parameters.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ *   Flow control parameters.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct ethtool_pauseparam pause = {
+		.cmd = ETHTOOL_SPAUSEPARAM,
+	};
+	/* Full flow control implies both Rx and Tx pause. */
+	const int full = ((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL);
+	struct ifreq ifr;
+	int ret = 0;
+
+	ifr.ifr_data = (void *)&pause;
+	pause.autoneg = fc_conf->autoneg;
+	pause.rx_pause = (full || (fc_conf->mode & RTE_FC_RX_PAUSE)) ? 1 : 0;
+	pause.tx_pause = (full || (fc_conf->mode & RTE_FC_TX_PAUSE)) ? 1 : 0;
+	if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+		/* Positive errno value, negated on the way out. */
+		ret = rte_errno;
+		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+		     " failed: %s",
+		     strerror(rte_errno));
+	}
+	assert(ret >= 0);
+	return -ret;
+}
+
+/**
+ * DPDK callback to retrieve the received packet types that are recognized
+ * by the device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   Pointer to an array of recognized packet types if in Rx burst mode,
+ *   NULL otherwise.
+ */
+const uint32_t *
+mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
+{
+	/* Both lists refer to rxq_cq_to_pkt_type(). */
+	static const uint32_t ptypes[] = {
+		RTE_PTYPE_L2_ETHER,
+		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_L4_FRAG,
+		RTE_PTYPE_L4_TCP,
+		RTE_PTYPE_L4_UDP,
+		RTE_PTYPE_UNKNOWN
+	};
+	/* Same as above plus inner L3 types. */
+	static const uint32_t ptypes_l2tun[] = {
+		RTE_PTYPE_L2_ETHER,
+		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_L4_FRAG,
+		RTE_PTYPE_L4_TCP,
+		RTE_PTYPE_L4_UDP,
+		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_UNKNOWN
+	};
+	const struct priv *priv = dev->data->dev_private;
+
+	/* Packet types are only reported for the mlx4_rx_burst() handler. */
+	if (dev->rx_pkt_burst != mlx4_rx_burst)
+		return NULL;
+	return priv->hw_csum_l2tun ? ptypes_l2tun : ptypes;
+}
+
+/**
+ * Check if mlx4 device was removed.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   1 when device is removed, otherwise 0.
+ */
+int
+mlx4_is_removed(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct ibv_device_attr attr;
+
+	/* A device query failing with EIO is taken as a removed device. */
+	return mlx4_glue->query_device(priv->ctx, &attr) == EIO;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.c
new file mode 100644
index 00000000..b40e7e5c
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.c
@@ -0,0 +1,1617 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Flow API operations for mlx4 driver.
+ */
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+#include <rte_malloc.h>
+
+/* PMD headers. */
+#include "mlx4.h"
+#include "mlx4_glue.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Static initializer for a list of subsequent item types.
+ *
+ * Expands to a compound literal: an unnamed array of enum
+ * rte_flow_item_type holding the given types followed by a terminating
+ * RTE_FLOW_ITEM_TYPE_END entry.
+ */
+#define NEXT_ITEM(...) \
+	(const enum rte_flow_item_type []){ \
+		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
+	}
+
+/**
+ * Processor structure associated with a flow item.
+ *
+ * One such object describes how a given pattern item type is validated
+ * (masks), converted (merge callback) and which item types are allowed to
+ * follow it in a pattern.
+ */
+struct mlx4_flow_proc_item {
+	/** Bit-mask for fields supported by this PMD. */
+	const void *mask_support;
+	/** Bit-mask to use when @p item->mask is not provided. */
+	const void *mask_default;
+	/** Size in bytes for @p mask_support and @p mask_default. */
+	const unsigned int mask_sz;
+	/**
+	 * Merge a pattern item into a flow rule handle.
+	 *
+	 * Returns 0 on success, a negative errno value otherwise.
+	 */
+	int (*merge)(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error);
+	/**
+	 * Size in bytes of the destination structure.
+	 *
+	 * mlx4_flow_prepare() adds it to the flow rule attribute size once
+	 * the item has been merged.
+	 */
+	const unsigned int dst_sz;
+	/** List of possible subsequent items (see NEXT_ITEM()). */
+	const enum rte_flow_item_type *const next_item;
+};
+
+/**
+ * Shared resources for drop flow rules.
+ *
+ * A single reference-counted instance is stored in the private structure
+ * and handed out by mlx4_drop_get(); mlx4_drop_put() destroys it once the
+ * last reference is released.
+ */
+struct mlx4_drop {
+	struct ibv_qp *qp; /**< QP target. */
+	struct ibv_cq *cq; /**< CQ associated with above QP. */
+	struct priv *priv; /**< Back pointer to private data. */
+	uint32_t refcnt; /**< Reference count. */
+};
+
+/**
+ * Translate RSS hash field types between their DPDK and Verbs
+ * representations.
+ *
+ * A zero @p types is special: it stands for the supported (default) set.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param types
+ *   Hash types to convert, expressed in DPDK format (see struct
+ *   rte_eth_rss_conf) or in Verbs format depending on @p verbs_to_dpdk.
+ * @param verbs_to_dpdk
+ *   Zero to convert @p types from DPDK to Verbs, nonzero for the reverse
+ *   operation.
+ *
+ * @return
+ *   Converted RSS hash fields on success, (uint64_t)-1 otherwise and
+ *   rte_errno is set.
+ */
+uint64_t
+mlx4_conv_rss_types(struct priv *priv, uint64_t types, int verbs_to_dpdk)
+{
+	/* Indices shared by both conversion tables below. */
+	enum {
+		INNER,
+		IPV4, IPV4_1, IPV4_2, IPV6, IPV6_1, IPV6_2, IPV6_3,
+		TCP, UDP,
+		IPV4_TCP, IPV4_UDP, IPV6_TCP, IPV6_TCP_1, IPV6_UDP, IPV6_UDP_1,
+	};
+	/* Verbs source/destination field pairs. */
+	enum {
+		VERBS_IPV4 = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+		VERBS_IPV6 = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
+		VERBS_TCP = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
+		VERBS_UDP = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
+	};
+	static const uint64_t dpdk[] = {
+		[INNER] = 0,
+		[IPV4] = ETH_RSS_IPV4,
+		[IPV4_1] = ETH_RSS_FRAG_IPV4,
+		[IPV4_2] = ETH_RSS_NONFRAG_IPV4_OTHER,
+		[IPV6] = ETH_RSS_IPV6,
+		[IPV6_1] = ETH_RSS_FRAG_IPV6,
+		[IPV6_2] = ETH_RSS_NONFRAG_IPV6_OTHER,
+		[IPV6_3] = ETH_RSS_IPV6_EX,
+		[TCP] = 0,
+		[UDP] = 0,
+		[IPV4_TCP] = ETH_RSS_NONFRAG_IPV4_TCP,
+		[IPV4_UDP] = ETH_RSS_NONFRAG_IPV4_UDP,
+		[IPV6_TCP] = ETH_RSS_NONFRAG_IPV6_TCP,
+		[IPV6_TCP_1] = ETH_RSS_IPV6_TCP_EX,
+		[IPV6_UDP] = ETH_RSS_NONFRAG_IPV6_UDP,
+		[IPV6_UDP_1] = ETH_RSS_IPV6_UDP_EX,
+	};
+	static const uint64_t verbs[RTE_DIM(dpdk)] = {
+		[INNER] = IBV_RX_HASH_INNER,
+		[IPV4] = VERBS_IPV4,
+		[IPV4_1] = VERBS_IPV4,
+		[IPV4_2] = VERBS_IPV4,
+		[IPV6] = VERBS_IPV6,
+		[IPV6_1] = VERBS_IPV6,
+		[IPV6_2] = VERBS_IPV6,
+		[IPV6_3] = VERBS_IPV6,
+		[TCP] = VERBS_TCP,
+		[UDP] = VERBS_UDP,
+		[IPV4_TCP] = VERBS_IPV4 | VERBS_TCP,
+		[IPV4_UDP] = VERBS_IPV4 | VERBS_UDP,
+		[IPV6_TCP] = VERBS_IPV6 | VERBS_TCP,
+		[IPV6_TCP_1] = VERBS_IPV6 | VERBS_TCP,
+		[IPV6_UDP] = VERBS_IPV6 | VERBS_UDP,
+		[IPV6_UDP_1] = VERBS_IPV6 | VERBS_UDP,
+	};
+	const uint64_t *from;
+	const uint64_t *to;
+	uint64_t matched = 0;
+	uint64_t result = 0;
+	unsigned int idx;
+
+	/* Select source and destination tables for requested direction. */
+	if (verbs_to_dpdk) {
+		from = verbs;
+		to = dpdk;
+	} else {
+		from = dpdk;
+		to = verbs;
+	}
+	if (types == 0) {
+		/* Default set is already expressed in Verbs format. */
+		if (!verbs_to_dpdk)
+			return priv->hw_rss_sup;
+		types = priv->hw_rss_sup;
+	}
+	for (idx = 0; idx < RTE_DIM(dpdk); ++idx) {
+		const uint64_t bits = from[idx];
+
+		/* Skip empty entries and those not fully present. */
+		if (!bits || (types & bits) != bits)
+			continue;
+		matched |= bits;
+		result |= to[idx];
+	}
+	/*
+	 * Fail when some requested bits could not be converted or, for
+	 * DPDK to Verbs conversions, when the result exceeds what the
+	 * device supports.
+	 */
+	if ((types & ~matched) ||
+	    (!verbs_to_dpdk && (result & priv->hw_rss_sup) != result)) {
+		rte_errno = ENOTSUP;
+		return (uint64_t)-1;
+	}
+	return result;
+}
+
+/**
+ * Merge an Ethernet pattern item into a flow rule handle.
+ *
+ * mlx4-specific restrictions on supported fields:
+ *
+ * - Source MAC matching is not supported.
+ * - Partial masks are rejected, except for the one matching all multicast
+ *   traffic (@p spec->dst and @p mask->dst equal to 01:00:00:00:00:00).
+ * - A missing @p item->spec or an empty @p mask->dst makes the rule
+ *   promiscuous-like, which is *only* supported if the rule doesn't
+ *   specify additional matching criteria.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_merge_eth(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
+{
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = NULL;
+	struct ibv_flow_spec_eth *eth;
+	const char *msg;
+	unsigned int i;
+
+	/* A mask only makes sense when a spec is provided. */
+	if (spec)
+		mask = item->mask ? item->mask : proc->mask_default;
+	if (mask) {
+		uint32_t dst_total = 0;
+		uint32_t src_total = 0;
+
+		/* Byte sums are enough to classify supported masks. */
+		for (i = 0; i < sizeof(mask->dst.addr_bytes); ++i) {
+			dst_total += mask->dst.addr_bytes[i];
+			src_total += mask->src.addr_bytes[i];
+		}
+		if (src_total) {
+			msg = "mlx4 does not support source MAC matching";
+			goto error;
+		}
+		if (!dst_total) {
+			flow->promisc = 1;
+		} else if (dst_total == 1 && mask->dst.addr_bytes[0] == 1) {
+			/* Only the multicast bit is part of the mask. */
+			if (!(spec->dst.addr_bytes[0] & 1)) {
+				msg = "mlx4 does not support the explicit"
+					" exclusion of all multicast traffic";
+				goto error;
+			}
+			flow->allmulti = 1;
+		} else if (dst_total != (UINT8_C(0xff) * ETHER_ADDR_LEN)) {
+			msg = "mlx4 does not support matching partial"
+				" Ethernet fields";
+			goto error;
+		}
+	} else {
+		flow->promisc = 1;
+	}
+	/* Without attributes to fill, validation is all that is needed. */
+	if (!flow->ibv_attr)
+		return 0;
+	if (flow->promisc || flow->allmulti) {
+		/* Indiscriminate matching relies on the attribute type. */
+		flow->ibv_attr->type = flow->promisc ?
+			IBV_FLOW_ATTR_ALL_DEFAULT :
+			IBV_FLOW_ATTR_MC_DEFAULT;
+		return 0;
+	}
+	eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
+	++flow->ibv_attr->num_of_specs;
+	*eth = (struct ibv_flow_spec_eth) {
+		.type = IBV_FLOW_SPEC_ETH,
+		.size = sizeof(*eth),
+	};
+	/* Store mask and value, the latter without unwanted bits. */
+	for (i = 0; i != ETHER_ADDR_LEN; ++i) {
+		eth->mask.dst_mac[i] = mask->dst.addr_bytes[i];
+		eth->val.dst_mac[i] =
+			spec->dst.addr_bytes[i] & mask->dst.addr_bytes[i];
+	}
+	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
+}
+
+/**
+ * Merge a VLAN pattern item into a flow rule handle.
+ *
+ * mlx4-specific restrictions on supported fields:
+ *
+ * - Omitting @p item->spec or providing an empty @p item->mask in order to
+ *   match *all* VLAN traffic would also include non-VLAN traffic and is
+ *   therefore unsupported.
+ * - No support for partial masks.
+ *
+ * The VLAN tag is stored in the specification structure located right
+ * before the current end of flow rule attributes.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_merge_vlan(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
+{
+	const struct rte_flow_item_vlan *spec = item->spec;
+	const struct rte_flow_item_vlan *mask = NULL;
+	struct ibv_flow_spec_eth *eth;
+
+	if (spec)
+		mask = item->mask ? item->mask : proc->mask_default;
+	if (!mask || !mask->tci)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+			 "mlx4 cannot match all VLAN traffic while excluding"
+			 " non-VLAN traffic, TCI VID must be specified");
+	if (mask->tci != RTE_BE16(0x0fff))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+			 "mlx4 does not support partial TCI VID matching");
+	/* Validation only. */
+	if (!flow->ibv_attr)
+		return 0;
+	/* Update the last specification instead of appending a new one. */
+	eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size -
+		       sizeof(*eth));
+	eth->mask.vlan_tag = mask->tci;
+	eth->val.vlan_tag = spec->tci & mask->tci;
+	return 0;
+}
+
+/**
+ * Merge an IPv4 pattern item into a flow rule handle.
+ *
+ * mlx4-specific restrictions on supported fields:
+ *
+ * - No support for partial masks: source and destination address masks
+ *   must each be either empty or full.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_merge_ipv4(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
+{
+	const struct rte_flow_item_ipv4 *spec = item->spec;
+	const struct rte_flow_item_ipv4 *mask = NULL;
+	struct ibv_flow_spec_ipv4 *ipv4;
+
+	if (spec)
+		mask = item->mask ? item->mask : proc->mask_default;
+	if (mask) {
+		const uint32_t src = mask->hdr.src_addr;
+		const uint32_t dst = mask->hdr.dst_addr;
+
+		/* Anything between "none" and "all" is a partial mask. */
+		if ((src != 0 && src != UINT32_MAX) ||
+		    (dst != 0 && dst != UINT32_MAX))
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item,
+				 "mlx4 does not support matching partial IPv4 fields");
+	}
+	/* Validation only. */
+	if (!flow->ibv_attr)
+		return 0;
+	ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
+	++flow->ibv_attr->num_of_specs;
+	*ipv4 = (struct ibv_flow_spec_ipv4) {
+		.type = IBV_FLOW_SPEC_IPV4,
+		.size = sizeof(*ipv4),
+	};
+	if (spec) {
+		ipv4->mask = (struct ibv_flow_ipv4_filter) {
+			.src_ip = mask->hdr.src_addr,
+			.dst_ip = mask->hdr.dst_addr,
+		};
+		/* Values are stored without unwanted bits. */
+		ipv4->val = (struct ibv_flow_ipv4_filter) {
+			.src_ip = spec->hdr.src_addr & mask->hdr.src_addr,
+			.dst_ip = spec->hdr.dst_addr & mask->hdr.dst_addr,
+		};
+	}
+	return 0;
+}
+
+/**
+ * Merge a UDP pattern item into a flow rule handle.
+ *
+ * mlx4-specific restrictions on supported fields:
+ *
+ * - No support for partial masks: source and destination port masks must
+ *   each be either empty or full.
+ * - Due to HW/FW limitation, flow rule priority is not taken into account
+ *   when matching UDP destination ports, doing so is therefore only
+ *   supported at the highest priority level (0).
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_merge_udp(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
+{
+	const struct rte_flow_item_udp *spec = item->spec;
+	const struct rte_flow_item_udp *mask = NULL;
+	struct ibv_flow_spec_tcp_udp *udp;
+	const char *msg;
+
+	if (spec)
+		mask = item->mask ? item->mask : proc->mask_default;
+	if (mask) {
+		const uint16_t sport = mask->hdr.src_port;
+		const uint16_t dport = mask->hdr.dst_port;
+
+		/* Anything between "none" and "all" is a partial mask. */
+		if ((sport != 0 && sport != UINT16_MAX) ||
+		    (dport != 0 && dport != UINT16_MAX)) {
+			msg = "mlx4 does not support matching partial UDP fields";
+			goto error;
+		}
+		if (dport && flow->priority) {
+			msg = "combining UDP destination port matching with a nonzero"
+				" priority level is not supported";
+			goto error;
+		}
+	}
+	/* Validation only. */
+	if (!flow->ibv_attr)
+		return 0;
+	udp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
+	++flow->ibv_attr->num_of_specs;
+	*udp = (struct ibv_flow_spec_tcp_udp) {
+		.type = IBV_FLOW_SPEC_UDP,
+		.size = sizeof(*udp),
+	};
+	if (spec) {
+		udp->mask.src_port = mask->hdr.src_port;
+		udp->mask.dst_port = mask->hdr.dst_port;
+		/* Values are stored without unwanted bits. */
+		udp->val.src_port = spec->hdr.src_port & mask->hdr.src_port;
+		udp->val.dst_port = spec->hdr.dst_port & mask->hdr.dst_port;
+	}
+	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
+}
+
+/**
+ * Merge a TCP pattern item into a flow rule handle.
+ *
+ * mlx4-specific restrictions on supported fields:
+ *
+ * - No support for partial masks: source and destination port masks must
+ *   each be either empty or full.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_merge_tcp(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
+{
+	const struct rte_flow_item_tcp *spec = item->spec;
+	const struct rte_flow_item_tcp *mask = NULL;
+	struct ibv_flow_spec_tcp_udp *tcp;
+
+	if (spec)
+		mask = item->mask ? item->mask : proc->mask_default;
+	if (mask) {
+		const uint16_t sport = mask->hdr.src_port;
+		const uint16_t dport = mask->hdr.dst_port;
+
+		/* Anything between "none" and "all" is a partial mask. */
+		if ((sport != 0 && sport != UINT16_MAX) ||
+		    (dport != 0 && dport != UINT16_MAX))
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item,
+				 "mlx4 does not support matching partial TCP fields");
+	}
+	/* Validation only. */
+	if (!flow->ibv_attr)
+		return 0;
+	tcp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
+	++flow->ibv_attr->num_of_specs;
+	*tcp = (struct ibv_flow_spec_tcp_udp) {
+		.type = IBV_FLOW_SPEC_TCP,
+		.size = sizeof(*tcp),
+	};
+	if (spec) {
+		tcp->mask.src_port = mask->hdr.src_port;
+		tcp->mask.dst_port = mask->hdr.dst_port;
+		/* Values are stored without unwanted bits. */
+		tcp->val.src_port = spec->hdr.src_port & mask->hdr.src_port;
+		tcp->val.dst_port = spec->hdr.dst_port & mask->hdr.dst_port;
+	}
+	return 0;
+}
+
+/**
+ * Run generic sanity checks on a pattern item.
+ *
+ * Verifies that "mask" and "last" come with a "spec", that the effective
+ * mask does not exceed what @p proc supports and that "spec" and "last"
+ * are identical once masked (ranges are not supported).
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_item_check(const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
+{
+	const uint8_t *supported;
+	const uint8_t *spec;
+	const uint8_t *last;
+	const uint8_t *mask;
+	unsigned int pos;
+
+	if (!item->spec) {
+		/* item->last and item->mask cannot exist without item->spec. */
+		if (item->mask || item->last)
+			return rte_flow_error_set
+				(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item,
+				 "\"mask\" or \"last\" field provided without a"
+				 " corresponding \"spec\"");
+		/* No spec, no mask, no problem. */
+		return 0;
+	}
+	mask = item->mask ? item->mask : proc->mask_default;
+	assert(mask);
+	supported = proc->mask_support;
+	spec = item->spec;
+	last = item->last;
+	/*
+	 * Walk the mask once, byte by byte, to make sure that:
+	 * - No bits are set outside proc->mask_support.
+	 * - item->spec and item->last do not differ on masked bits.
+	 */
+	for (pos = 0; pos < proc->mask_sz; ++pos) {
+		if (!mask[pos])
+			continue;
+		if ((mask[pos] & ~supported[pos]) != 0)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item, "unsupported field found in \"mask\"");
+		if (last && ((spec[pos] ^ last[pos]) & mask[pos]) != 0)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item,
+				 "range between \"spec\" and \"last\""
+				 " is larger than \"mask\"");
+	}
+	return 0;
+}
+
+/**
+ * Graph of supported items and associated actions.
+ *
+ * Indexed by pattern item type. Each entry lists the item types allowed to
+ * follow it (next_item), the masks used by mlx4_flow_item_check(), the
+ * callback converting the item to Verbs format and the size of the Verbs
+ * specification it appends to flow rule attributes.
+ */
+static const struct mlx4_flow_proc_item mlx4_flow_proc_item_list[] = {
+	/*
+	 * Pattern traversal in mlx4_flow_prepare() starts from the first
+	 * element of this array; a pattern must begin with an Ethernet item.
+	 */
+	[RTE_FLOW_ITEM_TYPE_END] = {
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_ETH),
+	},
+	[RTE_FLOW_ITEM_TYPE_ETH] = {
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_VLAN,
+				       RTE_FLOW_ITEM_TYPE_IPV4),
+		.mask_support = &(const struct rte_flow_item_eth){
+			/* Only destination MAC can be matched. */
+			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+		},
+		.mask_default = &rte_flow_item_eth_mask,
+		.mask_sz = sizeof(struct rte_flow_item_eth),
+		.merge = mlx4_flow_merge_eth,
+		.dst_sz = sizeof(struct ibv_flow_spec_eth),
+	},
+	[RTE_FLOW_ITEM_TYPE_VLAN] = {
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_IPV4),
+		.mask_support = &(const struct rte_flow_item_vlan){
+			/* Only TCI VID matching is supported. */
+			.tci = RTE_BE16(0x0fff),
+		},
+		.mask_default = &rte_flow_item_vlan_mask,
+		.mask_sz = sizeof(struct rte_flow_item_vlan),
+		.merge = mlx4_flow_merge_vlan,
+		/*
+		 * No extra room needed: mlx4_flow_merge_vlan() stores the
+		 * VLAN tag in the specification preceding it.
+		 */
+		.dst_sz = 0,
+	},
+	[RTE_FLOW_ITEM_TYPE_IPV4] = {
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_UDP,
+				       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask_support = &(const struct rte_flow_item_ipv4){
+			/* Only source and destination addresses. */
+			.hdr = {
+				.src_addr = RTE_BE32(0xffffffff),
+				.dst_addr = RTE_BE32(0xffffffff),
+			},
+		},
+		.mask_default = &rte_flow_item_ipv4_mask,
+		.mask_sz = sizeof(struct rte_flow_item_ipv4),
+		.merge = mlx4_flow_merge_ipv4,
+		.dst_sz = sizeof(struct ibv_flow_spec_ipv4),
+	},
+	/* UDP and TCP have no next_item list: nothing may follow them. */
+	[RTE_FLOW_ITEM_TYPE_UDP] = {
+		.mask_support = &(const struct rte_flow_item_udp){
+			/* Only source and destination ports. */
+			.hdr = {
+				.src_port = RTE_BE16(0xffff),
+				.dst_port = RTE_BE16(0xffff),
+			},
+		},
+		.mask_default = &rte_flow_item_udp_mask,
+		.mask_sz = sizeof(struct rte_flow_item_udp),
+		.merge = mlx4_flow_merge_udp,
+		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
+	},
+	[RTE_FLOW_ITEM_TYPE_TCP] = {
+		.mask_support = &(const struct rte_flow_item_tcp){
+			/* Only source and destination ports. */
+			.hdr = {
+				.src_port = RTE_BE16(0xffff),
+				.dst_port = RTE_BE16(0xffff),
+			},
+		},
+		.mask_default = &rte_flow_item_tcp_mask,
+		.mask_sz = sizeof(struct rte_flow_item_tcp),
+		.merge = mlx4_flow_merge_tcp,
+		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
+	},
+};
+
+/**
+ * Make sure a flow rule is supported and initialize associated structure.
+ *
+ * Processing is done in two passes over the pattern and actions. The first
+ * one works on a temporary handle located on the stack: it validates the
+ * rule and computes the size needed for Verbs attributes. When @p addr is
+ * provided, the actual handle is then allocated and a second pass fills
+ * it.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] pattern
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] addr
+ *   Buffer where the resulting flow rule handle pointer must be stored.
+ *   If NULL, stop processing after validation stage.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_prepare(struct priv *priv,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item pattern[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow **addr)
+{
+	const struct rte_flow_item *item;
+	const struct rte_flow_action *action;
+	const struct mlx4_flow_proc_item *proc;
+	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
+	struct rte_flow *flow = &temp;
+	const char *msg = NULL;
+	int overlap;
+
+	if (attr->group)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			 NULL, "groups are not supported");
+	if (attr->priority > MLX4_FLOW_PRIORITY_LAST)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			 NULL, "maximum priority level is "
+			 MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST));
+	if (attr->egress)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
+	if (!attr->ingress)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+			 NULL, "only ingress is supported");
+	/* Entry point of both passes (temporary, then allocated handle). */
+fill:
+	overlap = 0;
+	proc = mlx4_flow_proc_item_list;
+	flow->priority = attr->priority;
+	/* Go over pattern. */
+	for (item = pattern; item->type; ++item) {
+		const struct mlx4_flow_proc_item *next = NULL;
+		unsigned int i;
+		int err;
+
+		if (item->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		if (item->type == MLX4_FLOW_ITEM_TYPE_INTERNAL) {
+			flow->internal = 1;
+			continue;
+		}
+		if (flow->promisc || flow->allmulti) {
+			msg = "mlx4 does not support additional matching"
+				" criteria combined with indiscriminate"
+				" matching on Ethernet headers";
+			goto exit_item_not_supported;
+		}
+		/* Current item must be allowed to follow the previous one. */
+		for (i = 0; proc->next_item && proc->next_item[i]; ++i) {
+			if (proc->next_item[i] == item->type) {
+				next = &mlx4_flow_proc_item_list[item->type];
+				break;
+			}
+		}
+		if (!next)
+			goto exit_item_not_supported;
+		proc = next;
+		/*
+		 * Perform basic sanity checks only once, while handle is
+		 * not allocated.
+		 */
+		if (flow == &temp) {
+			err = mlx4_flow_item_check(item, proc, error);
+			if (err)
+				return err;
+		}
+		if (proc->merge) {
+			err = proc->merge(flow, item, proc, error);
+			if (err)
+				return err;
+		}
+		/* Next specification goes after the one just merged. */
+		flow->ibv_attr_size += proc->dst_sz;
+	}
+	/* Go over actions list. */
+	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
+		switch (action->type) {
+			const struct rte_flow_action_queue *queue;
+			const struct rte_flow_action_rss *rss;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
+			uint64_t fields;
+			unsigned int i;
+
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			flow->drop = 1;
+			break;
+		case RTE_FLOW_ACTION_TYPE_QUEUE:
+			/* RSS context was already obtained by first pass. */
+			if (flow->rss)
+				break;
+			queue = action->conf;
+			if (queue->index >= priv->dev->data->nb_rx_queues) {
+				msg = "queue target index beyond number of"
+					" configured Rx queues";
+				goto exit_action_not_supported;
+			}
+			/* A single queue is handled as a one-entry RSS context. */
+			flow->rss = mlx4_rss_get
+				(priv, 0, mlx4_rss_hash_key_default, 1,
+				 &queue->index);
+			if (!flow->rss) {
+				msg = "not enough resources for additional"
+					" single-queue RSS context";
+				goto exit_action_not_supported;
+			}
+			break;
+		case RTE_FLOW_ACTION_TYPE_RSS:
+			/* RSS context was already obtained by first pass. */
+			if (flow->rss)
+				break;
+			rss = action->conf;
+			/* Default RSS configuration if none is provided. */
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
+			/* Sanity checks. */
+			for (i = 0; i < rss->queue_num; ++i)
+				if (rss->queue[i] >=
+				    priv->dev->data->nb_rx_queues)
+					break;
+			if (i != rss->queue_num) {
+				msg = "queue index target beyond number of"
+					" configured Rx queues";
+				goto exit_action_not_supported;
+			}
+			if (!rte_is_power_of_2(rss->queue_num)) {
+				msg = "for RSS, mlx4 requires the number of"
+					" queues to be a power of two";
+				goto exit_action_not_supported;
+			}
+			if (rss_key_len != sizeof(flow->rss->key)) {
+				msg = "mlx4 supports exactly one RSS hash key"
+					" length: "
+					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
+				goto exit_action_not_supported;
+			}
+			for (i = 1; i < rss->queue_num; ++i)
+				if (rss->queue[i] - rss->queue[i - 1] != 1)
+					break;
+			if (i != rss->queue_num) {
+				msg = "mlx4 requires RSS contexts to use"
+					" consecutive queue indices only";
+				goto exit_action_not_supported;
+			}
+			if (rss->queue[0] % rss->queue_num) {
+				msg = "mlx4 requires the first queue of a RSS"
+					" context to be aligned on a multiple"
+					" of the context size";
+				goto exit_action_not_supported;
+			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
+			/*
+			 * (uint64_t)-1 alone is ambiguous, rte_errno tells
+			 * whether conversion actually failed.
+			 */
+			rte_errno = 0;
+			fields = mlx4_conv_rss_types(priv, rss->types, 0);
+			if (fields == (uint64_t)-1 && rte_errno) {
+				msg = "unsupported RSS hash type requested";
+				goto exit_action_not_supported;
+			}
+			flow->rss = mlx4_rss_get
+				(priv, fields, rss_key, rss->queue_num,
+				 rss->queue);
+			if (!flow->rss) {
+				msg = "either invalid parameters or not enough"
+					" resources for additional multi-queue"
+					" RSS context";
+				goto exit_action_not_supported;
+			}
+			break;
+		default:
+			goto exit_action_not_supported;
+		}
+	}
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
+	/* Validation ends here. */
+	if (!addr) {
+		if (flow->rss)
+			mlx4_rss_put(flow->rss);
+		return 0;
+	}
+	if (flow == &temp) {
+		/* Allocate proper handle based on collected data. */
+		const struct mlx4_malloc_vec vec[] = {
+			{
+				.align = alignof(struct rte_flow),
+				.size = sizeof(*flow),
+				.addr = (void **)&flow,
+			},
+			{
+				.align = alignof(struct ibv_flow_attr),
+				.size = temp.ibv_attr_size,
+				.addr = (void **)&temp.ibv_attr,
+			},
+		};
+
+		if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) {
+			if (temp.rss)
+				mlx4_rss_put(temp.rss);
+			/*
+			 * rte_flow_error_set() expects a positive code and
+			 * returns its negative counterpart.
+			 * NOTE(review): assumes mlx4_zmallocv() leaves a
+			 * positive value in rte_errno -- confirm.
+			 */
+			return rte_flow_error_set
+				(error, rte_errno,
+				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				 "flow rule handle allocation failure");
+		}
+		/* Most fields will be updated by second pass. */
+		*flow = (struct rte_flow){
+			.ibv_attr = temp.ibv_attr,
+			.ibv_attr_size = sizeof(*flow->ibv_attr),
+			.rss = temp.rss,
+		};
+		*flow->ibv_attr = (struct ibv_flow_attr){
+			.type = IBV_FLOW_ATTR_NORMAL,
+			.size = sizeof(*flow->ibv_attr),
+			.priority = attr->priority,
+			.port = priv->port,
+		};
+		goto fill;
+	}
+	*addr = flow;
+	return 0;
+exit_item_not_supported:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg ? msg : "item not supported");
+exit_action_not_supported:
+	/*
+	 * Release the RSS context reference a previous QUEUE/RSS action may
+	 * have taken (e.g. when followed by another fate-deciding action),
+	 * otherwise it would never be given back.
+	 */
+	if (flow->rss)
+		mlx4_rss_put(flow->rss);
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				  action, msg ? msg : "action not supported");
+}
+
+/**
+ * Validate a flow supported by the NIC.
+ *
+ * Runs mlx4_flow_prepare() without a destination handle so that processing
+ * stops after the validation stage.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+static int
+mlx4_flow_validate(struct rte_eth_dev *dev,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item pattern[],
+		   const struct rte_flow_action actions[],
+		   struct rte_flow_error *error)
+{
+	return mlx4_flow_prepare(dev->data->dev_private, attr, pattern,
+				 actions, error, NULL);
+}
+
+/**
+ * Get a drop flow rule resources instance.
+ *
+ * The instance is shared: when one already exists in @p priv, its
+ * reference count is incremented, otherwise a new one (CQ and raw packet
+ * QP) is created with a single reference.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   Pointer to drop flow resources on success, NULL otherwise and rte_errno
+ *   is set.
+ */
+static struct mlx4_drop *
+mlx4_drop_get(struct priv *priv)
+{
+	struct mlx4_drop *drop = priv->drop;
+
+	if (drop) {
+		assert(drop->refcnt);
+		assert(drop->priv == priv);
+		++drop->refcnt;
+		return drop;
+	}
+	drop = rte_malloc(__func__, sizeof(*drop), 0);
+	if (!drop)
+		goto error;
+	/* Also clears qp and cq, which error handling relies upon. */
+	*drop = (struct mlx4_drop){
+		.priv = priv,
+		.refcnt = 1,
+	};
+	drop->cq = mlx4_glue->create_cq(priv->ctx, 1, NULL, NULL, 0);
+	if (!drop->cq)
+		goto error;
+	drop->qp = mlx4_glue->create_qp
+		(priv->pd,
+		 &(struct ibv_qp_init_attr){
+			.send_cq = drop->cq,
+			.recv_cq = drop->cq,
+			.qp_type = IBV_QPT_RAW_PACKET,
+		 });
+	if (!drop->qp)
+		goto error;
+	priv->drop = drop;
+	return drop;
+error:
+	/* drop is NULL when allocation failed, do not dereference it. */
+	if (drop) {
+		if (drop->qp)
+			claim_zero(mlx4_glue->destroy_qp(drop->qp));
+		if (drop->cq)
+			claim_zero(mlx4_glue->destroy_cq(drop->cq));
+		rte_free(drop);
+	}
+	rte_errno = ENOMEM;
+	return NULL;
+}
+
+/**
+ * Give back a drop flow rule resources instance.
+ *
+ * Resources are destroyed and detached from the private structure once
+ * the last reference is released.
+ *
+ * @param drop
+ *   Pointer to drop flow rule resources.
+ */
+static void
+mlx4_drop_put(struct mlx4_drop *drop)
+{
+	assert(drop->refcnt);
+	--drop->refcnt;
+	if (drop->refcnt != 0)
+		return;
+	/* Last reference is gone. */
+	drop->priv->drop = NULL;
+	claim_zero(mlx4_glue->destroy_qp(drop->qp));
+	claim_zero(mlx4_glue->destroy_cq(drop->cq));
+	rte_free(drop);
+}
+
+/**
+ * Toggle a configured flow rule.
+ *
+ * Disabling a rule destroys its Verbs flow, if any, and releases the
+ * target (drop resources or RSS context) it was attached to.
+ *
+ * Enabling a rule creates its Verbs flow on the appropriate QP. A rule
+ * targeting RSS queues that do not all exist is temporarily turned into a
+ * drop rule; calling this function again on an enabled rule updates the
+ * Verbs flow if that situation has changed.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param flow
+ *   Flow rule handle to toggle.
+ * @param enable
+ *   Whether associated Verbs flow must be created or removed.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_toggle(struct priv *priv,
+		 struct rte_flow *flow,
+		 int enable,
+		 struct rte_flow_error *error)
+{
+	struct ibv_qp *qp = NULL;
+	const char *msg;
+	int err;
+
+	if (!enable) {
+		if (!flow->ibv_flow)
+			return 0;
+		claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow));
+		flow->ibv_flow = NULL;
+		if (flow->drop)
+			mlx4_drop_put(priv->drop);
+		else if (flow->rss)
+			mlx4_rss_detach(flow->rss);
+		return 0;
+	}
+	assert(flow->ibv_attr);
+	/* Lowest priority level is reserved outside isolated mode. */
+	if (!flow->internal &&
+	    !priv->isolated &&
+	    flow->ibv_attr->priority == MLX4_FLOW_PRIORITY_LAST) {
+		if (flow->ibv_flow) {
+			claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow));
+			flow->ibv_flow = NULL;
+			if (flow->drop)
+				mlx4_drop_put(priv->drop);
+			else if (flow->rss)
+				mlx4_rss_detach(flow->rss);
+		}
+		err = EACCES;
+		msg = ("priority level "
+		       MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST)
+		       " is reserved when not in isolated mode");
+		goto error;
+	}
+	if (flow->rss) {
+		struct mlx4_rss *rss = flow->rss;
+		int missing = 0;
+		unsigned int i;
+
+		/* Stop at the first nonexistent target queue. */
+		for (i = 0; i != rss->queues; ++i)
+			if (rss->queue_id[i] >=
+			    priv->dev->data->nb_rx_queues ||
+			    !priv->dev->data->rx_queues[rss->queue_id[i]]) {
+				missing = 1;
+				break;
+			}
+		if (flow->ibv_flow) {
+			/* Nothing to do if current state is still right. */
+			if (missing ^ !flow->drop)
+				return 0;
+			/* Verbs flow needs updating. */
+			claim_zero(mlx4_glue->destroy_flow(flow->ibv_flow));
+			flow->ibv_flow = NULL;
+			if (flow->drop)
+				mlx4_drop_put(priv->drop);
+			else
+				mlx4_rss_detach(rss);
+		}
+		if (!missing) {
+			err = mlx4_rss_attach(rss);
+			if (err) {
+				/* Error reporting expects a positive code. */
+				err = -err;
+				msg = "cannot create indirection table or hash"
+					" QP to associate flow rule with";
+				goto error;
+			}
+			qp = rss->qp;
+		}
+		/* A missing target queue drops traffic implicitly. */
+		flow->drop = missing;
+	}
+	if (flow->drop) {
+		if (flow->ibv_flow)
+			return 0;
+		mlx4_drop_get(priv);
+		if (!priv->drop) {
+			err = rte_errno;
+			msg = "resources for drop flow rule cannot be created";
+			goto error;
+		}
+		qp = priv->drop->qp;
+	}
+	assert(qp);
+	if (flow->ibv_flow)
+		return 0;
+	flow->ibv_flow = mlx4_glue->create_flow(qp, flow->ibv_attr);
+	if (flow->ibv_flow)
+		return 0;
+	/*
+	 * Save errno right away, releasing resources below may overwrite
+	 * it before it gets reported.
+	 */
+	err = errno;
+	if (flow->drop)
+		mlx4_drop_put(priv->drop);
+	else if (flow->rss)
+		mlx4_rss_detach(flow->rss);
+	msg = "flow rule rejected by device";
+error:
+	return rte_flow_error_set
+		(error, err, RTE_FLOW_ERROR_TYPE_HANDLE, flow, msg);
+}
+
+/**
+ * Create a flow.
+ *
+ * The rule is built by mlx4_flow_prepare(), applied according to
+ * @p priv->started through mlx4_flow_toggle(), then linked into
+ * @p priv->flows after the leading internal rules.
+ *
+ * @p error may be NULL, in which case it is never dereferenced here.
+ *
+ * @see rte_flow_create()
+ * @see rte_flow_ops
+ *
+ * @return
+ *   A flow handle on success, NULL otherwise (failure is reported through
+ *   rte_flow_error_set()).
+ */
+static struct rte_flow *
+mlx4_flow_create(struct rte_eth_dev *dev,
+		 const struct rte_flow_attr *attr,
+		 const struct rte_flow_item pattern[],
+		 const struct rte_flow_action actions[],
+		 struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow *flow;
+	int err;
+
+	err = mlx4_flow_prepare(priv, attr, pattern, actions, error, &flow);
+	if (err)
+		return NULL;
+	err = mlx4_flow_toggle(priv, flow, priv->started, error);
+	if (!err) {
+		struct rte_flow *curr = LIST_FIRST(&priv->flows);
+
+		/* New rules are inserted after internal ones. */
+		if (!curr || !curr->internal) {
+			LIST_INSERT_HEAD(&priv->flows, flow, next);
+		} else {
+			/* Walk to the last of the leading internal rules. */
+			while (LIST_NEXT(curr, next) &&
+			       LIST_NEXT(curr, next)->internal)
+				curr = LIST_NEXT(curr, next);
+			LIST_INSERT_AFTER(curr, flow, next);
+		}
+		return flow;
+	}
+	if (flow->rss)
+		mlx4_rss_put(flow->rss);
+	/*
+	 * The handle is freed below, so the failure is reported again without
+	 * it as the cause; only the message left by mlx4_flow_toggle() is
+	 * kept. The caller may have passed a NULL error structure, hence the
+	 * check before reading its message.
+	 */
+	rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			   error ? error->message : NULL);
+	rte_free(flow);
+	return NULL;
+}
+
+/**
+ * Enter or leave isolated mode.
+ *
+ * The new mode is applied by resynchronizing flow rules; when that fails,
+ * the previous mode is restored.
+ *
+ * @see rte_flow_isolate()
+ * @see rte_flow_ops
+ */
+static int
+mlx4_flow_isolate(struct rte_eth_dev *dev,
+		  int enable,
+		  struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	const int requested = !!enable;
+
+	/* Already in the requested mode. */
+	if (requested == !!priv->isolated)
+		return 0;
+	priv->isolated = requested;
+	if (!mlx4_flow_sync(priv, error))
+		return 0;
+	/* Synchronization failed, roll back to the previous mode. */
+	priv->isolated = !requested;
+	return -rte_errno;
+}
+
+/**
+ * Destroy a flow rule.
+ *
+ * The rule is first disabled, then unlinked from the list it belongs to and
+ * released along with its reference on the RSS context, if any.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+static int
+mlx4_flow_destroy(struct rte_eth_dev *dev,
+		  struct rte_flow *flow,
+		  struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret;
+
+	/* Disable the rule before releasing it. */
+	ret = mlx4_flow_toggle(priv, flow, 0, error);
+	if (ret != 0)
+		return ret;
+	LIST_REMOVE(flow, next);
+	if (flow->rss != NULL)
+		mlx4_rss_put(flow->rss);
+	rte_free(flow);
+	return 0;
+}
+
+/**
+ * Destroy user-configured flow rules.
+ *
+ * Internal flow rules are left untouched.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+static int
+mlx4_flow_flush(struct rte_eth_dev *dev,
+		struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow *flow;
+	struct rte_flow *following;
+
+	for (flow = LIST_FIRST(&priv->flows); flow != NULL; flow = following) {
+		/* Fetch the successor first, the current rule may be freed. */
+		following = LIST_NEXT(flow, next);
+		if (flow->internal)
+			continue;
+		mlx4_flow_destroy(dev, flow, error);
+	}
+	return 0;
+}
+
+/**
+ * Look up the next configured VLAN filter.
+ *
+ * The VLAN filter bit-field is scanned upward from @p vlan (included).
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param vlan
+ *   VLAN ID to use as a starting point.
+ *
+ * @return
+ *   Next configured VLAN ID or a high value (>= 4096) if there is none.
+ */
+static uint16_t
+mlx4_flow_internal_next_vlan(struct priv *priv, uint16_t vlan)
+{
+	for (; vlan < 4096; ++vlan) {
+		const uint64_t word =
+			priv->dev->data->vlan_filter_conf.ids[vlan / 64];
+
+		/* One bit per VLAN ID, 64 IDs per word. */
+		if ((word >> (vlan % 64)) & UINT64_C(1))
+			break;
+	}
+	return vlan;
+}
+
+/**
+ * Generate internal flow rules.
+ *
+ * Various flow rules are created depending on the mode the device is in:
+ *
+ * 1. Promiscuous:
+ *       port MAC + broadcast + catch-all (VLAN filtering is ignored).
+ * 2. All multicast:
+ *       port MAC/VLAN + broadcast + catch-all multicast.
+ * 3. Otherwise:
+ *       port MAC/VLAN + broadcast MAC/VLAN.
+ *
+ * About MAC flow rules:
+ *
+ * - MAC flow rules are generated from @p dev->data->mac_addrs
+ *   (@p priv->mac array).
+ * - An additional flow rule for Ethernet broadcasts is also generated.
+ * - All these are per-VLAN if @p DEV_RX_OFFLOAD_VLAN_FILTER
+ *   is enabled and VLAN filters are configured.
+ *
+ * Rules that are still needed (found or newly created) get their @p select
+ * bit set; the final pass destroys internal rules left unselected and
+ * clears the bit on the others.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
+{
+	struct rte_flow_attr attr = {
+		.priority = MLX4_FLOW_PRIORITY_LAST,
+		.ingress = 1,
+	};
+	struct rte_flow_item_eth eth_spec;
+	const struct rte_flow_item_eth eth_mask = {
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+	};
+	const struct rte_flow_item_eth eth_allmulti = {
+		.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+	};
+	struct rte_flow_item_vlan vlan_spec;
+	const struct rte_flow_item_vlan vlan_mask = {
+		.tci = RTE_BE16(0x0fff),
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = MLX4_FLOW_ITEM_TYPE_INTERNAL,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.spec = &eth_spec,
+			.mask = &eth_mask,
+		},
+		{
+			/* Replaced with VLAN if filtering is enabled. */
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	/*
+	 * Round number of queues down to their previous power of 2 to
+	 * comply with RSS context limitations. Extra queues silently do not
+	 * get RSS by default.
+	 */
+	uint32_t queues =
+		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
+	/*
+	 * NOTE(review): this array is sized before the "no Rx queues" check
+	 * below; presumably queues can be 0 here, which would make it a
+	 * zero-length VLA -- confirm.
+	 */
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
+		.types = 0,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
+		.queue = queue,
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	/* Both pointers alias the specs referenced by pattern[]. */
+	struct ether_addr *rule_mac = &eth_spec.dst;
+	rte_be16_t *rule_vlan =
+		(priv->dev->data->dev_conf.rxmode.offloads &
+		 DEV_RX_OFFLOAD_VLAN_FILTER) &&
+		!priv->dev->data->promiscuous ?
+		&vlan_spec.tci :
+		NULL;
+	uint16_t vlan = 0;
+	struct rte_flow *flow;
+	unsigned int i;
+	int err = 0;
+
+	/* Nothing to be done if there are no Rx queues. */
+	if (!queues)
+		goto error;
+	/* Prepare default RSS configuration. */
+	for (i = 0; i != queues; ++i)
+		queue[i] = i;
+	/*
+	 * Set up VLAN item if filtering is enabled and at least one VLAN
+	 * filter is configured.
+	 */
+	if (rule_vlan) {
+		vlan = mlx4_flow_internal_next_vlan(priv, 0);
+		if (vlan < 4096) {
+			pattern[2] = (struct rte_flow_item){
+				.type = RTE_FLOW_ITEM_TYPE_VLAN,
+				.spec = &vlan_spec,
+				.mask = &vlan_mask,
+			};
+/* Re-entered from below once for each additional configured VLAN. */
+next_vlan:
+			*rule_vlan = rte_cpu_to_be_16(vlan);
+		} else {
+			rule_vlan = NULL;
+		}
+	}
+	/* One iteration per priv->mac[] slot, plus one for broadcast. */
+	for (i = 0; i != RTE_DIM(priv->mac) + 1; ++i) {
+		const struct ether_addr *mac;
+
+		/* Broadcasts are handled by an extra iteration. */
+		if (i < RTE_DIM(priv->mac))
+			mac = &priv->mac[i];
+		else
+			mac = &eth_mask.dst;
+		if (is_zero_ether_addr(mac))
+			continue;
+		/* Check if MAC flow rule is already present. */
+		for (flow = LIST_FIRST(&priv->flows);
+		     flow && flow->internal;
+		     flow = LIST_NEXT(flow, next)) {
+			/* Ethernet spec located right after the attribute. */
+			const struct ibv_flow_spec_eth *eth =
+				(const void *)((uintptr_t)flow->ibv_attr +
+					       sizeof(*flow->ibv_attr));
+			unsigned int j;
+
+			if (!flow->mac)
+				continue;
+			assert(flow->ibv_attr->type == IBV_FLOW_ATTR_NORMAL);
+			assert(flow->ibv_attr->num_of_specs == 1);
+			assert(eth->type == IBV_FLOW_SPEC_ETH);
+			assert(flow->rss);
+			/* VLAN matching must agree with the current pass. */
+			if (rule_vlan &&
+			    (eth->val.vlan_tag != *rule_vlan ||
+			     eth->mask.vlan_tag != RTE_BE16(0x0fff)))
+				continue;
+			if (!rule_vlan && eth->mask.vlan_tag)
+				continue;
+			/* Exact destination MAC, source MAC unconstrained. */
+			for (j = 0; j != sizeof(mac->addr_bytes); ++j)
+				if (eth->val.dst_mac[j] != mac->addr_bytes[j] ||
+				    eth->mask.dst_mac[j] != UINT8_C(0xff) ||
+				    eth->val.src_mac[j] != UINT8_C(0x00) ||
+				    eth->mask.src_mac[j] != UINT8_C(0x00))
+					break;
+			if (j != sizeof(mac->addr_bytes))
+				continue;
+			/* Target queues must match the default RSS setup. */
+			if (flow->rss->queues != queues ||
+			    memcmp(flow->rss->queue_id, action_rss.queue,
+				   queues * sizeof(flow->rss->queue_id[0])))
+				continue;
+			break;
+		}
+		if (!flow || !flow->internal) {
+			/* Not found, create a new flow rule. */
+			memcpy(rule_mac, mac, sizeof(*mac));
+			flow = mlx4_flow_create(priv->dev, &attr, pattern,
+						actions, error);
+			if (!flow) {
+				err = -rte_errno;
+				goto error;
+			}
+		}
+		/* Mark the rule as still needed. */
+		flow->select = 1;
+		flow->mac = 1;
+	}
+	if (rule_vlan) {
+		vlan = mlx4_flow_internal_next_vlan(priv, vlan + 1);
+		if (vlan < 4096)
+			goto next_vlan;
+	}
+	/* Take care of promiscuous and all multicast flow rules. */
+	if (priv->dev->data->promiscuous || priv->dev->data->all_multicast) {
+		for (flow = LIST_FIRST(&priv->flows);
+		     flow && flow->internal;
+		     flow = LIST_NEXT(flow, next)) {
+			if (priv->dev->data->promiscuous) {
+				if (flow->promisc)
+					break;
+			} else {
+				assert(priv->dev->data->all_multicast);
+				if (flow->allmulti)
+					break;
+			}
+		}
+		if (flow && flow->internal) {
+			assert(flow->rss);
+			/* Ignore a match whose target queues are outdated. */
+			if (flow->rss->queues != queues ||
+			    memcmp(flow->rss->queue_id, action_rss.queue,
+				   queues * sizeof(flow->rss->queue_id[0])))
+				flow = NULL;
+		}
+		if (!flow || !flow->internal) {
+			/* Not found, create a new flow rule. */
+			if (priv->dev->data->promiscuous) {
+				pattern[1].spec = NULL;
+				pattern[1].mask = NULL;
+			} else {
+				assert(priv->dev->data->all_multicast);
+				pattern[1].spec = &eth_allmulti;
+				pattern[1].mask = &eth_allmulti;
+			}
+			/* Overwrite any VLAN item with the END item. */
+			pattern[2] = pattern[3];
+			flow = mlx4_flow_create(priv->dev, &attr, pattern,
+						actions, error);
+			if (!flow) {
+				err = -rte_errno;
+				goto error;
+			}
+		}
+		assert(flow->promisc || flow->allmulti);
+		flow->select = 1;
+	}
+/* Also reached on success; err is still 0 in that case. */
+error:
+	/* Clear selection and clean up stale internal flow rules. */
+	flow = LIST_FIRST(&priv->flows);
+	while (flow && flow->internal) {
+		struct rte_flow *next = LIST_NEXT(flow, next);
+
+		if (!flow->select)
+			claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+		else
+			flow->select = 0;
+		flow = next;
+	}
+	return err;
+}
+
+/**
+ * Synchronize flow rules.
+ *
+ * Flow rules are brought in line with the state of the device: internal
+ * rules are removed in isolated mode and refreshed otherwise, then every
+ * remaining rule is toggled according to whether the device is started.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error)
+{
+	struct rte_flow *flow;
+	int ret;
+
+	if (!priv->isolated) {
+		/* Refresh internal rules. */
+		ret = mlx4_flow_internal(priv, error);
+		if (ret)
+			return ret;
+	} else {
+		/*
+		 * Internal flow rules are guaranteed to come first in the
+		 * list: keep removing the head until it is no longer an
+		 * internal rule.
+		 */
+		while ((flow = LIST_FIRST(&priv->flows)) != NULL &&
+		       flow->internal)
+			claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+	}
+	/* Toggle the remaining flow rules. */
+	for (flow = LIST_FIRST(&priv->flows);
+	     flow != NULL;
+	     flow = LIST_NEXT(flow, next)) {
+		ret = mlx4_flow_toggle(priv, flow, priv->started, error);
+		if (ret)
+			return ret;
+	}
+	/* No drop resources may remain while the device is stopped. */
+	assert(priv->started || !priv->drop);
+	return 0;
+}
+
+/**
+ * Clean up all flow rules.
+ *
+ * As opposed to mlx4_flow_flush(), internal and user-configured flow rules
+ * are destroyed alike until the list is empty.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+mlx4_flow_clean(struct priv *priv)
+{
+	/* Each call unlinks the head of the list. */
+	while (!LIST_EMPTY(&priv->flows))
+		mlx4_flow_destroy(priv->dev, LIST_FIRST(&priv->flows), NULL);
+	assert(LIST_EMPTY(&priv->rss));
+}
+
+/**
+ * Flow API callbacks implemented by this file.
+ *
+ * A pointer to this table is handed out by mlx4_filter_ctrl().
+ */
+static const struct rte_flow_ops mlx4_flow_ops = {
+	.validate = mlx4_flow_validate,
+	.create = mlx4_flow_create,
+	.destroy = mlx4_flow_destroy,
+	.flush = mlx4_flow_flush,
+	.isolate = mlx4_flow_isolate,
+};
+
+/**
+ * Manage filter operations.
+ *
+ * Only retrieving the flow API callbacks (RTE_ETH_FILTER_GET on the generic
+ * filter type) is supported.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_filter_ctrl(struct rte_eth_dev *dev,
+		 enum rte_filter_type filter_type,
+		 enum rte_filter_op filter_op,
+		 void *arg)
+{
+	if (filter_type != RTE_ETH_FILTER_GENERIC) {
+		ERROR("%p: filter type (%d) not supported",
+		      (void *)dev, filter_type);
+	} else if (filter_op == RTE_ETH_FILTER_GET) {
+		/* Hand the flow API callbacks over to the caller. */
+		*(const void **)arg = &mlx4_flow_ops;
+		return 0;
+	}
+	rte_errno = ENOTSUP;
+	return -rte_errno;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.h
new file mode 100644
index 00000000..2917ebe9
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_flow.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX4_FLOW_H_
+#define RTE_PMD_MLX4_FLOW_H_
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev_driver.h>
+#include <rte_flow.h>
+#include <rte_flow_driver.h>
+#include <rte_byteorder.h>
+
+/**
+ * Last and lowest priority level for a flow rule.
+ *
+ * mlx4_flow_toggle() refuses it for non-internal rules when the device is
+ * not in isolated mode.
+ */
+#define MLX4_FLOW_PRIORITY_LAST UINT32_C(0xfff)
+
+/** Meta pattern item used to distinguish internal rules. */
+#define MLX4_FLOW_ITEM_TYPE_INTERNAL ((enum rte_flow_item_type)-1)
+
+/**
+ * PMD-specific (mlx4) definition of a flow rule handle.
+ *
+ * Handles are linked together through @p next; mlx4_flow.c keeps internal
+ * rules ahead of user-configured ones in that list.
+ */
+struct rte_flow {
+	LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
+	struct ibv_flow *ibv_flow; /**< Verbs flow, NULL while disabled. */
+	struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
+	uint32_t ibv_attr_size; /**< Size of Verbs attributes. */
+	uint32_t select:1; /**< Used by operations on the linked list. */
+	uint32_t internal:1; /**< Internal flow rule outside isolated mode. */
+	uint32_t mac:1; /**< Rule associated with a configured MAC address. */
+	uint32_t promisc:1; /**< This rule matches everything. */
+	uint32_t allmulti:1; /**< This rule matches all multicast traffic. */
+	uint32_t drop:1; /**< This rule drops packets. */
+	uint32_t priority; /**< Flow rule priority. */
+	struct mlx4_rss *rss; /**< Rx target. */
+};
+
+/* mlx4_flow.c */
+
+/*
+ * NOTE(review): definition not visible from this header; presumably
+ * converts RSS type bits between DPDK and Verbs depending on
+ * verbs_to_dpdk -- confirm against mlx4_flow.c.
+ */
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t types,
+			     int verbs_to_dpdk);
+/* Synchronize flow rules with the state of the device. */
+int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
+/* Destroy all flow rules, internal ones included. */
+void mlx4_flow_clean(struct priv *priv);
+/* Filter control entry point, provides access to the flow API callbacks. */
+int mlx4_filter_ctrl(struct rte_eth_dev *dev,
+		     enum rte_filter_type filter_type,
+		     enum rte_filter_op filter_op,
+		     void *arg);
+
+#endif /* RTE_PMD_MLX4_FLOW_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.c
new file mode 100644
index 00000000..67b3bfac
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.c
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include "mlx4_glue.h"
+
+/** Glue wrapper forwarding to ibv_fork_init(). */
+static int
+mlx4_glue_fork_init(void)
+{
+	return ibv_fork_init();
+}
+
+/** Glue wrapper forwarding to ibv_get_async_event(). */
+static int
+mlx4_glue_get_async_event(struct ibv_context *context,
+			  struct ibv_async_event *event)
+{
+	return ibv_get_async_event(context, event);
+}
+
+/** Glue wrapper forwarding to ibv_ack_async_event(). */
+static void
+mlx4_glue_ack_async_event(struct ibv_async_event *event)
+{
+	ibv_ack_async_event(event);
+}
+
+/** Glue wrapper forwarding to ibv_alloc_pd(). */
+static struct ibv_pd *
+mlx4_glue_alloc_pd(struct ibv_context *context)
+{
+	return ibv_alloc_pd(context);
+}
+
+/** Glue wrapper forwarding to ibv_dealloc_pd(). */
+static int
+mlx4_glue_dealloc_pd(struct ibv_pd *pd)
+{
+	return ibv_dealloc_pd(pd);
+}
+
+/** Glue wrapper forwarding to ibv_get_device_list(). */
+static struct ibv_device **
+mlx4_glue_get_device_list(int *num_devices)
+{
+	return ibv_get_device_list(num_devices);
+}
+
+/** Glue wrapper forwarding to ibv_free_device_list(). */
+static void
+mlx4_glue_free_device_list(struct ibv_device **list)
+{
+	ibv_free_device_list(list);
+}
+
+/** Glue wrapper forwarding to ibv_open_device(). */
+static struct ibv_context *
+mlx4_glue_open_device(struct ibv_device *device)
+{
+	return ibv_open_device(device);
+}
+
+/** Glue wrapper forwarding to ibv_close_device(). */
+static int
+mlx4_glue_close_device(struct ibv_context *context)
+{
+	return ibv_close_device(context);
+}
+
+/** Glue wrapper forwarding to ibv_get_device_name(). */
+static const char *
+mlx4_glue_get_device_name(struct ibv_device *device)
+{
+	return ibv_get_device_name(device);
+}
+
+/** Glue wrapper forwarding to ibv_query_device(). */
+static int
+mlx4_glue_query_device(struct ibv_context *context,
+		       struct ibv_device_attr *device_attr)
+{
+	return ibv_query_device(context, device_attr);
+}
+
+/** Glue wrapper forwarding to ibv_query_device_ex(). */
+static int
+mlx4_glue_query_device_ex(struct ibv_context *context,
+			  const struct ibv_query_device_ex_input *input,
+			  struct ibv_device_attr_ex *attr)
+{
+	return ibv_query_device_ex(context, input, attr);
+}
+
+/** Glue wrapper forwarding to ibv_query_port(). */
+static int
+mlx4_glue_query_port(struct ibv_context *context, uint8_t port_num,
+		     struct ibv_port_attr *port_attr)
+{
+	return ibv_query_port(context, port_num, port_attr);
+}
+
+/** Glue wrapper forwarding to ibv_port_state_str(). */
+static const char *
+mlx4_glue_port_state_str(enum ibv_port_state port_state)
+{
+	return ibv_port_state_str(port_state);
+}
+
+/** Glue wrapper forwarding to ibv_create_comp_channel(). */
+static struct ibv_comp_channel *
+mlx4_glue_create_comp_channel(struct ibv_context *context)
+{
+	return ibv_create_comp_channel(context);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_comp_channel(). */
+static int
+mlx4_glue_destroy_comp_channel(struct ibv_comp_channel *channel)
+{
+	return ibv_destroy_comp_channel(channel);
+}
+
+/** Glue wrapper forwarding to ibv_create_cq(). */
+static struct ibv_cq *
+mlx4_glue_create_cq(struct ibv_context *context, int cqe, void *cq_context,
+		    struct ibv_comp_channel *channel, int comp_vector)
+{
+	return ibv_create_cq(context, cqe, cq_context, channel, comp_vector);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_cq(). */
+static int
+mlx4_glue_destroy_cq(struct ibv_cq *cq)
+{
+	return ibv_destroy_cq(cq);
+}
+
+/** Glue wrapper forwarding to ibv_get_cq_event(). */
+static int
+mlx4_glue_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
+		       void **cq_context)
+{
+	return ibv_get_cq_event(channel, cq, cq_context);
+}
+
+/** Glue wrapper forwarding to ibv_ack_cq_events(). */
+static void
+mlx4_glue_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
+{
+	ibv_ack_cq_events(cq, nevents);
+}
+
+/** Glue wrapper forwarding to ibv_create_flow(). */
+static struct ibv_flow *
+mlx4_glue_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow)
+{
+	return ibv_create_flow(qp, flow);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_flow(). */
+static int
+mlx4_glue_destroy_flow(struct ibv_flow *flow_id)
+{
+	return ibv_destroy_flow(flow_id);
+}
+
+/** Glue wrapper forwarding to ibv_create_qp(). */
+static struct ibv_qp *
+mlx4_glue_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
+{
+	return ibv_create_qp(pd, qp_init_attr);
+}
+
+/** Glue wrapper forwarding to ibv_create_qp_ex(). */
+static struct ibv_qp *
+mlx4_glue_create_qp_ex(struct ibv_context *context,
+		       struct ibv_qp_init_attr_ex *qp_init_attr_ex)
+{
+	return ibv_create_qp_ex(context, qp_init_attr_ex);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_qp(). */
+static int
+mlx4_glue_destroy_qp(struct ibv_qp *qp)
+{
+	return ibv_destroy_qp(qp);
+}
+
+/** Glue wrapper forwarding to ibv_modify_qp(). */
+static int
+mlx4_glue_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask)
+{
+	return ibv_modify_qp(qp, attr, attr_mask);
+}
+
+/** Glue wrapper forwarding to ibv_reg_mr(). */
+static struct ibv_mr *
+mlx4_glue_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access)
+{
+	return ibv_reg_mr(pd, addr, length, access);
+}
+
+/** Glue wrapper forwarding to ibv_dereg_mr(). */
+static int
+mlx4_glue_dereg_mr(struct ibv_mr *mr)
+{
+	return ibv_dereg_mr(mr);
+}
+
+/** Glue wrapper forwarding to ibv_create_rwq_ind_table(). */
+static struct ibv_rwq_ind_table *
+mlx4_glue_create_rwq_ind_table(struct ibv_context *context,
+			       struct ibv_rwq_ind_table_init_attr *init_attr)
+{
+	return ibv_create_rwq_ind_table(context, init_attr);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_rwq_ind_table(). */
+static int
+mlx4_glue_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table)
+{
+	return ibv_destroy_rwq_ind_table(rwq_ind_table);
+}
+
+/** Glue wrapper forwarding to ibv_create_wq(). */
+static struct ibv_wq *
+mlx4_glue_create_wq(struct ibv_context *context,
+		    struct ibv_wq_init_attr *wq_init_attr)
+{
+	return ibv_create_wq(context, wq_init_attr);
+}
+
+/** Glue wrapper forwarding to ibv_destroy_wq(). */
+static int
+mlx4_glue_destroy_wq(struct ibv_wq *wq)
+{
+	return ibv_destroy_wq(wq);
+}
+/** Glue wrapper forwarding to ibv_modify_wq(). */
+static int
+mlx4_glue_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr)
+{
+	return ibv_modify_wq(wq, wq_attr);
+}
+
+/** Glue wrapper forwarding to mlx4dv_init_obj(). */
+static int
+mlx4_glue_dv_init_obj(struct mlx4dv_obj *obj, uint64_t obj_type)
+{
+	return mlx4dv_init_obj(obj, obj_type);
+}
+
+/** Glue wrapper forwarding to mlx4dv_set_context_attr(). */
+static int
+mlx4_glue_dv_set_context_attr(struct ibv_context *context,
+			      enum mlx4dv_set_ctx_attr_type attr_type,
+			      void *attr)
+{
+	return mlx4dv_set_context_attr(context, attr_type, attr);
+}
+
+/**
+ * Glue table instance.
+ *
+ * Each member points to the static wrapper of the same name defined in this
+ * file; @p version is taken from MLX4_GLUE_VERSION.
+ */
+const struct mlx4_glue *mlx4_glue = &(const struct mlx4_glue){
+	.version = MLX4_GLUE_VERSION,
+	.fork_init = mlx4_glue_fork_init,
+	.get_async_event = mlx4_glue_get_async_event,
+	.ack_async_event = mlx4_glue_ack_async_event,
+	.alloc_pd = mlx4_glue_alloc_pd,
+	.dealloc_pd = mlx4_glue_dealloc_pd,
+	.get_device_list = mlx4_glue_get_device_list,
+	.free_device_list = mlx4_glue_free_device_list,
+	.open_device = mlx4_glue_open_device,
+	.close_device = mlx4_glue_close_device,
+	.get_device_name = mlx4_glue_get_device_name,
+	.query_device = mlx4_glue_query_device,
+	.query_device_ex = mlx4_glue_query_device_ex,
+	.query_port = mlx4_glue_query_port,
+	.port_state_str = mlx4_glue_port_state_str,
+	.create_comp_channel = mlx4_glue_create_comp_channel,
+	.destroy_comp_channel = mlx4_glue_destroy_comp_channel,
+	.create_cq = mlx4_glue_create_cq,
+	.destroy_cq = mlx4_glue_destroy_cq,
+	.get_cq_event = mlx4_glue_get_cq_event,
+	.ack_cq_events = mlx4_glue_ack_cq_events,
+	.create_flow = mlx4_glue_create_flow,
+	.destroy_flow = mlx4_glue_destroy_flow,
+	.create_qp = mlx4_glue_create_qp,
+	.create_qp_ex = mlx4_glue_create_qp_ex,
+	.destroy_qp = mlx4_glue_destroy_qp,
+	.modify_qp = mlx4_glue_modify_qp,
+	.reg_mr = mlx4_glue_reg_mr,
+	.dereg_mr = mlx4_glue_dereg_mr,
+	.create_rwq_ind_table = mlx4_glue_create_rwq_ind_table,
+	.destroy_rwq_ind_table = mlx4_glue_destroy_rwq_ind_table,
+	.create_wq = mlx4_glue_create_wq,
+	.destroy_wq = mlx4_glue_destroy_wq,
+	.modify_wq = mlx4_glue_modify_wq,
+	.dv_init_obj = mlx4_glue_dv_init_obj,
+	.dv_set_context_attr = mlx4_glue_dv_set_context_attr,
+};
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.h
new file mode 100644
index 00000000..668ca867
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_glue.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef MLX4_GLUE_H_
+#define MLX4_GLUE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+/*
+ * Fall back to an empty version string when MLX4_GLUE_VERSION is not
+ * already defined before this header is processed.
+ */
+#ifndef MLX4_GLUE_VERSION
+#define MLX4_GLUE_VERSION ""
+#endif
+
+/*
+ * Table of function pointers whose signatures mirror the Verbs (ibv_*) and
+ * mlx4 direct Verbs (mlx4dv_*) calls they are named after, preceded by a
+ * version string.
+ *
+ * LIB_GLUE_VERSION must be updated every time this structure is modified.
+ */
+struct mlx4_glue {
+	const char *version;
+	int (*fork_init)(void);
+	int (*get_async_event)(struct ibv_context *context,
+			       struct ibv_async_event *event);
+	void (*ack_async_event)(struct ibv_async_event *event);
+	struct ibv_pd *(*alloc_pd)(struct ibv_context *context);
+	int (*dealloc_pd)(struct ibv_pd *pd);
+	struct ibv_device **(*get_device_list)(int *num_devices);
+	void (*free_device_list)(struct ibv_device **list);
+	struct ibv_context *(*open_device)(struct ibv_device *device);
+	int (*close_device)(struct ibv_context *context);
+	const char *(*get_device_name)(struct ibv_device *device);
+	int (*query_device)(struct ibv_context *context,
+			    struct ibv_device_attr *device_attr);
+	int (*query_device_ex)(struct ibv_context *context,
+			       const struct ibv_query_device_ex_input *input,
+			       struct ibv_device_attr_ex *attr);
+	int (*query_port)(struct ibv_context *context, uint8_t port_num,
+			  struct ibv_port_attr *port_attr);
+	const char *(*port_state_str)(enum ibv_port_state port_state);
+	struct ibv_comp_channel *(*create_comp_channel)
+		(struct ibv_context *context);
+	int (*destroy_comp_channel)(struct ibv_comp_channel *channel);
+	struct ibv_cq *(*create_cq)(struct ibv_context *context, int cqe,
+				    void *cq_context,
+				    struct ibv_comp_channel *channel,
+				    int comp_vector);
+	int (*destroy_cq)(struct ibv_cq *cq);
+	int (*get_cq_event)(struct ibv_comp_channel *channel,
+			    struct ibv_cq **cq, void **cq_context);
+	void (*ack_cq_events)(struct ibv_cq *cq, unsigned int nevents);
+	struct ibv_flow *(*create_flow)(struct ibv_qp *qp,
+					struct ibv_flow_attr *flow);
+	int (*destroy_flow)(struct ibv_flow *flow_id);
+	struct ibv_qp *(*create_qp)(struct ibv_pd *pd,
+				    struct ibv_qp_init_attr *qp_init_attr);
+	struct ibv_qp *(*create_qp_ex)
+		(struct ibv_context *context,
+		 struct ibv_qp_init_attr_ex *qp_init_attr_ex);
+	int (*destroy_qp)(struct ibv_qp *qp);
+	int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+			 int attr_mask);
+	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr,
+				 size_t length, int access);
+	int (*dereg_mr)(struct ibv_mr *mr);
+	struct ibv_rwq_ind_table *(*create_rwq_ind_table)
+		(struct ibv_context *context,
+		 struct ibv_rwq_ind_table_init_attr *init_attr);
+	int (*destroy_rwq_ind_table)(struct ibv_rwq_ind_table *rwq_ind_table);
+	struct ibv_wq *(*create_wq)(struct ibv_context *context,
+				    struct ibv_wq_init_attr *wq_init_attr);
+	int (*destroy_wq)(struct ibv_wq *wq);
+	int (*modify_wq)(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr);
+	int (*dv_init_obj)(struct mlx4dv_obj *obj, uint64_t obj_type);
+	int (*dv_set_context_attr)(struct ibv_context *context,
+				   enum mlx4dv_set_ctx_attr_type attr_type,
+				   void *attr);
+};
+
+/*
+ * Pointer to the glue table in use.
+ *
+ * NOTE(review): this line has neither "extern" nor an initializer, so each
+ * file including this header gets a tentative definition; toolchains that
+ * default to -fno-common may reject that at link time. Confirm where the
+ * single definition is meant to live before changing it.
+ */
+const struct mlx4_glue *mlx4_glue;
+
+#endif /* MLX4_GLUE_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_intr.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_intr.c
new file mode 100644
index 00000000..eeb982a0
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_intr.c
@@ -0,0 +1,406 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Interrupts handling for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+#include <rte_ethdev_driver.h>
+#include <rte_io.h>
+#include <rte_interrupts.h>
+
+#include "mlx4.h"
+#include "mlx4_glue.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+static int mlx4_link_status_check(struct priv *priv);
+
+/**
+ * Tear down Rx interrupts vector state.
+ *
+ * Calls rte_intr_free_epoll_fd() on the interrupt handle, frees the queue
+ * vector and resets the related handle fields.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_rx_intr_vec_disable(struct priv *priv)
+{
+	struct rte_intr_handle *const handle = &priv->intr_handle;
+
+	rte_intr_free_epoll_fd(handle);
+	free(handle->intr_vec);
+	handle->intr_vec = NULL;
+	handle->nb_efd = 0;
+}
+
+/**
+ * Allocate queue vector and fill epoll fd list for Rx interrupts.
+ *
+ * Any previous vector is released first. Queues that do not exist or have
+ * no completion channel get an out-of-range vector entry.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_rx_intr_vec_enable(struct priv *priv)
+{
+	struct rte_intr_handle *intr_handle = &priv->intr_handle;
+	unsigned int rxqs_n = priv->dev->data->nb_rx_queues;
+	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+	unsigned int count = 0;
+	unsigned int i;
+
+	mlx4_rx_intr_vec_disable(priv);
+	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
+	if (intr_handle->intr_vec == NULL) {
+		rte_errno = ENOMEM;
+		ERROR("failed to allocate memory for interrupt vector,"
+		      " Rx interrupts will not be supported");
+		return -rte_errno;
+	}
+	for (i = 0; i != n; ++i) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+		if (rxq && rxq->channel) {
+			if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
+				rte_errno = E2BIG;
+				ERROR("too many Rx queues for interrupt vector size"
+				      " (%d), Rx interrupts cannot be enabled",
+				      RTE_MAX_RXTX_INTR_VEC_ID);
+				mlx4_rx_intr_vec_disable(priv);
+				return -rte_errno;
+			}
+			/* Bind this queue to the next free event fd slot. */
+			intr_handle->intr_vec[i] =
+				RTE_INTR_VEC_RXTX_OFFSET + count;
+			intr_handle->efds[count] = rxq->channel->fd;
+			count++;
+		} else {
+			/*
+			 * This queue cannot request interrupts, use an
+			 * invalid intr_vec[] index to disable its entry.
+			 */
+			intr_handle->intr_vec[i] =
+				RTE_INTR_VEC_RXTX_OFFSET +
+				RTE_MAX_RXTX_INTR_VEC_ID;
+		}
+	}
+	if (count)
+		intr_handle->nb_efd = count;
+	else
+		mlx4_rx_intr_vec_disable(priv);
+	return 0;
+}
+
+/**
+ * Process scheduled link status check.
+ *
+ * The pending alarm flag is cleared, then the LSC callback is processed
+ * when LSC interrupts are requested and link status is consistent.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_link_status_alarm(struct priv *priv)
+{
+	assert(priv->intr_alarm == 1);
+	priv->intr_alarm = 0;
+	/* LSC interrupts not requested. */
+	if (!priv->dev->data->dev_conf.intr_conf.lsc)
+		return;
+	/* Status not consistent (yet) or check failed. */
+	if (mlx4_link_status_check(priv))
+		return;
+	_rte_eth_dev_callback_process(priv->dev, RTE_ETH_EVENT_INTR_LSC, NULL);
+}
+
+/**
+ * Check link status.
+ *
+ * Link status is considered inconsistent when exactly one of link speed
+ * and link status is nonzero, in which case another check is scheduled
+ * unless one is already pending.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success (link status is consistent), negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx4_link_status_check(struct priv *priv)
+{
+	const struct rte_eth_link *link = &priv->dev->data->dev_link;
+	int ret;
+
+	ret = mlx4_link_update(priv->dev, 0);
+	if (ret)
+		return ret;
+	/* Speed and status agree, nothing else to do. */
+	if (!link->link_speed == !link->link_status)
+		return 0;
+	if (!priv->intr_alarm) {
+		/* Inconsistent status, check again later. */
+		ret = rte_eal_alarm_set(MLX4_INTR_ALARM_TIMEOUT,
+					(void (*)(void *))
+					mlx4_link_status_alarm,
+					priv);
+		if (ret)
+			return ret;
+		priv->intr_alarm = 1;
+	}
+	rte_errno = EINPROGRESS;
+	return -rte_errno;
+}
+
+/**
+ * Handle interrupts from the NIC.
+ *
+ * Pending asynchronous events are read and acknowledged one by one, then
+ * the callback of each event type caught at least once is processed.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_interrupt_handler(struct priv *priv)
+{
+	enum { LSC, RMV, };
+	static const enum rte_eth_event_type type[] = {
+		[LSC] = RTE_ETH_EVENT_INTR_LSC,
+		[RMV] = RTE_ETH_EVENT_INTR_RMV,
+	};
+	const struct rte_intr_conf *const intr_conf =
+		&priv->dev->data->dev_conf.intr_conf;
+	uint32_t caught[RTE_DIM(type)] = { 0 };
+	struct ibv_async_event event;
+	unsigned int i;
+
+	/* Read all messages and acknowledge them. */
+	while (!mlx4_glue->get_async_event(priv->ctx, &event)) {
+		const enum ibv_event_type kind = event.event_type;
+
+		if (kind == IBV_EVENT_PORT_ACTIVE ||
+		    kind == IBV_EVENT_PORT_ERR) {
+			if (intr_conf->lsc && !mlx4_link_status_check(priv))
+				++caught[LSC];
+		} else if (kind == IBV_EVENT_DEVICE_FATAL) {
+			if (intr_conf->rmv)
+				++caught[RMV];
+		} else {
+			DEBUG("event type %d on physical port %d not handled",
+			      event.event_type, event.element.port_num);
+		}
+		mlx4_glue->ack_async_event(&event);
+	}
+	for (i = 0; i != RTE_DIM(caught); ++i) {
+		if (!caught[i])
+			continue;
+		_rte_eth_dev_callback_process(priv->dev, type[i], NULL);
+	}
+}
+
+/**
+ * Request a completion queue notification (arm the CQ).
+ *
+ * @param rxq
+ *   Pointer to receive queue structure.
+ * @param solicited
+ *   Nonzero selects the MLX4_CQ_DB_REQ_NOT_SOL command, zero selects
+ *   MLX4_CQ_DB_REQ_NOT.
+ */
+static void
+mlx4_arm_cq(struct rxq *rxq, int solicited)
+{
+	struct mlx4_cq *cq = &rxq->mcq;
+	const uint32_t sn = cq->arm_sn & MLX4_CQ_DB_GEQ_N_MASK;
+	const uint32_t ci = cq->cons_index & MLX4_CQ_DB_CI_MASK;
+	uint32_t cmd;
+	uint64_t db;
+
+	if (solicited)
+		cmd = MLX4_CQ_DB_REQ_NOT_SOL;
+	else
+		cmd = MLX4_CQ_DB_REQ_NOT;
+	/* Update the doorbell record located in host memory first. */
+	*cq->arm_db = rte_cpu_to_be_32(sn << 28 | cmd | ci);
+	/*
+	 * The record above must be written before the doorbell is rung
+	 * through PCI MMIO below.
+	 */
+	rte_wmb();
+	/* Upper half: sequence, command and CQ number; lower half: index. */
+	db = sn << 28 | cmd | cq->cqn;
+	db = (db << 32) | ci;
+	rte_write64(rte_cpu_to_be_64(db), cq->cq_db_reg);
+}
+
+/**
+ * Remove the interrupt handler, cancel the link status alarm and disable
+ * Rx queue interrupts.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   Always 0. The value of rte_errno at entry is restored before returning.
+ */
+int
+mlx4_intr_uninstall(struct priv *priv)
+{
+	/* Saved so that cleanup never clobbers the caller's rte_errno. */
+	const int saved_errno = rte_errno;
+
+	/* A descriptor of -1 means no handler is currently registered. */
+	if (priv->intr_handle.fd != -1) {
+		rte_intr_callback_unregister(&priv->intr_handle,
+					     (void (*)(void *))
+					     mlx4_interrupt_handler,
+					     priv);
+		priv->intr_handle.fd = -1;
+	}
+	rte_eal_alarm_cancel((void (*)(void *))mlx4_link_status_alarm, priv);
+	priv->intr_alarm = 0;
+	mlx4_rxq_intr_disable(priv);
+	rte_errno = saved_errno;
+	return 0;
+}
+
+/**
+ * (Re)install the interrupt handler.
+ *
+ * Any previous installation is removed first. The handler is registered
+ * only when LSC or RMV interrupts are requested in the device
+ * configuration.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_intr_install(struct priv *priv)
+{
+	const struct rte_intr_conf *const conf =
+		&priv->dev->data->dev_conf.intr_conf;
+	int ret;
+
+	/* Start from a clean state. */
+	mlx4_intr_uninstall(priv);
+	/* Nothing to register unless LSC or RMV events are requested. */
+	if (!(conf->lsc | conf->rmv))
+		return 0;
+	priv->intr_handle.fd = priv->ctx->async_fd;
+	ret = rte_intr_callback_register(&priv->intr_handle,
+					 (void (*)(void *))
+					 mlx4_interrupt_handler,
+					 priv);
+	if (ret >= 0)
+		return 0;
+	/* Registration failed: record the error, then roll back. */
+	rte_errno = -ret;
+	mlx4_intr_uninstall(priv);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback for Rx queue interrupt disable.
+ *
+ * Retrieves one completion event from the queue's completion channel and
+ * acknowledges it.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ *   When retrieving the event fails, the errno value reported by the
+ *   underlying call is returned (e.g. EAGAIN when no event is pending).
+ */
+int
+mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct rxq *rxq = dev->data->rx_queues[idx];
+	struct ibv_cq *ev_cq;
+	void *ev_ctx;
+	int ret;
+
+	if (!rxq || !rxq->channel) {
+		ret = EINVAL;
+	} else {
+		ret = mlx4_glue->get_cq_event(rxq->cq->channel, &ev_cq,
+					      &ev_ctx);
+		/*
+		 * On failure keep the real errno instead of flattening it to
+		 * EINVAL: EAGAIN merely means this function was called before
+		 * any event was received. Fall back to EINVAL should errno be
+		 * unset so that a failure is never reported as success.
+		 */
+		if (ret)
+			ret = errno ? errno : EINVAL;
+		else if (ev_cq != rxq->cq)
+			ret = EINVAL;
+	}
+	if (ret) {
+		rte_errno = ret;
+		/* A missing event is not worth a warning. */
+		if (ret != EAGAIN)
+			WARN("unable to disable interrupt on rx queue %d",
+			     idx);
+	} else {
+		/* Event consumed: bump the arm sequence and acknowledge. */
+		rxq->mcq.arm_sn++;
+		mlx4_glue->ack_cq_events(rxq->cq, 1);
+	}
+	return -ret;
+}
+
+/**
+ * DPDK callback for Rx queue interrupt enable.
+ *
+ * Arms the completion queue of the given Rx queue, provided the queue
+ * exists and has a completion channel.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct rxq *rxq = dev->data->rx_queues[idx];
+
+	if (rxq && rxq->channel) {
+		mlx4_arm_cq(rxq, 0);
+		return 0;
+	}
+	/* Missing queue or no completion channel to arm. */
+	rte_errno = EINVAL;
+	WARN("unable to arm interrupt on rx queue %d", idx);
+	return -EINVAL;
+}
+
+/**
+ * Enable datapath interrupts.
+ *
+ * Does nothing unless Rx queue interrupts are requested in the device
+ * configuration.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rxq_intr_enable(struct priv *priv)
+{
+	const struct rte_intr_conf *const conf =
+		&priv->dev->data->dev_conf.intr_conf;
+
+	/* Rx interrupts not requested, nothing to set up. */
+	if (!conf->rxq)
+		return 0;
+	if (mlx4_rx_intr_vec_enable(priv) >= 0)
+		return 0;
+	return -rte_errno;
+}
+
+/**
+ * Disable datapath interrupts, keeping other interrupts intact.
+ *
+ * The value of rte_errno at entry is restored before returning.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+mlx4_rxq_intr_disable(struct priv *priv)
+{
+	/* Preserve the caller's rte_errno across the teardown. */
+	const int saved_errno = rte_errno;
+
+	mlx4_rx_intr_vec_disable(priv);
+	rte_errno = saved_errno;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.c
new file mode 100644
index 00000000..d23d3c61
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.c
@@ -0,0 +1,1181 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Memory management functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+#include <rte_rwlock.h>
+
+#include "mlx4_glue.h"
+#include "mlx4_mr.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+struct mr_find_contig_memsegs_data {
+	uintptr_t addr; /* Address searched for by the walk callback. */
+	uintptr_t start; /* Start of the contiguous area containing addr. */
+	uintptr_t end; /* End (exclusive) of that contiguous area. */
+	const struct rte_memseg_list *msl; /* Memseg list passed to the hit. */
+};
+
+struct mr_update_mp_data {
+	struct rte_eth_dev *dev; /* Ethernet device. */
+	struct mlx4_mr_ctrl *mr_ctrl; /* Per-queue MR control structure. */
+	int ret; /* NOTE(review): presumably a result code; confirm at use. */
+};
+
+/**
+ * Grow the lookup table of a B-tree to a given number of entries.
+ *
+ * Must not be called while holding memory_hotplug_lock or priv->mr.rwlock
+ * because rte_realloc() is used.
+ *
+ * @param bt
+ *   Pointer to B-tree structure.
+ * @param n
+ *   Number of entries for expansion.
+ *
+ * @return
+ *   0 on success (including when the table is already large enough),
+ *   -1 on failure.
+ */
+static int
+mr_btree_expand(struct mlx4_mr_btree *bt, int n)
+{
+	void *tbl;
+
+	/* Already large enough. */
+	if (bt->size >= n)
+		return 0;
+	/*
+	 * Calling rte_realloc() directly has one drawback: SOCKET_ID_ANY is
+	 * used internally when there is no room to expand. This is a rare
+	 * event on a very slow path, hence acceptable. cache_bh[] initially
+	 * gets practically enough space, and once expanded it should never
+	 * need to be expanded again.
+	 */
+	tbl = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0);
+	if (tbl != NULL) {
+		DEBUG("expanded MR B-tree table (size=%u)", n);
+		bt->table = tbl;
+		bt->size = n;
+		return 0;
+	}
+	/* Not an error, B-tree search will be skipped. */
+	WARN("failed to expand MR B-tree (%p) table", (void *)bt);
+	return -1;
+}
+
+/**
+ * Binary-search a B-tree lookup table for the entry covering an address.
+ *
+ * @param bt
+ *   Pointer to B-tree structure.
+ * @param[out] idx
+ *   Receives the index where the search stopped, even on a miss, so that
+ *   it can be used as the position for inserting a new entry.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr)
+{
+	struct mlx4_mr_cache *tbl;
+	uint16_t lo = 0;
+	uint16_t span;
+
+	assert(bt != NULL);
+	tbl = *bt->table;
+	span = bt->len;
+	/* First entry must be NULL for comparison. */
+	assert(bt->len > 0 || (tbl[0].start == 0 &&
+			       tbl[0].lkey == UINT32_MAX));
+	/* Halve the window until a single candidate remains. */
+	do {
+		const uint16_t half = span >> 1;
+
+		if (addr >= tbl[lo + half].start) {
+			lo += half;
+			span -= half;
+		} else {
+			span = half;
+		}
+	} while (span > 1);
+	assert(addr >= tbl[lo].start);
+	*idx = lo;
+	/* A hit also requires the address to lie below the entry's end. */
+	return addr < tbl[lo].end ? tbl[lo].lkey : UINT32_MAX;
+}
+
+/**
+ * Insert an entry to B-tree lookup table, keeping it sorted.
+ *
+ * An entry whose start address is already covered by the table is not
+ * inserted again. When the table is full, the overflow flag of the B-tree
+ * is raised.
+ *
+ * @param bt
+ *   Pointer to B-tree structure.
+ * @param entry
+ *   Pointer to new entry to insert.
+ *
+ * @return
+ *   0 on success (inserted or already present), -1 when the table is full.
+ */
+static int
+mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry)
+{
+	struct mlx4_mr_cache *tbl;
+	uint16_t pos = 0;
+	size_t tail;
+
+	assert(bt != NULL);
+	assert(bt->len <= bt->size);
+	assert(bt->len > 0);
+	tbl = *bt->table;
+	/* Locate the entry right before the insertion point. */
+	if (mr_btree_lookup(bt, &pos, entry->start) != UINT32_MAX) {
+		DEBUG("abort insertion to B-tree(%p): already exist at"
+		      " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+		      (void *)bt, pos, entry->start, entry->end, entry->lkey);
+		/* Nothing to do, the range is known already. */
+		return 0;
+	}
+	/* No room left: flag the overflow and fail. */
+	if (unlikely(bt->len == bt->size)) {
+		bt->overflow = 1;
+		return -1;
+	}
+	/* The new entry goes right after the one found by the lookup. */
+	++pos;
+	tail = (bt->len - pos) * sizeof(struct mlx4_mr_cache);
+	if (tail != 0)
+		memmove(&tbl[pos + 1], &tbl[pos], tail);
+	tbl[pos] = *entry;
+	bt->len++;
+	DEBUG("inserted B-tree(%p)[%u],"
+	      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+	      (void *)bt, pos, entry->start, entry->end, entry->lkey);
+	return 0;
+}
+
+/**
+ * Initialize B-tree and allocate memory for lookup table.
+ *
+ * The table starts with a single sentinel entry whose LKey is UINT32_MAX.
+ *
+ * @param bt
+ *   Pointer to B-tree structure.
+ * @param n
+ *   Number of entries to allocate.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket)
+{
+	if (bt == NULL) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	/* Start from a fully cleared structure. */
+	memset(bt, 0, sizeof(*bt));
+	bt->table = rte_calloc_socket("B-tree table",
+				      n, sizeof(struct mlx4_mr_cache),
+				      0, socket);
+	if (bt->table == NULL) {
+		rte_errno = ENOMEM;
+		ERROR("failed to allocate memory for btree cache on socket %d",
+		      socket);
+		return -rte_errno;
+	}
+	bt->size = n;
+	/* Binary search relies on a NULL first entry. */
+	(*bt->table)[0] = (struct mlx4_mr_cache) {
+		.lkey = UINT32_MAX,
+	};
+	bt->len = 1;
+	DEBUG("initialized B-tree %p with table %p",
+	      (void *)bt, (void *)bt->table);
+	return 0;
+}
+
+/**
+ * Release the lookup table of a B-tree and clear the structure.
+ *
+ * @param bt
+ *   Pointer to B-tree structure. NULL is accepted and ignored.
+ */
+void
+mlx4_mr_btree_free(struct mlx4_mr_btree *bt)
+{
+	if (bt != NULL) {
+		DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table);
+		rte_free(bt->table);
+		/* Leave no dangling table pointer behind. */
+		memset(bt, 0, sizeof(*bt));
+	}
+}
+
+#ifndef NDEBUG
+/**
+ * Print every entry of a B-tree through DEBUG().
+ *
+ * @param bt
+ *   Pointer to B-tree structure. NULL is accepted and ignored.
+ */
+void
+mlx4_mr_btree_dump(struct mlx4_mr_btree *bt)
+{
+	struct mlx4_mr_cache *tbl;
+	int n = 0;
+
+	if (bt == NULL)
+		return;
+	tbl = *bt->table;
+	while (n < bt->len) {
+		struct mlx4_mr_cache *cur = &tbl[n];
+
+		DEBUG("B-tree(%p)[%u],"
+		      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+		      (void *)bt, n, cur->start, cur->end, cur->lkey);
+		++n;
+	}
+}
+#endif
+
+/**
+ * Find virtually contiguous memory chunk in a given MR.
+ *
+ * The memseg bitmap of the MR is scanned from @p base_idx. The first run of
+ * consecutive set bits encountered forms the chunk.
+ *
+ * @param mr
+ *   Pointer to MR structure.
+ * @param[out] entry
+ *   Pointer to returning MR cache entry. If not found, this will not be
+ *   updated.
+ * @param base_idx
+ *   Start index of the memseg bitmap.
+ *
+ * @return
+ *   Next index to go on lookup.
+ */
+static int
+mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry,
+		   int base_idx)
+{
+	uintptr_t start = 0;
+	uintptr_t end = 0;
+	uint32_t idx = 0;
+
+	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
+		if (rte_bitmap_get(mr->ms_bmp, idx)) {
+			const struct rte_memseg_list *msl;
+			const struct rte_memseg *ms;
+
+			msl = mr->msl;
+			ms = rte_fbarray_get(&msl->memseg_arr,
+					     mr->ms_base_idx + idx);
+			assert(msl->page_sz == ms->hugepage_sz);
+			/* First set bit opens the chunk. */
+			if (!start)
+				start = ms->addr_64;
+			/* Each further set bit extends it by one page. */
+			end = ms->addr_64 + ms->hugepage_sz;
+		} else if (start) {
+			/* Passed the end of a fragment. */
+			break;
+		}
+	}
+	if (start) {
+		/* Found one chunk. */
+		entry->start = start;
+		entry->end = end;
+		entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
+	}
+	return idx;
+}
+
+/**
+ * Insert every contiguous chunk of a MR to the global B-tree cache.
+ *
+ * Insertion may fail when the table is full. In that case the MR has to be
+ * searched by mr_lookup_dev_list() in mlx4_mr_create() on miss.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mr
+ *   Pointer to MR to insert.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static int
+mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr)
+{
+	struct priv *priv = dev->data->dev_private;
+	unsigned int pos = 0;
+
+	DEBUG("port %u inserting MR(%p) to global cache",
+	      dev->data->port_id, (void *)mr);
+	while (pos < mr->ms_bmp_n) {
+		struct mlx4_mr_cache chunk = { 0, };
+
+		/* Fetch the next contiguous chunk and move past it. */
+		pos = mr_find_next_chunk(mr, &chunk, pos);
+		/* An untouched entry means no chunk is left. */
+		if (chunk.end == 0)
+			break;
+		/*
+		 * On overflow the global table cannot be expanded here
+		 * because of deadlock, so give up.
+		 */
+		if (mr_btree_insert(&priv->mr.cache, &chunk) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+/**
+ * Search the original global MR list for the chunk covering an address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] entry
+ *   Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Found MR on match, NULL otherwise.
+ */
+static struct mlx4_mr *
+mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+		   uintptr_t addr)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr *mr;
+
+	/* Walk through every registered MR. */
+	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+		unsigned int pos = 0;
+
+		/* MRs without any memseg left cannot match. */
+		if (mr->ms_n == 0)
+			continue;
+		while (pos < mr->ms_bmp_n) {
+			struct mlx4_mr_cache chunk = { 0, };
+
+			pos = mr_find_next_chunk(mr, &chunk, pos);
+			if (addr < chunk.start || addr >= chunk.end)
+				continue;
+			/* Address lies within this chunk. */
+			*entry = chunk;
+			return mr;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * Look up address on device, through the global cache when it is complete
+ * or through the MR list otherwise.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] entry
+ *   Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on failure.
+ */
+static uint32_t
+mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+	      uintptr_t addr)
+{
+	struct priv *priv = dev->data->dev_private;
+	uint32_t lkey = UINT32_MAX;
+
+	/*
+	 * Once the global cache has overflowed (its B-tree table could not
+	 * be expanded), it no longer holds all the existing MRs, so the
+	 * original MR list must be traversed instead, which is a very slow
+	 * path. Otherwise, the global cache is all inclusive.
+	 */
+	if (unlikely(priv->mr.cache.overflow)) {
+		/* Falling back to the slowest path. */
+		if (mr_lookup_dev_list(dev, entry, addr) != NULL)
+			lkey = entry->lkey;
+	} else {
+		uint16_t idx;
+
+		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+		if (lkey != UINT32_MAX)
+			*entry = (*priv->mr.cache.table)[idx];
+	}
+	assert(lkey == UINT32_MAX || (addr >= entry->start &&
+				      addr < entry->end));
+	return lkey;
+}
+
+/**
+ * Release a MR along with its verbs MR and memseg bitmap.
+ *
+ * The MR lock must not be held to avoid a deadlock: rte_free() can raise a
+ * memory free event and the callback function will spin on the lock.
+ *
+ * @param mr
+ *   Pointer to MR to free. NULL is accepted and ignored.
+ */
+static void
+mr_free(struct mlx4_mr *mr)
+{
+	if (mr != NULL) {
+		DEBUG("freeing MR(%p):", (void *)mr);
+		/* Deregister from verbs first, if ever registered. */
+		if (mr->ibv_mr != NULL)
+			claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr));
+		if (mr->ms_bmp != NULL)
+			rte_bitmap_free(mr->ms_bmp);
+		rte_free(mr);
+	}
+}
+
+/**
+ * Release resources of detached MRs having no online entry.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+static void
+mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
+	struct mlx4_mr *mr;
+	struct mlx4_mr *next;
+
+	/*
+	 * MRs cannot be freed while holding the lock since rte_free() could
+	 * invoke the memory free callback, which would deadlock. Detach the
+	 * whole free list under the lock and release it afterwards.
+	 */
+	rte_rwlock_write_lock(&priv->mr.rwlock);
+	free_list = priv->mr.mr_free_list;
+	LIST_INIT(&priv->mr.mr_free_list);
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	/* Fetch the successor before each MR is destroyed. */
+	for (mr = LIST_FIRST(&free_list); mr != NULL; mr = next) {
+		next = LIST_NEXT(mr, mr);
+		mr_free(mr);
+	}
+}
+
+/*
+ * rte_memseg_contig_walk() callback used by mlx4_mr_create(): stop on the
+ * contiguous area containing the searched address and record its bounds.
+ */
+static int
+mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
+			  const struct rte_memseg *ms, size_t len, void *arg)
+{
+	struct mr_find_contig_memsegs_data *ctx = arg;
+	const uint64_t lo = ms->addr_64;
+	const uint64_t hi = ms->addr_64 + len;
+
+	if (ctx->addr >= lo && ctx->addr < hi) {
+		/* Found, save it and stop walking. */
+		ctx->start = lo;
+		ctx->end = hi;
+		ctx->msl = msl;
+		return 1;
+	}
+	/* Not in this area, keep walking. */
+	return 0;
+}
+
+/**
+ * Create a new global Memory Region (MR) for a missing virtual address.
+ * Register entire virtually contiguous memory chunk around the address.
+ *
+ * Resources are allocated without any lock held, then the chunk is
+ * re-validated and registered while holding memory_hotplug_lock (read) and
+ * priv->mr.rwlock (write), taken in that order. If the chunk has changed in
+ * the meantime, the whole procedure restarts with a single page.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] entry
+ *   Pointer to returning MR cache entry, found in the global cache or newly
+ *   created. If failed to create one, this will not be updated.
+ * @param addr
+ *   Target virtual address to register.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
+ */
+static uint32_t
+mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+	       uintptr_t addr)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	const struct rte_memseg_list *msl;
+	const struct rte_memseg *ms;
+	struct mlx4_mr *mr = NULL;
+	size_t len;
+	uint32_t ms_n;
+	uint32_t bmp_size;
+	void *bmp_mem;
+	int ms_idx_shift = -1;
+	unsigned int n;
+	struct mr_find_contig_memsegs_data data = {
+		.addr = addr,
+	};
+	struct mr_find_contig_memsegs_data data_re;
+
+	DEBUG("port %u creating a MR using address (%p)",
+	      dev->data->port_id, (void *)addr);
+	/*
+	 * Release detached MRs if any. This can't be called with holding either
+	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * been detached by the memory free event but it couldn't be released
+	 * inside the callback due to deadlock. As a result, releasing resources
+	 * is quite opportunistic.
+	 */
+	mlx4_mr_garbage_collect(dev);
+	/*
+	 * Find out a contiguous virtual address chunk in use, to which the
+	 * given address belongs, in order to register maximum range. In the
+	 * best case where mempools are not dynamically recreated and
+	 * '--socket-mem' is specified as an EAL option, it is very likely to
+	 * have only one MR(LKey) per a socket and per a hugepage-size even
+	 * though the system memory is highly fragmented.
+	 */
+	if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
+		WARN("port %u unable to find virtually contiguous"
+		     " chunk for address (%p)."
+		     " rte_memseg_contig_walk() failed.",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = ENXIO;
+		goto err_nolock;
+	}
+alloc_resources:
+	/* Addresses must be page-aligned. */
+	assert(rte_is_aligned((void *)data.start, data.msl->page_sz));
+	assert(rte_is_aligned((void *)data.end, data.msl->page_sz));
+	msl = data.msl;
+	ms = rte_mem_virt2memseg((void *)data.start, msl);
+	len = data.end - data.start;
+	assert(msl->page_sz == ms->hugepage_sz);
+	/* Number of memsegs in the range. */
+	ms_n = len / msl->page_sz;
+	DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+	      " page_sz=0x%" PRIx64 ", ms_n=%u",
+	      dev->data->port_id, (void *)addr,
+	      data.start, data.end, msl->page_sz, ms_n);
+	/* Size of memory for bitmap. */
+	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
+	mr = rte_zmalloc_socket(NULL,
+				RTE_ALIGN_CEIL(sizeof(*mr),
+					       RTE_CACHE_LINE_SIZE) +
+				bmp_size,
+				RTE_CACHE_LINE_SIZE, msl->socket_id);
+	if (mr == NULL) {
+		WARN("port %u unable to allocate memory for a new MR of"
+		     " address (%p).",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = ENOMEM;
+		goto err_nolock;
+	}
+	mr->msl = msl;
+	/*
+	 * Save the index of the first memseg and initialize memseg bitmap. To
+	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
+	 *	rte_bitmap_get(mr->ms_bmp, ms_idx - mr->ms_base_idx)
+	 * The bitmap storage is the cache-line aligned area located right
+	 * after the MR structure inside the same allocation.
+	 */
+	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
+	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
+	if (mr->ms_bmp == NULL) {
+		WARN("port %u unable to initialize bitamp for a new MR of"
+		     " address (%p).",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = EINVAL;
+		goto err_nolock;
+	}
+	/*
+	 * Should recheck whether the extended contiguous chunk is still valid.
+	 * Because memory_hotplug_lock can't be held if there's any memory
+	 * related calls in a critical path, resource allocation above can't be
+	 * locked. If the memory has been changed at this point, try again with
+	 * just single page. If not, go on with the big chunk atomically from
+	 * here.
+	 */
+	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+	data_re = data;
+	if (len > msl->page_sz &&
+	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
+		WARN("port %u unable to find virtually contiguous"
+		     " chunk for address (%p)."
+		     " rte_memseg_contig_walk() failed.",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = ENXIO;
+		goto err_memlock;
+	}
+	if (data.start != data_re.start || data.end != data_re.end) {
+		/*
+		 * The extended contiguous chunk has been changed. Try again
+		 * with single memseg instead. The MR allocated above is
+		 * released first, after dropping the memory lock.
+		 */
+		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
+		data.end = data.start + msl->page_sz;
+		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+		mr_free(mr);
+		goto alloc_resources;
+	}
+	assert(data.msl == data_re.msl);
+	rte_rwlock_write_lock(&priv->mr.rwlock);
+	/*
+	 * Check the address is really missing. If other thread already created
+	 * one or it is not found due to overflow, abort and return.
+	 */
+	if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+		/*
+		 * Insert to the global cache table. It may fail due to
+		 * low-on-memory. Then, this entry will have to be searched
+		 * here again.
+		 */
+		mr_btree_insert(&priv->mr.cache, entry);
+		DEBUG("port %u found MR for %p on final lookup, abort",
+		      dev->data->port_id, (void *)addr);
+		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+		/*
+		 * Must be unlocked before calling rte_free() because
+		 * mlx4_mr_mem_event_free_cb() can be called inside.
+		 */
+		mr_free(mr);
+		return entry->lkey;
+	}
+	/*
+	 * Trim start and end addresses for verbs MR. Set bits for registering
+	 * memsegs but exclude already registered ones. Bitmap can be
+	 * fragmented.
+	 */
+	for (n = 0; n < ms_n; ++n) {
+		uintptr_t start;
+		struct mlx4_mr_cache ret = { 0, };
+
+		start = data_re.start + n * msl->page_sz;
+		/* Exclude memsegs already registered by other MRs. */
+		if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+			/*
+			 * Start from the first unregistered memseg in the
+			 * extended range.
+			 */
+			if (ms_idx_shift == -1) {
+				mr->ms_base_idx += n;
+				data.start = start;
+				ms_idx_shift = n;
+			}
+			data.end = start + msl->page_sz;
+			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
+			++mr->ms_n;
+		}
+	}
+	len = data.end - data.start;
+	mr->ms_bmp_n = len / msl->page_sz;
+	assert(ms_idx_shift + mr->ms_bmp_n <= ms_n);
+	/*
+	 * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
+	 * called with holding the memory lock because it doesn't use
+	 * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket()
+	 * through mlx4_alloc_verbs_buf().
+	 */
+	mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len,
+				       IBV_ACCESS_LOCAL_WRITE);
+	if (mr->ibv_mr == NULL) {
+		WARN("port %u fail to create a verbs MR for address (%p)",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = EINVAL;
+		goto err_mrlock;
+	}
+	assert((uintptr_t)mr->ibv_mr->addr == data.start);
+	assert(mr->ibv_mr->length == len);
+	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	DEBUG("port %u MR CREATED (%p) for %p:\n"
+	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
+	      dev->data->port_id, (void *)mr, (void *)addr,
+	      data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
+	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
+	/* Insert to the global cache table. */
+	mr_insert_dev_cache(dev, mr);
+	/* Fill in output data. */
+	mr_lookup_dev(dev, entry, addr);
+	/* Lookup can't fail. */
+	assert(entry->lkey != UINT32_MAX);
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+	return entry->lkey;
+err_mrlock:
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+err_memlock:
+	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+err_nolock:
+	/*
+	 * In case of error, as this can be called in a datapath, a warning
+	 * message per an error is preferable instead. Must be unlocked before
+	 * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called
+	 * inside. mr may still be NULL here, which mr_free() tolerates.
+	 */
+	mr_free(mr);
+	return UINT32_MAX;
+}
+
+/**
+ * Rebuild the global B-tree cache of device from the original MR list.
+ *
+ * The cache is reset to its sentinel entry, then refilled with every MR of
+ * the list. Refilling stops at the first insertion failure.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+static void
+mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr *mr;
+
+	DEBUG("port %u rebuild dev cache[]", dev->data->port_id);
+	/* Drop everything but the first entry and clear overflow. */
+	priv->mr.cache.len = 1;
+	priv->mr.cache.overflow = 0;
+	/* Re-insert all the existing MRs. */
+	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+		if (mr_insert_dev_cache(dev, mr) < 0)
+			break;
+	}
+}
+
+/**
+ * Callback for memory free event. Iterate freed memsegs and check whether it
+ * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a
+ * result, the MR would be fragmented. If it becomes empty, the MR will be freed
+ * later by mlx4_mr_garbage_collect().
+ *
+ * The global cache must be rebuilt if there's any change and this event has to
+ * be propagated to dataplane threads to flush the local caches.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param addr
+ *   Address of freed memory.
+ * @param len
+ *   Size of freed memory.
+ */
+static void
+mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+{
+	struct priv *priv = dev->data->dev_private;
+	const struct rte_memseg_list *msl;
+	struct mlx4_mr *mr;
+	int ms_n;
+	int i;
+	int rebuild = 0;
+
+	DEBUG("port %u free callback: addr=%p, len=%zu",
+	      dev->data->port_id, addr, len);
+	msl = rte_mem_virt2memseg_list(addr);
+	/* addr and len must be page-aligned. */
+	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
+	assert(len == RTE_ALIGN(len, msl->page_sz));
+	ms_n = len / msl->page_sz;
+	rte_rwlock_write_lock(&priv->mr.rwlock);
+	/* Clear bits of freed memsegs from MR. */
+	for (i = 0; i < ms_n; ++i) {
+		const struct rte_memseg *ms;
+		struct mlx4_mr_cache entry;
+		uintptr_t start;
+		int ms_idx;
+		uint32_t pos;
+
+		/* Find MR having this memseg. */
+		start = (uintptr_t)addr + i * msl->page_sz;
+		mr = mr_lookup_dev_list(dev, &entry, start);
+		if (mr == NULL)
+			continue;
+		ms = rte_mem_virt2memseg((void *)start, msl);
+		assert(ms != NULL);
+		assert(msl->page_sz == ms->hugepage_sz);
+		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+		pos = ms_idx - mr->ms_base_idx;
+		/* Validate the position before using it on the bitmap. */
+		assert(pos < mr->ms_bmp_n);
+		assert(rte_bitmap_get(mr->ms_bmp, pos));
+		DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
+		      dev->data->port_id, (void *)mr, pos, (void *)start);
+		rte_bitmap_clear(mr->ms_bmp, pos);
+		if (--mr->ms_n == 0) {
+			/* Last memseg gone: move the MR to the free list. */
+			LIST_REMOVE(mr, mr);
+			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			DEBUG("port %u remove MR(%p) from list",
+			      dev->data->port_id, (void *)mr);
+		}
+		/*
+		 * MR is fragmented or will be freed. the global cache must be
+		 * rebuilt.
+		 */
+		rebuild = 1;
+	}
+	if (rebuild) {
+		mr_rebuild_dev_cache(dev);
+		/*
+		 * Flush local caches by propagating invalidation across cores.
+		 * rte_smp_wmb() is enough to synchronize this event. If one of
+		 * freed memsegs is seen by other core, that means the memseg
+		 * has been allocated by allocator, which will come after this
+		 * free call. Therefore, this store instruction (incrementing
+		 * generation below) will be guaranteed to be seen by other core
+		 * before the core sees the newly allocated memory.
+		 */
+		++priv->mr.dev_gen;
+		DEBUG("broadcasting local cache flush, gen=%d",
+		      priv->mr.dev_gen);
+		rte_smp_wmb();
+	}
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+#ifndef NDEBUG
+	if (rebuild)
+		mlx4_mr_dump_dev(dev);
+#endif
+}
+
+/**
+ * Callback for memory event.
+ *
+ * Only RTE_MEM_EVENT_FREE is acted upon: it is forwarded to every device
+ * found in mlx4_mem_event_cb_list. Any other event is ignored.
+ *
+ * @param event_type
+ *   Memory event type.
+ * @param addr
+ *   Address of memory.
+ * @param len
+ *   Size of memory.
+ * @param arg
+ *   Unused.
+ */
+void
+mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+		     size_t len, void *arg __rte_unused)
+{
+	struct priv *priv;
+
+	/* Allocation and any other event require no action. */
+	if (event_type != RTE_MEM_EVENT_FREE)
+		return;
+	rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+	/* Notify each registered mlx4 device. */
+	LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb) {
+		mlx4_mr_mem_event_free_cb(priv->dev, addr, len);
+	}
+	rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+}
+
+/**
+ * Resolve an address through the global MR cache table, creating a new MR
+ * on a miss, and record the result in the local bottom-half cache table.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mr_ctrl
+ *   Pointer to per-queue MR control structure.
+ * @param[out] entry
+ *   Pointer to returning MR cache entry, found in the global cache or newly
+ *   created. If failed to create one, this is not written.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+		   struct mlx4_mr_cache *entry, uintptr_t addr)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr_btree *local = &mr_ctrl->cache_bh;
+	uint32_t lkey;
+	uint16_t idx;
+
+	/* Make room in the local table beforehand by doubling it if full. */
+	if (unlikely(local->len == local->size))
+		mr_btree_expand(local, local->size << 1);
+	/* Query the global cache; copy a hit while still under the lock. */
+	rte_rwlock_read_lock(&priv->mr.rwlock);
+	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (lkey != UINT32_MAX)
+		*entry = (*priv->mr.cache.table)[idx];
+	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	/* First time to see the address? Create a new MR. */
+	if (lkey == UINT32_MAX)
+		lkey = mlx4_mr_create(dev, entry, addr);
+	/*
+	 * Update the local cache with whatever was found or created. A
+	 * failed insertion is harmless: the entry is still returned and can
+	 * be found in the global cache next time. When no MR could be
+	 * created, there is no action to take in this datapath code; the
+	 * returned LKey is invalid and will eventually make HW fail.
+	 */
+	if (lkey != UINT32_MAX)
+		mr_btree_insert(local, entry);
+	return lkey;
+}
+
+/**
+ * Bottom-half of LKey search on datapath.
+ *
+ * cache_bh[] is searched first. On a miss, the global MR cache table is
+ * searched and the per-queue local caches are updated with the new entry.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mr_ctrl
+ *   Pointer to per-queue MR control structure.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+		   uintptr_t addr)
+{
+	/* Top-half cache slot about to be overwritten. */
+	struct mlx4_mr_cache *victim = &mr_ctrl->cache[mr_ctrl->head];
+	uint16_t bh_idx = 0;
+	uint32_t lkey;
+
+	/* Binary-search MR translation table. */
+	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
+	if (unlikely(lkey == UINT32_MAX)) {
+		/*
+		 * Local miss: go through the global cache, which fills the
+		 * victim slot and, if possible, cache_bh[] as well.
+		 */
+		lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, victim, addr);
+		if (unlikely(lkey == UINT32_MAX))
+			return UINT32_MAX;
+	} else {
+		/* Local hit: refresh the top-half cache. */
+		*victim = (*mr_ctrl->cache_bh.table)[bh_idx];
+	}
+	/* The slot just written becomes the most recently used one. */
+	mr_ctrl->mru = mr_ctrl->head;
+	/* Advance to the next victim, the oldest. */
+	mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N;
+	return lkey;
+}
+
+/**
+ * Bottom-half of LKey search on Rx. Logs the top-half miss, then runs the
+ * common bottom-half lookup with the MR control structure of the queue.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
+{
+	struct mlx4_mr_ctrl *ctrl = &rxq->mr_ctrl;
+
+	DEBUG("Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+	      rxq->stats.idx, ctrl->mru, ctrl->head, (void *)addr);
+	return mlx4_mr_addr2mr_bh(rxq->priv->dev, ctrl, addr);
+}
+
+/**
+ * Bottom-half of LKey search on Tx. Logs the top-half miss, then runs the
+ * common bottom-half lookup with the MR control structure of the queue.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
+{
+	struct mlx4_mr_ctrl *ctrl = &txq->mr_ctrl;
+
+	DEBUG("Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+	      txq->stats.idx, ctrl->mru, ctrl->head, (void *)addr);
+	return mlx4_mr_addr2mr_bh(txq->priv->dev, ctrl, addr);
+}
+
+/**
+ * Flush all of the local cache entries and resync the generation number.
+ *
+ * @param mr_ctrl
+ *   Pointer to per-queue MR control structure.
+ */
+void
+mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl)
+{
+	/* Reset the most-recently-used index. */
+	mr_ctrl->mru = 0;
+	/* Reset the linear search array and its replacement index. */
+	mr_ctrl->head = 0;
+	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
+	/* Reset the B-tree table, keeping only its first (NULL) entry. */
+	mr_ctrl->cache_bh.len = 1;
+	mr_ctrl->cache_bh.overflow = 0;
+	/* Update the generation number (unsigned, hence %u below). */
+	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
+	DEBUG("mr_ctrl(%p): flushed, cur_gen=%u",
+	      (void *)mr_ctrl, mr_ctrl->cur_gen);
+}
+
+/*
+ * rte_mempool_mem_iter() callback of mlx4_mr_update_mp(), run for each chunk.
+ */
+static void
+mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+		     struct rte_mempool_memhdr *memhdr,
+		     unsigned mem_idx __rte_unused)
+{
+	struct mr_update_mp_data *ctx = opaque;
+
+	/* Stop doing any work once a previous chunk has failed. */
+	if (ctx->ret < 0)
+		return;
+	/* Register the chunk address, update local caches, latch failure. */
+	if (mlx4_mr_addr2mr_bh(ctx->dev, ctx->mr_ctrl,
+			       (uintptr_t)memhdr->addr) == UINT32_MAX)
+		ctx->ret = -1;
+}
+
+/**
+ * Register every memory chunk of a Mempool, giving up at the first failure.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mr_ctrl
+ *   Pointer to per-queue MR control structure.
+ * @param mp
+ *   Pointer to registering Mempool.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+int
+mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+		  struct rte_mempool *mp)
+{
+	struct mr_update_mp_data ctx = {
+		.ret = 0,
+		.mr_ctrl = mr_ctrl,
+		.dev = dev,
+	};
+
+	rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &ctx);
+	return ctx.ret;
+}
+
+#ifndef NDEBUG
+/**
+ * Dump all created MRs and global cache entries, under the MR read lock.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx4_mr_dump_dev(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr *mr;
+	unsigned int mr_n = 0;
+	unsigned int chunk_n = 0;
+
+	rte_rwlock_read_lock(&priv->mr.rwlock);
+	/* Iterate all the existing MRs; counters match the %u conversions. */
+	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+		unsigned int n;
+
+		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %d, ms_bmp_n = %u",
+		      dev->data->port_id, mr_n++,
+		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
+		      mr->ms_n, mr->ms_bmp_n);
+		if (mr->ms_n == 0)
+			continue;
+		for (n = 0; n < mr->ms_bmp_n; ) {
+			struct mlx4_mr_cache ret = { 0, };
+
+			n = mr_find_next_chunk(mr, &ret, n);
+			if (!ret.end)
+				break;
+			DEBUG("  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
+			      chunk_n++, ret.start, ret.end);
+		}
+	}
+	DEBUG("port %u dumping global cache", dev->data->port_id);
+	mlx4_mr_btree_dump(&priv->mr.cache);
+	rte_rwlock_read_unlock(&priv->mr.rwlock);
+}
+#endif
+
+/**
+ * Release all the created MRs and resources. Remove device from memory callback
+ * list.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx4_mr_release(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4_mr *mr;
+
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+#ifndef NDEBUG
+	mlx4_mr_dump_dev(dev);
+#endif
+	rte_rwlock_write_lock(&priv->mr.rwlock);
+	/*
+	 * Detach from MR list and move to free list. The list head is read
+	 * only once the write lock is held so that it cannot be stale.
+	 */
+	while ((mr = LIST_FIRST(&priv->mr.mr_list)) != NULL) {
+		LIST_REMOVE(mr, mr);
+		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	}
+	LIST_INIT(&priv->mr.mr_list);
+	/* Free global cache. */
+	mlx4_mr_btree_free(&priv->mr.cache);
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	/* Free all remaining MRs. */
+	mlx4_mr_garbage_collect(dev);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.h
new file mode 100644
index 00000000..37a365a8
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_mr.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX4_MR_H_
+#define RTE_PMD_MLX4_MR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_eal_memconfig.h>
+#include <rte_ethdev.h>
+#include <rte_rwlock.h>
+#include <rte_bitmap.h>
+
+/* Number of entries in the per-queue top-half MR cache (linear search). */
+#define MLX4_MR_CACHE_N 8
+
+/* Size, in entries, of MR cache table for binary search. */
+#define MLX4_MR_BTREE_CACHE_N 256
+
+/* Memory Region object, linked through its LIST_ENTRY field "mr". */
+struct mlx4_mr {
+	LIST_ENTRY(mlx4_mr) mr; /* Pointer to the prev/next entry. */
+	struct ibv_mr *ibv_mr; /* Verbs Memory Region. */
+	const struct rte_memseg_list *msl; /* List ms_base_idx refers to. */
+	int ms_base_idx; /* Start index of msl->memseg_arr[]. */
+	int ms_n; /* Number of memsegs in use. */
+	uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */
+	struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonging to MR. */
+};
+
+/* Cache entry for Memory Region, matching addresses in [start, end). */
+struct mlx4_mr_cache {
+	uintptr_t start; /* Start address of MR (inclusive). */
+	uintptr_t end; /* End address of MR (exclusive). */
+	uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */
+} __rte_packed;
+
+/* MR Cache table for Binary search. */
+struct mlx4_mr_btree {
+	uint16_t len; /* Number of entries in use (first one is reserved). */
+	uint16_t size; /* Total number of entries the table can hold. */
+	int overflow; /* Mark failure of table expansion. */
+	struct mlx4_mr_cache (*table)[]; /* Pointer to the entry array. */
+} __rte_packed;
+
+/* Per-queue MR control descriptor (top-half array + bottom-half table). */
+struct mlx4_mr_ctrl {
+	uint32_t *dev_gen_ptr; /* Generation number of device to poll. */
+	uint32_t cur_gen; /* Generation number saved to flush caches. */
+	uint16_t mru; /* Index of last hit entry in top-half cache. */
+	uint16_t head; /* Index of the oldest entry (next one replaced). */
+	struct mlx4_mr_cache cache[MLX4_MR_CACHE_N]; /* Cache for top-half. */
+	struct mlx4_mr_btree cache_bh; /* Cache for bottom-half. */
+} __rte_packed;
+
+extern struct mlx4_dev_list  mlx4_mem_event_cb_list;
+extern rte_rwlock_t mlx4_mem_event_rwlock;
+
+/* Entry count minus the first entry, which must be NULL for comparison. */
+#define mlx4_mr_btree_len(bt) ((bt)->len - 1)
+
+int mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket);
+void mlx4_mr_btree_free(struct mlx4_mr_btree *bt);
+void mlx4_mr_btree_dump(struct mlx4_mr_btree *bt);
+void mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+			  size_t len, void *arg);
+int mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+		      struct rte_mempool *mp);
+void mlx4_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx4_mr_release(struct rte_eth_dev *dev);
+
+/**
+ * Look up LKey by linear search: try the last-hit entry, then scan the used
+ * entries (a zero start address ends them), updating the last-hit index.
+ *
+ * @param lkp_tbl
+ *   Pointer to lookup table.
+ * @param[in,out] cached_idx
+ *   Pointer to last-hit index.
+ * @param n
+ *   Size of lookup table.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_mr_lookup_cache(struct mlx4_mr_cache *lkp_tbl, uint16_t *cached_idx,
+		     uint16_t n, uintptr_t addr)
+{
+	const struct mlx4_mr_cache *ent = &lkp_tbl[*cached_idx];
+	uint16_t i;
+
+	if (likely(addr >= ent->start && addr < ent->end))
+		return ent->lkey;
+	for (i = 0; i != n; ++i) {
+		ent = &lkp_tbl[i];
+		if (ent->start == 0)
+			break;
+		if (addr < ent->start || addr >= ent->end)
+			continue;
+		*cached_idx = i;
+		return ent->lkey;
+	}
+	return UINT32_MAX;
+}
+
+#endif /* RTE_PMD_MLX4_MR_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_prm.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 00000000..aef77ba0
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef MLX4_PRM_H_
+#define MLX4_PRM_H_
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+#include "mlx4_autoconf.h"
+
+/* ConnectX-3 Tx queue basic block (TXBB): 1 << 6 = 64 bytes. */
+#define MLX4_TXBB_SHIFT 6
+#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT)
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes. */
+#define MLX4_MAX_SGE 32
+#define MLX4_MAX_WQE_SIZE \
+	(MLX4_MAX_SGE * sizeof(struct mlx4_wqe_data_seg) + \
+	 sizeof(struct mlx4_wqe_ctrl_seg))
+#define MLX4_SEG_SHIFT 4
+
+/* Send queue stamping/invalidating information (64 B stride = 16 dwords). */
+#define MLX4_SQ_STAMP_STRIDE 64
+#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4)
+#define MLX4_SQ_OWNER_BIT 31
+#define MLX4_SQ_STAMP_VAL 0x7fffffff
+
+/* Work queue element (WQE) control flags, one bit each. */
+#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
+#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
+
+/* CQE checksum flags (shifted as unsigned, then cast, so bit 31 is fine). */
+enum {
+	MLX4_CQE_L2_TUNNEL_IPV4 = (int)(1u << 25),
+	MLX4_CQE_L2_TUNNEL_L4_CSUM = (int)(1u << 26),
+	MLX4_CQE_L2_TUNNEL = (int)(1u << 27),
+	MLX4_CQE_L2_VLAN_MASK = (int)(3u << 29),
+	MLX4_CQE_L2_TUNNEL_IPOK = (int)(1u << 31),
+};
+
+/* CQE status flags; the PTYPE mask below leaves MLX4_CQE_STATUS_IPV6F out. */
+#define MLX4_CQE_STATUS_IPV6F (1 << 12)
+#define MLX4_CQE_STATUS_IPV4 (1 << 22)
+#define MLX4_CQE_STATUS_IPV4F (1 << 23)
+#define MLX4_CQE_STATUS_IPV6 (1 << 24)
+#define MLX4_CQE_STATUS_IPV4OPT (1 << 25)
+#define MLX4_CQE_STATUS_TCP (1 << 26)
+#define MLX4_CQE_STATUS_UDP (1 << 27)
+#define MLX4_CQE_STATUS_PTYPE_MASK \
+	(MLX4_CQE_STATUS_IPV4 | \
+	 MLX4_CQE_STATUS_IPV4F | \
+	 MLX4_CQE_STATUS_IPV6 | \
+	 MLX4_CQE_STATUS_IPV4OPT | \
+	 MLX4_CQE_STATUS_TCP | \
+	 MLX4_CQE_STATUS_UDP)
+
+/* Send queue (SQ) information. */
+struct mlx4_sq {
+	volatile uint8_t *buf; /**< SQ buffer. */
+	volatile uint8_t *eob; /**< End of SQ buffer. */
+	uint32_t size; /**< SQ size includes headroom. */
+	uint32_t remain_size; /**< Remaining WQE room in SQ (bytes). */
+	uint32_t owner_opcode;
+	/**< Default owner opcode with HW valid owner bit. */
+	uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */
+	volatile uint32_t *db; /**< Pointer to the doorbell. */
+	uint32_t doorbell_qpn; /**< QP number to write to the doorbell. */
+};
+
+/* Completion queue events, numbers and masks. */
+#define MLX4_CQ_DB_GEQ_N_MASK 0x3
+#define MLX4_CQ_DOORBELL 0x20
+#define MLX4_CQ_DB_CI_MASK 0xffffff
+
+/* Completion queue (CQ) information. */
+struct mlx4_cq {
+	volatile void *cq_uar; /**< CQ user access region. */
+	volatile void *cq_db_reg; /**< CQ doorbell register. */
+	volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
+	volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
+	volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
+	uint32_t cqe_cnt; /**< Number of entries in the queue. */
+	uint32_t cqe_64:1; /**< Set when CQ entry size is 64 bytes. */
+	uint32_t cons_index; /**< Last queue entry that was handled. */
+	uint32_t cqn; /**< CQ number. */
+	int arm_sn; /**< Rx event counter. */
+};
+
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here for backward compatibility with rdma-core v17 and below.
+ * A similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
+/**
+ * Retrieve a CQE entry from a CQ.
+ *
+ * cqe = cq->buf + (index & (cqe_cnt - 1)) * cqe_size + cqe_offset
+ *
+ * Where cqe_size is 32 or 64 bytes and cqe_offset is 0 or 32 (depending on
+ * cqe_size), both being derived from the cqe_64 bit.
+ *
+ * @param cq
+ *   CQ to retrieve entry from.
+ * @param index
+ *   Entry index, wrapped with (cqe_cnt - 1) used as a mask.
+ *
+ * @return
+ *   Pointer to CQE entry.
+ */
+static inline volatile struct mlx4_cqe *
+mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
+{
+	volatile uint8_t *cqe = cq->buf;
+
+	cqe += (index & (cq->cqe_cnt - 1)) << (5 + cq->cqe_64);
+	return (volatile struct mlx4_cqe *)(cqe + (cq->cqe_64 << 5));
+}
+
+/**
+ * Transpose a flag in a value, scaling it from one bit position to another.
+ *
+ * @param val
+ *   Input value.
+ * @param from
+ *   Flag to retrieve from input value.
+ * @param to
+ *   Flag to set in output value.
+ *
+ * @return
+ *   Output value with transposed flag enabled if present on input.
+ */
+static inline uint64_t
+mlx4_transpose(uint64_t val, uint64_t from, uint64_t to)
+{
+	const uint64_t flag = val & from;
+
+	return from >= to ? flag / (from / to) : flag * (to / from);
+}
+
+#endif /* MLX4_PRM_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxq.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxq.c
new file mode 100644
index 00000000..9737da2e
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxq.c
@@ -0,0 +1,936 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Rx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev_driver.h>
+#include <rte_flow.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_glue.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Historical RSS hash key (the 40 bytes listed below).
+ *
+ * This used to be the default for mlx4 in Linux before v3.19 switched to
+ * generating random hash keys through netdev_rss_key_fill().
+ *
+ * It is used in this PMD for consistency with past DPDK releases but can
+ * now be overridden through user configuration.
+ *
+ * Note: this is not const to work around API quirks.
+ */
+uint8_t
+mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
+	0x2c, 0xc6, 0x81, 0xd1,
+	0x5b, 0xdb, 0xf4, 0xf7,
+	0xfc, 0xa2, 0x83, 0x19,
+	0xdb, 0x1a, 0x3e, 0x94,
+	0x6b, 0x9e, 0x38, 0xd9,
+	0x2c, 0x9c, 0x03, 0xd1,
+	0xad, 0x99, 0x44, 0xa7,
+	0xd9, 0x56, 0x3d, 0x59,
+	0x06, 0x3c, 0x25, 0xf3,
+	0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+/**
+ * Obtain a RSS context with specified properties.
+ *
+ * Used when creating a flow rule targeting one or several Rx queues.
+ *
+ * A context matching all the properties is reused with its reference count
+ * incremented; otherwise a new one is allocated and linked to @p priv.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param fields
+ *   Fields for RSS processing (Verbs format).
+ * @param[in] key
+ *   Hash key to use (whose size is exactly MLX4_RSS_HASH_KEY_SIZE).
+ * @param queues
+ *   Number of target queues.
+ * @param[in] queue_id
+ *   Target queues.
+ *
+ * @return
+ *   Pointer to RSS context on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx4_rss *
+mlx4_rss_get(struct priv *priv, uint64_t fields,
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     uint16_t queues, const uint16_t queue_id[])
+{
+	const size_t queue_id_size = sizeof(queue_id[0]) * queues;
+	struct mlx4_rss *rss;
+
+	LIST_FOREACH(rss, &priv->rss, next) {
+		if (fields != rss->fields || queues != rss->queues)
+			continue;
+		if (memcmp(key, rss->key, MLX4_RSS_HASH_KEY_SIZE) ||
+		    memcmp(queue_id, rss->queue_id, queue_id_size))
+			continue;
+		++rss->refcnt;
+		return rss;
+	}
+	rss = rte_malloc(__func__, offsetof(struct mlx4_rss, queue_id) +
+			 queue_id_size, 0);
+	if (rss == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	*rss = (struct mlx4_rss){
+		.priv = priv,
+		.refcnt = 1,
+		.usecnt = 0,
+		.qp = NULL,
+		.ind = NULL,
+		.fields = fields,
+		.queues = queues,
+	};
+	memcpy(rss->key, key, MLX4_RSS_HASH_KEY_SIZE);
+	memcpy(rss->queue_id, queue_id, queue_id_size);
+	LIST_INSERT_HEAD(&priv->rss, rss, next);
+	return rss;
+}
+
+/**
+ * Release a RSS context instance.
+ *
+ * Used when destroying a flow rule targeting one or several Rx queues.
+ *
+ * The reference count of the context is decremented and the context is
+ * destroyed once it reaches 0. It must have no users left at that point,
+ * i.e. every prior mlx4_rss_attach() call must have been followed by a
+ * matching mlx4_rss_detach() call.
+ *
+ * @param rss
+ *   RSS context to release.
+ */
+void
+mlx4_rss_put(struct mlx4_rss *rss)
+{
+	assert(rss->refcnt);
+	if (--rss->refcnt == 0) {
+		assert(!rss->usecnt);
+		assert(!rss->qp);
+		assert(!rss->ind);
+		LIST_REMOVE(rss, next);
+		rte_free(rss);
+	}
+}
+
+/**
+ * Attach a user to a RSS context instance.
+ *
+ * Used when the RSS QP and indirection table objects must be instantiated
+ * (a flow rule must be enabled). Increments the usage count of the context.
+ *
+ * @param rss
+ *   RSS context to attach to.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_attach(struct mlx4_rss *rss)
+{
+	assert(rss->refcnt);
+	if (rss->usecnt++) {
+		/* Not the first user: the objects must already exist. */
+		assert(rss->qp);
+		assert(rss->ind);
+		return 0;
+	}
+
+	struct ibv_wq *ind_tbl[rss->queues];
+	struct priv *priv = rss->priv;
+	const char *msg;
+	unsigned int i = 0;
+	int ret;
+
+	if (!rte_is_power_of_2(RTE_DIM(ind_tbl))) {
+		ret = EINVAL;
+		msg = "number of RSS queues must be a power of two";
+		goto error;
+	}
+	for (i = 0; i != RTE_DIM(ind_tbl); ++i) {
+		uint16_t id = rss->queue_id[i];
+		struct rxq *rxq = NULL;
+
+		if (id < priv->dev->data->nb_rx_queues)
+			rxq = priv->dev->data->rx_queues[id];
+		if (!rxq) {
+			ret = EINVAL;
+			msg = "RSS target queue is not configured";
+			goto error;
+		}
+		ret = mlx4_rxq_attach(rxq);
+		if (ret) {
+			ret = -ret;
+			msg = "unable to attach RSS target queue";
+			goto error;
+		}
+		ind_tbl[i] = rxq->wq;
+	}
+	rss->ind = mlx4_glue->create_rwq_ind_table
+		(priv->ctx,
+		 &(struct ibv_rwq_ind_table_init_attr){
+			.log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)),
+			.ind_tbl = ind_tbl,
+			.comp_mask = 0,
+		 });
+	if (!rss->ind) {
+		ret = errno ? errno : EINVAL;
+		msg = "RSS indirection table creation failure";
+		goto error;
+	}
+	rss->qp = mlx4_glue->create_qp_ex
+		(priv->ctx,
+		 &(struct ibv_qp_init_attr_ex){
+			.comp_mask = (IBV_QP_INIT_ATTR_PD |
+				      IBV_QP_INIT_ATTR_RX_HASH |
+				      IBV_QP_INIT_ATTR_IND_TABLE),
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.pd = priv->pd,
+			.rwq_ind_tbl = rss->ind,
+			.rx_hash_conf = {
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+				.rx_hash_key = rss->key,
+				.rx_hash_fields_mask = rss->fields,
+			},
+		 });
+	if (!rss->qp) {
+		ret = errno ? errno : EINVAL;
+		msg = "RSS hash QP creation failure";
+		goto error;
+	}
+	ret = mlx4_glue->modify_qp
+		(rss->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_INIT,
+			.port_num = priv->port,
+		 },
+		 IBV_QP_STATE | IBV_QP_PORT);
+	if (ret) {
+		msg = "failed to switch RSS hash QP to INIT state";
+		goto error;
+	}
+	ret = mlx4_glue->modify_qp
+		(rss->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTR,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		msg = "failed to switch RSS hash QP to RTR state";
+		goto error;
+	}
+	return 0;
+error:
+	if (rss->qp) {
+		claim_zero(mlx4_glue->destroy_qp(rss->qp));
+		rss->qp = NULL;
+	}
+	if (rss->ind) {
+		claim_zero(mlx4_glue->destroy_rwq_ind_table(rss->ind));
+		rss->ind = NULL;
+	}
+	/* Detach the i target queues that were attached before the failure. */
+	while (i--)
+		mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]);
+	ERROR("mlx4: %s", msg);
+	--rss->usecnt;
+	rte_errno = ret;
+	return -ret;
+}
+
+/**
+ * Detach a user from a RSS context instance.
+ *
+ * Used when disabling (not destroying) a flow rule. Decrements the usage
+ * count of the context and destroys usage resources after reaching 0.
+ *
+ * @param rss
+ *   RSS context to detach from.
+ */
+void
+mlx4_rss_detach(struct mlx4_rss *rss)
+{
+	struct priv *priv = rss->priv;
+	unsigned int n;
+
+	assert(rss->refcnt);
+	assert(rss->qp);
+	assert(rss->ind);
+	rss->usecnt -= 1;
+	if (rss->usecnt != 0)
+		return;
+	/* Last user is gone: destroy the QP, then the indirection table. */
+	claim_zero(mlx4_glue->destroy_qp(rss->qp));
+	rss->qp = NULL;
+	claim_zero(mlx4_glue->destroy_rwq_ind_table(rss->ind));
+	rss->ind = NULL;
+	for (n = 0; n != rss->queues; ++n)
+		mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[n]]);
+}
+
+/**
+ * Initialize common RSS context resources.
+ *
+ * Because ConnectX-3 hardware limitations require a fixed order in the
+ * indirection table, WQs must be allocated sequentially to be part of a
+ * common RSS context. Since a newly created WQ cannot be moved to a
+ * different context, this function allocates them all at once, one for each
+ * configured Rx queue, as well as all related resources (CQs and mbufs).
+ *
+ * This must therefore be done before creating any Rx flow rules relying on
+ * indirection tables.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_init(struct priv *priv)
+{
+	struct rte_eth_dev *dev = priv->dev;
+	uint8_t log2_range = rte_log2_u32(dev->data->nb_rx_queues);
+	uint32_t wq_num_prev = 0;
+	const char *msg;
+	unsigned int i;
+	int ret;
+
+	/* Common resources are set up only once. */
+	if (priv->rss_init)
+		return 0;
+	if (priv->dev->data->nb_rx_queues > priv->hw_rss_max_qps) {
+		ERROR("RSS does not support more than %d queues",
+		      priv->hw_rss_max_qps);
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	/* Prepare range for RSS contexts before creating the first WQ. */
+	ret = mlx4_glue->dv_set_context_attr
+		(priv->ctx,
+		 MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ,
+		 &log2_range);
+	if (ret) {
+		ERROR("cannot set up range size for RSS context to %u"
+		      " (for %u Rx queues), error: %s",
+		      1 << log2_range, dev->data->nb_rx_queues, strerror(ret));
+		rte_errno = ret;
+		return -ret;
+	}
+	for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+		struct ibv_cq *cq;
+		struct ibv_wq *wq;
+		uint32_t wq_num;
+
+		/* Attach the configured Rx queues. */
+		if (rxq) {
+			assert(!rxq->usecnt);
+			ret = mlx4_rxq_attach(rxq);
+			if (!ret) {
+				wq_num = rxq->wq->wq_num;
+				goto wq_num_check;
+			}
+			ret = -ret;
+			msg = "unable to create Rx queue resources";
+			goto error;
+		}
+		/*
+		 * WQs are temporarily allocated for unconfigured Rx queues
+		 * to maintain proper index alignment in indirection table
+		 * by skipping unused WQ numbers.
+		 *
+		 * The reason this works at all even though these WQs are
+		 * immediately destroyed is that WQNs are allocated
+		 * sequentially and are guaranteed to never be reused in the
+		 * same context by the underlying implementation.
+		 */
+		cq = mlx4_glue->create_cq(priv->ctx, 1, NULL, NULL, 0);
+		if (!cq) {
+			ret = ENOMEM;
+			msg = "placeholder CQ creation failure";
+			goto error;
+		}
+		wq = mlx4_glue->create_wq
+			(priv->ctx,
+			 &(struct ibv_wq_init_attr){
+				.wq_type = IBV_WQT_RQ,
+				.max_wr = 1,
+				.max_sge = 1,
+				.pd = priv->pd,
+				.cq = cq,
+			 });
+		if (wq) {
+			wq_num = wq->wq_num;
+			claim_zero(mlx4_glue->destroy_wq(wq));
+		} else {
+			wq_num = 0; /* Shut up GCC 4.8 warnings. */
+		}
+		claim_zero(mlx4_glue->destroy_cq(cq));
+		if (!wq) {
+			ret = ENOMEM;
+			msg = "placeholder WQ creation failure";
+			goto error;
+		}
+wq_num_check:
+		/*
+		 * While guaranteed by the implementation, make sure WQ
+		 * numbers are really sequential (as the saying goes,
+		 * trust, but verify).
+		 */
+		if (i && wq_num - wq_num_prev != 1) {
+			if (rxq)
+				mlx4_rxq_detach(rxq);
+			ret = ERANGE;
+			msg = "WQ numbers are not sequential";
+			goto error;
+		}
+		wq_num_prev = wq_num;
+	}
+	priv->rss_init = 1;
+	return 0;
+error:
+	ERROR("cannot initialize common RSS resources (queue %u): %s: %s",
+	      i, msg, strerror(ret));
+	/* Detach the configured queues handled before the failing index. */
+	while (i--) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+		if (rxq)
+			mlx4_rxq_detach(rxq);
+	}
+	rte_errno = ret;
+	return -ret;
+}
+
+/**
+ * Release common RSS context resources.
+ *
+ * Counterpart of mlx4_rss_init(): it must run once all flow rules relying
+ * on indirection tables have been removed.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+mlx4_rss_deinit(struct priv *priv)
+{
+	unsigned int i;
+
+	if (!priv->rss_init)
+		return;
+	for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+		if (rxq == NULL)
+			continue;
+		assert(rxq->usecnt == 1);
+		mlx4_rxq_detach(rxq);
+	}
+	priv->rss_init = 0;
+}
+
+/**
+ * Attach a user to a Rx queue.
+ *
+ * Used when the resources of an Rx queue must be instantiated for it to
+ * become usable. This function increments the usage count of the Rx queue.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rxq_attach(struct rxq *rxq)
+{
+	if (rxq->usecnt++) {
+		/* Not the first user: the resources must already exist. */
+		assert(rxq->cq);
+		assert(rxq->wq);
+		assert(rxq->wqes);
+		assert(rxq->rq_db);
+		return 0;
+	}
+
+	struct priv *priv = rxq->priv;
+	struct rte_eth_dev *dev = priv->dev;
+	const uint32_t elts_n = 1 << rxq->elts_n;
+	const uint32_t sges_n = 1 << rxq->sges_n;
+	struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_rwq dv_rwq;
+	struct mlx4dv_cq dv_cq = { .comp_mask = MLX4DV_CQ_MASK_UAR, };
+	const char *msg;
+	struct ibv_cq *cq = NULL;
+	struct ibv_wq *wq = NULL;
+	uint32_t create_flags = 0;
+	uint32_t comp_mask = 0;
+	volatile struct mlx4_wqe_data_seg (*wqes)[];
+	unsigned int i;
+	int ret;
+
+	assert(rte_is_power_of_2(elts_n));
+	cq = mlx4_glue->create_cq(priv->ctx, elts_n / sges_n, NULL,
+				  rxq->channel, 0);
+	if (!cq) {
+		ret = ENOMEM;
+		msg = "CQ creation failure";
+		goto error;
+	}
+	/* By default, FCS (CRC) is stripped by hardware. */
+	if (rxq->crc_present) {
+		create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+		comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+	}
+	wq = mlx4_glue->create_wq
+		(priv->ctx,
+		 &(struct ibv_wq_init_attr){
+			.wq_type = IBV_WQT_RQ,
+			.max_wr = elts_n / sges_n,
+			.max_sge = sges_n,
+			.pd = priv->pd,
+			.cq = cq,
+			.comp_mask = comp_mask,
+			.create_flags = create_flags,
+		 });
+	if (!wq) {
+		ret = errno ? errno : EINVAL;
+		msg = "WQ creation failure";
+		goto error;
+	}
+	ret = mlx4_glue->modify_wq
+		(wq,
+		 &(struct ibv_wq_attr){
+			.attr_mask = IBV_WQ_ATTR_STATE,
+			.wq_state = IBV_WQS_RDY,
+		 });
+	if (ret) {
+		msg = "WQ state change to IBV_WQS_RDY failed";
+		goto error;
+	}
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.rwq.in = wq;
+	mlxdv.rwq.out = &dv_rwq;
+	ret = mlx4_glue->dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ);
+	if (ret) {
+		msg = "failed to obtain device information from WQ/CQ objects";
+		goto error;
+	}
+	/* Pre-register Rx mempool; the returned status is not checked. */
+	DEBUG("port %u Rx queue %u registering mp %s having %u chunks",
+	      priv->dev->data->port_id, rxq->stats.idx,
+	      rxq->mp->name, rxq->mp->nb_mem_chunks);
+	mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp);
+	wqes = (volatile struct mlx4_wqe_data_seg (*)[])
+		((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
+	for (i = 0; i != RTE_DIM(*elts); ++i) {
+		volatile struct mlx4_wqe_data_seg *scat = &(*wqes)[i];
+		struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
+
+		if (buf == NULL) {
+			/* Release the mbufs allocated so far. */
+			while (i--) {
+				rte_pktmbuf_free_seg((*elts)[i]);
+				(*elts)[i] = NULL;
+			}
+			ret = ENOMEM;
+			msg = "cannot allocate mbuf";
+			goto error;
+		}
+		/* Headroom is reserved by rte_pktmbuf_alloc(). */
+		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
+		/* Buffer is supposed to be empty. */
+		assert(rte_pktmbuf_data_len(buf) == 0);
+		assert(rte_pktmbuf_pkt_len(buf) == 0);
+		/* Only the first segment keeps headroom. */
+		if (i % sges_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+								  uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(buf->data_len),
+			.lkey = mlx4_rx_mb2mr(rxq, buf),
+		};
+		(*elts)[i] = buf;
+	}
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n / sges_n);
+	rxq->cq = cq;
+	rxq->wq = wq;
+	rxq->wqes = wqes;
+	rxq->rq_db = dv_rwq.rdb;
+	rxq->mcq.buf = dv_cq.buf.buf;
+	rxq->mcq.cqe_cnt = dv_cq.cqe_cnt;
+	rxq->mcq.set_ci_db = dv_cq.set_ci_db;
+	rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
+	rxq->mcq.arm_db = dv_cq.arm_db;
+	rxq->mcq.arm_sn = dv_cq.arm_sn;
+	rxq->mcq.cqn = dv_cq.cqn;
+	rxq->mcq.cq_uar = dv_cq.cq_uar;
+	rxq->mcq.cq_db_reg = (uint8_t *)dv_cq.cq_uar + MLX4_CQ_DOORBELL;
+	/* Update doorbell counter. */
+	rxq->rq_ci = elts_n / sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	return 0;
+error:
+	if (wq)
+		claim_zero(mlx4_glue->destroy_wq(wq));
+	if (cq)
+		claim_zero(mlx4_glue->destroy_cq(cq));
+	--rxq->usecnt;
+	rte_errno = ret;
+	ERROR("error while attaching Rx queue %p: %s: %s",
+	      (void *)rxq, msg, strerror(ret));
+	return -ret;
+}
+
+/**
+ * Detach a user from a Rx queue.
+ *
+ * This function decrements the usage count of the Rx queue and, once it
+ * reaches 0, destroys its WQ and CQ and frees the mbufs held in elts.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ */
+void
+mlx4_rxq_detach(struct rxq *rxq)
+{
+	unsigned int i;
+	struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts;
+
+	if (--rxq->usecnt)
+		return;
+	rxq->rq_ci = 0;
+	memset(&rxq->mcq, 0, sizeof(rxq->mcq));
+	rxq->rq_db = NULL;
+	rxq->wqes = NULL;
+	claim_zero(mlx4_glue->destroy_wq(rxq->wq));
+	rxq->wq = NULL;
+	claim_zero(mlx4_glue->destroy_cq(rxq->cq));
+	rxq->cq = NULL;
+	DEBUG("%p: freeing Rx queue elements", (void *)rxq);
+	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+		if (!(*elts)[i])
+			continue;
+		rte_pktmbuf_free_seg((*elts)[i]);
+		(*elts)[i] = NULL;
+	}
+}
+
+/**
+ * Returns the per-queue supported Rx offloads.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   Supported Rx offloads (checksum only if priv->hw_csum is set).
+ */
+uint64_t
+mlx4_get_rx_queue_offloads(struct priv *priv)
+{
+	uint64_t offloads = DEV_RX_OFFLOAD_SCATTER |
+			    DEV_RX_OFFLOAD_CRC_STRIP |
+			    DEV_RX_OFFLOAD_KEEP_CRC |
+			    DEV_RX_OFFLOAD_JUMBO_FRAME;
+
+	if (priv->hw_csum)
+		offloads |= DEV_RX_OFFLOAD_CHECKSUM;
+	return offloads;
+}
+
+/**
+ * Returns the per-port supported Rx offloads.
+ *
+ * @param priv
+ *   Pointer to private structure (currently unused).
+ *
+ * @return
+ *   Supported Rx offloads.
+ */
+uint64_t
+mlx4_get_rx_port_offloads(struct priv *priv)
+{
+	uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
+
+	(void)priv;
+	return offloads;
+}
+
+/**
+ * DPDK callback to configure a Rx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ * @param desc
+ *   Number of descriptors (rounded up to the next power of two).
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Rx queue configuration; only its offloads field is read here.
+ * @param mp
+ *   Memory pool for buffer allocations.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_rxconf *conf,
+		    struct rte_mempool *mp)
+{
+	struct priv *priv = dev->data->dev_private;
+	uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
+	struct rte_mbuf *(*elts)[rte_align32pow2(desc)];
+	struct rxq *rxq;
+	struct mlx4_malloc_vec vec[] = {
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*rxq),
+			.addr = (void **)&rxq,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*elts),
+			.addr = (void **)&elts,
+		},
+	};
+	int ret;
+	uint32_t crc_present;
+	uint64_t offloads;
+
+	offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+
+	if (idx >= dev->data->nb_rx_queues) {
+		rte_errno = EOVERFLOW;
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, dev->data->nb_rx_queues);
+		return -rte_errno;
+	}
+	rxq = dev->data->rx_queues[idx];
+	if (rxq) {
+		rte_errno = EEXIST;
+		ERROR("%p: Rx queue %u already configured, release it first",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	if (!desc) {
+		rte_errno = EINVAL;
+		ERROR("%p: invalid number of Rx descriptors", (void *)dev);
+		return -rte_errno;
+	}
+	if (desc != RTE_DIM(*elts)) {
+		desc = RTE_DIM(*elts);
+		WARN("%p: increased number of descriptors in Rx queue %u"
+		     " to the next power of two (%u)",
+		     (void *)dev, idx, desc);
+	}
+	/* By default, FCS (CRC) is stripped by hardware. */
+	crc_present = 0;
+	if (rte_eth_dev_must_keep_crc(offloads)) {
+		if (priv->hw_fcs_strip) {
+			crc_present = 1;
+		} else {
+			WARN("%p: CRC stripping has been disabled but will still"
+			     " be performed by hardware, make sure MLNX_OFED and"
+			     " firmware are up to date",
+			     (void *)dev);
+		}
+	}
+	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
+	      " incoming frames to hide it",
+	      (void *)dev,
+	      crc_present ? "disabled" : "enabled",
+	      crc_present << 2);
+	/* Allocate and initialize Rx queue. */
+	mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
+	if (!rxq) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	*rxq = (struct rxq){
+		.priv = priv,
+		.mp = mp,
+		.port_id = dev->data->port_id,
+		.sges_n = 0,
+		.elts_n = rte_log2_u32(desc),
+		.elts = elts,
+		/* Toggle Rx checksum offload if hardware supports it. */
+		.csum = priv->hw_csum &&
+			(offloads & DEV_RX_OFFLOAD_CHECKSUM),
+		.csum_l2tun = priv->hw_csum_l2tun &&
+			      (offloads & DEV_RX_OFFLOAD_CHECKSUM),
+		.crc_present = crc_present,
+		.l2tun_offload = priv->hw_csum_l2tun,
+		.stats = {
+			.idx = idx,
+		},
+		.socket = socket,
+	};
+	/* Enable scattered packets support for this queue if necessary. */
+	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
+	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
+		;
+	} else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
+		uint32_t size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		uint32_t sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len));
+		rxq->sges_n = sges_n;
+		/* Make sure sges_n did not overflow. */
+		size = mb_len * (1 << rxq->sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			rte_errno = EOVERFLOW;
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			goto error;
+		}
+	} else {
+		WARN("%p: the requested maximum Rx packet size (%u) is"
+		     " larger than a single mbuf (%u) and scattered"
+		     " mode has not been requested",
+		     (void *)dev,
+		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
+		     mb_len - RTE_PKTMBUF_HEADROOM);
+	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << rxq->sges_n);
+	if (desc % (1 << rxq->sges_n)) {
+		rte_errno = EINVAL;
+		ERROR("%p: number of Rx queue descriptors (%u) is not a"
+		      " multiple of maximum segments per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << rxq->sges_n);
+		goto error;
+	}
+	if (mlx4_mr_btree_init(&rxq->mr_ctrl.cache_bh,
+			       MLX4_MR_BTREE_CACHE_N, socket)) {
+		/* rte_errno is already set. */
+		goto error;
+	}
+	if (dev->data->dev_conf.intr_conf.rxq) {
+		rxq->channel = mlx4_glue->create_comp_channel(priv->ctx);
+		if (rxq->channel == NULL) {
+			rte_errno = ENOMEM;
+			ERROR("%p: Rx interrupt completion channel creation"
+			      " failure: %s",
+			      (void *)dev, strerror(rte_errno));
+			goto error;
+		}
+		if (mlx4_fd_set_non_blocking(rxq->channel->fd) < 0) {
+			ERROR("%p: unable to make Rx interrupt completion"
+			      " channel non-blocking: %s",
+			      (void *)dev, strerror(rte_errno));
+			goto error;
+		}
+	}
+	DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq);
+	dev->data->rx_queues[idx] = rxq;
+	return 0;
+error:
+	dev->data->rx_queues[idx] = NULL;
+	ret = rte_errno;
+	mlx4_rx_queue_release(rxq);
+	rte_errno = ret;
+	assert(rte_errno > 0);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Rx queue.
+ *
+ * @param dpdk_rxq
+ *   Generic Rx queue pointer; NULL is accepted and ignored.
+ */
+void
+mlx4_rx_queue_release(void *dpdk_rxq)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct priv *priv;
+	unsigned int i;
+
+	if (rxq == NULL)
+		return;
+	priv = rxq->priv;
+	for (i = 0; i != priv->dev->data->nb_rx_queues; ++i)
+		if (priv->dev->data->rx_queues[i] == rxq) {
+			DEBUG("%p: removing Rx queue %p from list",
+			      (void *)priv->dev, (void *)rxq);
+			priv->dev->data->rx_queues[i] = NULL;
+			break;
+		}
+	assert(!rxq->cq);
+	assert(!rxq->wq);
+	assert(!rxq->wqes);
+	assert(!rxq->rq_db);
+	if (rxq->channel)
+		claim_zero(mlx4_glue->destroy_comp_channel(rxq->channel));
+	mlx4_mr_btree_free(&rxq->mr_ctrl.cache_bh);
+	rte_free(rxq);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644
index 00000000..8c88effc
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.c
@@ -0,0 +1,1394 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_io.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Pointer-value pair structure used in tx_post_send for saving the first
+ * DWORD (32 bits) of a TXBB.
+ */
+struct pv {
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
+	uint32_t val;
+};
+
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 bits) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** Size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_size;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
+/** A table to translate Rx completion flags to packet type. */
+uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
+	/*
+	 * The index to the array should have:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPF
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 * giving up to 256 entries; entries not listed are implicitly zero.
+	 */
+	/* L2 */
+	[0x00] = RTE_PTYPE_L2_ETHER,
+	/* L3 */
+	[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_NONFRAG,
+	[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_NONFRAG,
+	[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_NONFRAG,
+	[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_NONFRAG,
+	[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_FRAG,
+	[0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_FRAG,
+	/* TCP */
+	[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_TCP,
+	[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_TCP,
+	[0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_TCP,
+	[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_TCP,
+	/* UDP */
+	[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_UDP,
+	[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_UDP,
+	[0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_UDP,
+	[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_UDP,
+	/* Tunneled - L3 IPV6 */
+	[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+	[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV6, TCP */
+	[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
+	[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV6, UDP */
+	[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+	/* Tunneled - L3 IPV4 */
+	[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+	[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_NONFRAG,
+	[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV4, TCP */
+	[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV4, UDP */
+	[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+};
+
+/**
+ * Stamp TXBB burst so it won't be reused by the HW.
+ *
+ * Routine is used when freeing a WQE consumed by the chip or when building
+ * a WQ entry has failed, leaving partial information on the queue.
+ *
+ * @param sq
+ *   Pointer to the SQ structure.
+ * @param start
+ *   Pointer to the first TXBB to stamp.
+ * @param end
+ *   Pointer to the TXBB following the last one to stamp (not stamped).
+ *
+ * @return
+ *   Stamping burst size in byte units.
+ */
+static uint32_t
+mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
+			 volatile uint32_t *end)
+{
+	uint32_t stamp = sq->stamp;
+	int32_t size = (intptr_t)end - (intptr_t)start;
+
+	assert(start != end);
+	/* Handle SQ ring wrap around. */
+	if (size < 0) {
+		size = (int32_t)sq->size + size;
+		do {
+			*start = stamp;
+			start += MLX4_SQ_STAMP_DWORDS;
+		} while (start != (volatile uint32_t *)sq->eob);
+		start = (volatile uint32_t *)sq->buf;
+		/* Flip invalid stamping ownership. */
+		stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
+		sq->stamp = stamp;
+		if (start == end)
+			return size;
+	}
+	do {
+		*start = stamp;
+		start += MLX4_SQ_STAMP_DWORDS;
+	} while (start != end);
+	return (uint32_t)size;
+}
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param elts_m
+ *   Mask used to wrap element indices into the Tx elements ring.
+ * @param sq
+ *   Pointer to the SQ structure.
+ */
+static void
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
+		  struct mlx4_sq *sq)
+{
+	unsigned int elts_tail = txq->elts_tail;
+	struct mlx4_cq *cq = &txq->mcq;
+	volatile struct mlx4_cqe *cqe;
+	uint32_t completed;
+	uint32_t cons_index = cq->cons_index;
+	volatile uint32_t *first_txbb;
+
+	/*
+	 * Traverse over all CQ entries reported and handle each WQ entry
+	 * reported by them.
+	 */
+	do {
+		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+		    !!(cons_index & cq->cqe_cnt)))
+			break;
+#ifndef NDEBUG
+		/*
+		 * Make sure we read the CQE after we read the ownership bit.
+		 */
+		rte_io_rmb();
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+			     MLX4_CQE_OPCODE_ERROR)) {
+			volatile struct mlx4_err_cqe *cqe_err =
+				(volatile struct mlx4_err_cqe *)cqe;
+			ERROR("%p CQE error - vendor syndrome: 0x%x"
+			      " syndrome: 0x%x\n",
+			      (void *)txq, cqe_err->vendor_err,
+			      cqe_err->syndrome);
+			break;
+		}
+#endif /* NDEBUG */
+		cons_index++;
+	} while (1);
+	completed = (cons_index - cq->cons_index) * txq->elts_comp_cd_init;
+	if (unlikely(!completed))
+		return;
+	/* First stamping address is the end of the last one. */
+	first_txbb = (&(*txq->elts)[elts_tail & elts_m])->eocb;
+	elts_tail += completed;
+	/* The new tail element holds the end address. */
+	sq->remain_size += mlx4_txq_stamp_freed_wqe(sq, first_txbb,
+		(&(*txq->elts)[elts_tail & elts_m])->eocb);
+	/* Update CQ consumer index. */
+	cq->cons_index = cons_index;
+	*cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK);
+	txq->elts_tail = elts_tail;
+}
+
+/**
+ * Write Tx data segment to the SQ.
+ *
+ * @param dseg
+ *   Pointer to data segment in SQ.
+ * @param lkey
+ *   Memory region lkey.
+ * @param addr
+ *   Data address.
+ * @param byte_count
+ *   Byte count of the data to send, already in big-endian order.
+ */
+static inline void
+mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
+		       uint32_t lkey, uintptr_t addr, rte_be32_t  byte_count)
+{
+	dseg->addr = rte_cpu_to_be_64(addr);
+	dseg->lkey = lkey;
+#if RTE_CACHE_LINE_SIZE < 64
+	/*
+	 * Need a barrier here before writing the byte_count
+	 * fields to make sure that all the data is visible
+	 * before the byte_count field is set.
+	 * Otherwise, if the segment begins a new cacheline,
+	 * the HCA prefetcher could grab the 64-byte chunk and
+	 * get a valid (!= 0xffffffff) byte count but stale
+	 * data, and end up sending the wrong data.
+	 */
+	rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
+	dseg->byte_count = byte_count;
+}
+
+/**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, -EINVAL or -ENOMEM upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_size +=
+				buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_size == 0 ||
+		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_size > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_size,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+			(volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl + tinfo->wqe_size);
+	uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
+	uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
+						      tinfo->tso_header_size);
+
+	do {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb = (MLX4_TXBB_SIZE -
+				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+			       MLX4_SEG_SHIFT;
+		switch (nb_segs_txbb) {
+#ifndef NDEBUG
+		default:
+			/* Should never happen. */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+			(void *)txq, nb_segs_txbb);
+			/* rte_panic never returns. */
+			break;
+#endif /* NDEBUG */
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			dseg->addr = rte_cpu_to_be_64(data_addr);
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* Prepare next buf info */
+			sbuf = sbuf->next;
+			dseg++;
+			data_len = sbuf->data_len;
+			data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+			/* fallthrough */
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	} while (true);
+err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the first data segment (not wrapped around the SQ end).
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_size = tinfo->tso_header_size;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+		/* Copy to end of txbb. */
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		pv[(*pv_counter)++].val = *(uint32_t *)from;
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_size -= txbb_avail_space + sizeof(uint32_t);
+		/* Avail space in new TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	}
+	if (remain_size > txbb_avail_space) {
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		remain_size -= txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+		(*pv_counter)++;
+	} else if (remain_size) {
+		rte_memcpy(thdr.to, from, remain_size);
+	}
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_size);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (likely(tinfo.pv_counter)) {
+		pv = tinfo.pv;
+		pv_counter = tinfo.pv_counter;
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		do {
+			--pv_counter;
+			*pv[pv_counter].dst = pv[pv_counter].val;
+		} while (pv_counter > 0);
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
+ * Write data segments of multi-segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+		   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	uint32_t lkey;
+	int pv_counter = 0;
+	int nb_segs = buf->nb_segs;
+	uint32_t wqe_size;
+	volatile struct mlx4_wqe_data_seg *dseg =
+		(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
+
+	ctrl->fence_size = 1 + nb_segs; /* 1 control + N data segments. */
+	wqe_size = RTE_ALIGN((uint32_t)(ctrl->fence_size << MLX4_SEG_SHIFT),
+			     MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < wqe_size ||
+	    wqe_size > MLX4_MAX_WQE_SIZE)
+		return NULL;
+	/*
+	 * Fill the data segments with buffer information.
+	 * First WQE TXBB head segment is always control segment,
+	 * so jump to tail TXBB data segments code for the first
+	 * WQE data segments filling.
+	 */
+	goto txbb_tail_segs;
+txbb_head_seg:
+	/* Memory region key (big endian) for this memory pool. */
+	lkey = mlx4_tx_mb2mr(txq, sbuf);
+	if (unlikely(lkey == (uint32_t)-1)) {
+		DEBUG("%p: unable to get MP <-> MR association",
+		      (void *)txq);
+		return NULL;
+	}
+	/* Handle WQE wraparound. */
+	if (dseg >=
+		(volatile struct mlx4_wqe_data_seg *)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+			sq->buf;
+	dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
+	dseg->lkey = lkey;
+	/*
+	 * This data segment starts at the beginning of a new
+	 * TXBB, so we need to postpone its byte_count writing
+	 * for later.
+	 */
+	pv[pv_counter].dseg = dseg;
+	/*
+	 * Zero length segment is treated as inline segment
+	 * with zero data.
+	 */
+	pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ?
+						sbuf->data_len : 0x80000000);
+	sbuf = sbuf->next;
+	dseg++;
+	nb_segs--;
+txbb_tail_segs:
+	/* Jump to default if there are more than two segments remaining. */
+	switch (nb_segs) {
+	default:
+		lkey = mlx4_tx_mb2mr(txq, sbuf);
+		if (unlikely(lkey == (uint32_t)-1)) {
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			return NULL;
+		}
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		sbuf = sbuf->next;
+		dseg++;
+		nb_segs--;
+		/* fallthrough */
+	case 2:
+		lkey = mlx4_tx_mb2mr(txq, sbuf);
+		if (unlikely(lkey == (uint32_t)-1)) {
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			return NULL;
+		}
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		sbuf = sbuf->next;
+		dseg++;
+		nb_segs--;
+		/* fallthrough */
+	case 1:
+		lkey = mlx4_tx_mb2mr(txq, sbuf);
+		if (unlikely(lkey == (uint32_t)-1)) {
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			return NULL;
+		}
+		mlx4_fill_tx_data_seg(dseg, lkey,
+				      rte_pktmbuf_mtod(sbuf, uintptr_t),
+				      rte_cpu_to_be_32(sbuf->data_len ?
+						       sbuf->data_len :
+						       0x80000000));
+		nb_segs--;
+		if (nb_segs) {
+			sbuf = sbuf->next;
+			dseg++;
+			goto txbb_head_seg;
+		}
+		/* fallthrough */
+	case 0:
+		break;
+	}
+	/* Write the byte_count of each TXBB head segment saved earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	sq->remain_size -= wqe_size;
+	/* Align next WQE address to the next TXBB. */
+	return (volatile struct mlx4_wqe_ctrl_seg *)
+		((volatile uint8_t *)ctrl + wqe_size);
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	unsigned int elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	const unsigned int elts_m = elts_n - 1;
+	unsigned int bytes_sent = 0;
+	unsigned int i;
+	unsigned int max = elts_head - txq->elts_tail;
+	struct mlx4_sq *sq = &txq->msq;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl;
+	struct txq_elt *elt;
+
+	assert(txq->elts_comp_cd != 0);
+	if (likely(max >= txq->elts_comp_cd_init))
+		mlx4_txq_complete(txq, elts_m, sq);
+	max = elts_n - max;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max > pkts_n)
+		max = pkts_n;
+	elt = &(*txq->elts)[elts_head & elts_m];
+	/* First Tx burst element saves the next WQE control segment. */
+	ctrl = elt->wqe;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		struct txq_elt *elt_next = &(*txq->elts)[++elts_head & elts_m];
+		uint32_t owner_opcode = sq->owner_opcode;
+		volatile struct mlx4_wqe_data_seg *dseg =
+				(volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
+		volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
+
+		/* Clean up old buffer. */
+		if (likely(elt->buf != NULL)) {
+			struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+			/* Poisoning. */
+			memset(&elt->buf, 0x66, sizeof(struct rte_mbuf *));
+#endif
+			/* Faster than rte_pktmbuf_free(). */
+			do {
+				struct rte_mbuf *next = tmp->next;
+
+				rte_pktmbuf_free_seg(tmp);
+				tmp = next;
+			} while (tmp != NULL);
+		}
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		if (tso) {
+			/* Change opcode to TSO. */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
+			/* Validate WQE space in the send queue. */
+			if (sq->remain_size < MLX4_TXBB_SIZE) {
+				elt->buf = NULL;
+				break;
+			}
+			lkey = mlx4_tx_mb2mr(txq, buf);
+			if (unlikely(lkey == (uint32_t)-1)) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				elt->buf = NULL;
+				break;
+			}
+			mlx4_fill_tx_data_seg(dseg++, lkey,
+					      rte_pktmbuf_mtod(buf, uintptr_t),
+					      rte_cpu_to_be_32(buf->data_len));
+			/* Set WQE size in 16-byte units. */
+			ctrl->fence_size = 0x2;
+			sq->remain_size -= MLX4_TXBB_SIZE;
+			/* Align next WQE address to the next TXBB. */
+			ctrl_next = ctrl + 0x4;
+		} else {
+			ctrl_next = mlx4_tx_burst_segs(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		}
+		/* Handle SQ ring wrap around. */
+		if ((volatile uint8_t *)ctrl_next >= sq->eob) {
+			ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl_next - sq->size);
+			/* Flip HW valid ownership. */
+			sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
+		}
+		/*
+		 * SOLICIT indicates that no ICRC should be calculated (raw
+		 * Ethernet); CQ_UPDATE is added when the countdown expires.
+		 */
+		if (--txq->elts_comp_cd == 0) {
+			/* Save the completion burst end address. */
+			elt_next->eocb = (volatile uint32_t *)ctrl_next;
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+		}
+		/* Enable HW checksum offload if requested. */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_io_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
+		elt->buf = buf;
+		bytes_sent += buf->pkt_len;
+		ctrl = ctrl_next;
+		elt = elt_next;
+	}
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Save WQE address of the next Tx burst element. */
+	elt->wqe = ctrl;
+	/* Increment send statistics counters. */
+	txq->stats.opackets += i;
+	txq->stats.obytes += bytes_sent;
+	/* Make sure that descriptors are written before doorbell record. */
+	rte_wmb();
+	/* Ring QP doorbell. */
+	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
+	txq->elts_head += i;
+	return i;
+}
+
+/**
+ * Translate Rx completion flags to packet type.
+ *
+ * @param[in] cqe
+ *   Pointer to CQE.
+ * @param l2tun_offload
+ *   Whether L2 tunnel offload is enabled.
+ *
+ * @return
+ *   Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe, uint32_t l2tun_offload)
+{
+	const uint32_t qpn_word = rte_be_to_cpu_32(cqe->vlan_my_qpn);
+	const uint32_t st = rte_be_to_cpu_32(cqe->status);
+	uint8_t slot;
+
+	/*
+	 * Low bits of the 8-bit table index come from the status word:
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPF
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 */
+	slot = (st & MLX4_CQE_STATUS_PTYPE_MASK) >> 22;
+	if (st & MLX4_CQE_STATUS_IPV6)
+		slot |= (st & MLX4_CQE_STATUS_IPV6F) >> 11;
+	/*
+	 * High bits are set only for tunnel packets with the offload on:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 */
+	if (l2tun_offload && (qpn_word & MLX4_CQE_L2_TUNNEL))
+		slot |= ((qpn_word & MLX4_CQE_L2_TUNNEL) >> 20) |
+			((qpn_word & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
+	return mlx4_ptype_table[slot];
+}
+
+/**
+ * Translate Rx completion flags to offload flags.
+ *
+ * @param flags
+ *   Rx completion flags returned by mlx4_cqe_flags().
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   Offload flags (ol_flags) in mbuf format.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
+{
+	uint32_t res = 0;
+
+	/*
+	 * Translate each "checksum OK" completion bit into its mbuf flag.
+	 */
+	if (csum) {
+		res |= mlx4_transpose(flags, MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
+				      PKT_RX_IP_CKSUM_GOOD);
+		res |= mlx4_transpose(flags, MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
+				      PKT_RX_L4_CKSUM_GOOD);
+	}
+	/* Tunnel checksum bits only count when the tunnel bit is set. */
+	if (csum_l2tun && (flags & MLX4_CQE_L2_TUNNEL)) {
+		res |= mlx4_transpose(flags, MLX4_CQE_L2_TUNNEL_IPOK,
+				      PKT_RX_IP_CKSUM_GOOD);
+		res |= mlx4_transpose(flags, MLX4_CQE_L2_TUNNEL_L4_CSUM,
+				      PKT_RX_L4_CKSUM_GOOD);
+	}
+	return res;
+}
+
+/**
+ * Extract checksum information from CQE flags.
+ *
+ * @param cqe
+ *   Pointer to CQE structure.
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   CQE checksum information.
+ */
+static inline uint32_t
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+{
+	const uint32_t l2tun_mask = MLX4_CQE_L2_TUNNEL |
+				    MLX4_CQE_L2_TUNNEL_IPOK |
+				    MLX4_CQE_L2_TUNNEL_L4_CSUM |
+				    MLX4_CQE_L2_TUNNEL_IPV4;
+	uint32_t status_bits = 0;
+	uint32_t l2tun_bits = 0;
+
+	/* Checksum bits are taken from the status word of the CQE. */
+	if (csum)
+		status_bits = rte_be_to_cpu_32(cqe->status) &
+			      MLX4_CQE_STATUS_IPV4_CSUM_OK;
+	/* Tunnel bits are taken from the vlan_my_qpn word of the CQE. */
+	if (csum_l2tun)
+		l2tun_bits = rte_be_to_cpu_32(cqe->vlan_my_qpn) &
+			     l2tun_mask;
+	/* Both sets sit at different bit positions: join them in one word. */
+	return status_bits | l2tun_bits;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param rxq
+ *   Pointer to the receive queue structure.
+ * @param[out] out
+ *   Just polled CQE.
+ *
+ * @return
+ *   Number of bytes of the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
+{
+	struct mlx4_cq *cq = &rxq->mcq;
+	volatile struct mlx4_cqe *cqe =
+		(volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	const int cqe_owner = !!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK);
+	const int ci_phase = !!(cq->cons_index & cq->cqe_cnt);
+	int ret = 0;
+
+	/* A CQE is ready only when both bits agree; otherwise report 0. */
+	if (cqe_owner == ci_phase) {
+		/* Read CQ entry contents only after the ownership check. */
+		rte_rmb();
+		assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+		assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+		       MLX4_CQE_OPCODE_ERROR);
+		ret = rte_be_to_cpu_32(cqe->byte_cnt);
+		/* Consume the entry. */
+		++cq->cons_index;
+	}
+	/* The CQE pointer is handed back in every case. */
+	*out = cqe;
+	return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = dpdk_rxq;
+	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+	const uint16_t sges_n = rxq->sges_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	uint32_t rq_ci = rxq->rq_ci << sges_n;
+	int len = 0;
+
+	while (pkts_n) {
+		volatile struct mlx4_cqe *cqe;
+		uint32_t idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
+
+		/* Update the 'next' pointer of the previous segment. */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
+		rep = rte_mbuf_raw_alloc(rxq->mp);
+		if (unlikely(rep == NULL)) {
+			++rxq->stats.rx_nombuf;
+			if (!pkt) {
+				/*
+				 * No buffers before we even started,
+				 * bail out silently.
+				 */
+				break;
+			}
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
+		}
+		if (!pkt) {
+			/* Looking for the new packet. */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* Rx error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			assert(len >= (rxq->crc_present << 2));
+			/* Update packet information. */
+			pkt->packet_type =
+				rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
+			pkt->ol_flags = PKT_RX_RSS_HASH;
+			pkt->hash.rss = cqe->immed_rss_invalid;
+			if (rxq->crc_present)
+				len -= ETHER_CRC_LEN;
+			pkt->pkt_len = len;
+			if (rxq->csum | rxq->csum_l2tun) {
+				uint32_t flags =
+					mlx4_cqe_flags(cqe,
+						       rxq->csum,
+						       rxq->csum_l2tun);
+
+				/* OR in, so that PKT_RX_RSS_HASH is kept. */
+				pkt->ol_flags |=
+					rxq_cq_to_ol_flags(flags, rxq->csum,
+							   rxq->csum_l2tun);
+			}
+		}
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		/* If there's only one MR, no need to replace LKey in WQE. */
+		if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+			scat->lkey = mlx4_rx_mb2mr(rxq, rep);
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
+		}
+		/* The last segment. */
+		seg->data_len = len;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
+		/* Return packet. */
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sges_n;
+		++rq_ci;
+		rq_ci <<= sges_n;
+	}
+	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
+		return 0;
+	/* Update the consumer index. */
+	rxq->rq_ci = rq_ci >> sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	*rxq->mcq.set_ci_db =
+		rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
+}
+
+/**
+ * Dummy DPDK callback for Tx, standing in for the real one.
+ *
+ * Used during unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure (ignored).
+ * @param[in] pkts
+ *   Packets to transmit (ignored).
+ * @param pkts_n
+ *   Number of packets in array (ignored).
+ *
+ * @return
+ *   Always 0, no packet is ever transmitted.
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	/* Reference every argument to silence unused-parameter warnings. */
+	(void)pkts_n;
+	(void)pkts;
+	(void)dpdk_txq;
+	return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx, standing in for the real one.
+ *
+ * Used during unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure (ignored).
+ * @param[out] pkts
+ *   Array to store received packets (never written).
+ * @param pkts_n
+ *   Maximum number of packets in array (ignored).
+ *
+ * @return
+ *   Always 0, no packet is ever received.
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	/* Reference every argument to silence unused-parameter warnings. */
+	(void)pkts_n;
+	(void)pkts;
+	(void)dpdk_rxq;
+	return 0;
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.h
new file mode 100644
index 00000000..ffa8abfc
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_rxtx.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef MLX4_RXTX_H_
+#define MLX4_RXTX_H_
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev_driver.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+#include "mlx4_mr.h"
+
+/** Rx queue counters. */
+struct mlx4_rxq_stats {
+	unsigned int idx; /**< Mapping index. */
+	uint64_t ipackets; /**< Total of successfully received packets. */
+	uint64_t ibytes; /**< Total of successfully received bytes. */
+	uint64_t idropped; /**< Total of packets dropped on Rx errors. */
+	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
+};
+
+/** Rx queue descriptor. */
+struct rxq {
+	struct priv *priv; /**< Back pointer to private data. */
+	struct rte_mempool *mp; /**< Memory pool for allocations. */
+	struct ibv_cq *cq; /**< Completion queue. */
+	struct ibv_wq *wq; /**< Work queue. */
+	struct ibv_comp_channel *channel; /**< Rx completion channel. */
+	uint16_t rq_ci; /**< Saved RQ consumer index. */
+	uint16_t port_id; /**< Port ID for incoming packets. */
+	uint16_t sges_n; /**< Number of segments per packet (log2 value). */
+	uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+	struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
+	volatile uint32_t *rq_db; /**< RQ doorbell record. */
+	uint32_t csum:1; /**< Enable checksum offloading. */
+	uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+	uint32_t crc_present:1; /**< CRC must be subtracted. */
+	uint32_t l2tun_offload:1; /**< L2 tunnel offload is enabled. */
+	struct mlx4_cq mcq;  /**< Info for directly manipulating the CQ. */
+	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
+	unsigned int socket; /**< CPU socket ID for allocations. */
+	uint32_t usecnt; /**< Number of users relying on queue resources. */
+	uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/** Shared flow target for Rx queues. */
+struct mlx4_rss {
+	LIST_ENTRY(mlx4_rss) next; /**< Next entry in list. */
+	struct priv *priv; /**< Back pointer to private data. */
+	uint32_t refcnt; /**< Reference count for this object. */
+	uint32_t usecnt; /**< Number of users relying on @p qp and @p ind. */
+	struct ibv_qp *qp; /**< Queue pair. */
+	struct ibv_rwq_ind_table *ind; /**< Indirection table. */
+	uint64_t fields; /**< Fields for RSS processing (Verbs format). */
+	uint8_t key[MLX4_RSS_HASH_KEY_SIZE]; /**< Hash key to use. */
+	uint16_t queues; /**< Number of target queues. */
+	uint16_t queue_id[]; /**< Target queue IDs (flexible array member). */
+};
+
+/** Tx element (@p wqe and @p eocb share storage through the union). */
+struct txq_elt {
+	struct rte_mbuf *buf; /**< Buffer. */
+	union {
+		volatile struct mlx4_wqe_ctrl_seg *wqe; /**< SQ WQE. */
+		volatile uint32_t *eocb; /**< End of completion burst. */
+	};
+};
+
+/** Tx queue counters. */
+struct mlx4_txq_stats {
+	unsigned int idx; /**< Mapping index. */
+	uint64_t opackets; /**< Total of successfully sent packets. */
+	uint64_t obytes; /**< Total of successfully sent bytes. */
+	uint64_t odropped; /**< Total of packets that failed to be sent. */
+};
+
+/** Tx queue descriptor. */
+struct txq {
+	struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
+	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+	unsigned int elts_head; /**< Current index in (*elts)[]. */
+	unsigned int elts_tail; /**< First element awaiting completion. */
+	int elts_comp_cd; /**< Countdown for next completion. */
+	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
+	unsigned int elts_n; /**< (*elts)[] length. */
+	struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
+	struct txq_elt (*elts)[]; /**< Tx elements. */
+	struct mlx4_txq_stats stats; /**< Tx queue counters. */
+	uint32_t max_inline; /**< Max inline send size. */
+	uint32_t csum:1; /**< Enable checksum offloading. */
+	uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+	uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
+	uint8_t *bounce_buf;
+	/**< Memory used for storing the first DWORD of data TXBBs. */
+	struct priv *priv; /**< Back pointer to private data. */
+	unsigned int socket; /**< CPU socket ID for allocations. */
+	struct ibv_cq *cq; /**< Completion queue. */
+	struct ibv_qp *qp; /**< Queue pair. */
+	uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/* mlx4_rxq.c */
+/* Declaration only; each includer would otherwise define the array. */
+extern uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
+int mlx4_rss_init(struct priv *priv);
+void mlx4_rss_deinit(struct priv *priv);
+struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      uint16_t queues, const uint16_t queue_id[]);
+void mlx4_rss_put(struct mlx4_rss *rss);
+int mlx4_rss_attach(struct mlx4_rss *rss);
+void mlx4_rss_detach(struct mlx4_rss *rss);
+int mlx4_rxq_attach(struct rxq *rxq);
+void mlx4_rxq_detach(struct rxq *rxq);
+uint64_t mlx4_get_rx_port_offloads(struct priv *priv);
+uint64_t mlx4_get_rx_queue_offloads(struct priv *priv);
+int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			uint16_t desc, unsigned int socket,
+			const struct rte_eth_rxconf *conf,
+			struct rte_mempool *mp);
+void mlx4_rx_queue_release(void *dpdk_rxq);
+
+/* mlx4_rxtx.c */
+
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+/* mlx4_txq.c */
+
+uint64_t mlx4_get_tx_port_offloads(struct priv *priv);
+int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			uint16_t desc, unsigned int socket,
+			const struct rte_eth_txconf *conf);
+void mlx4_tx_queue_release(void *dpdk_txq);
+
+/* mlx4_mr.c */
+
+void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
+uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
+uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+
+/**
+ * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx
+ * as mempool is pre-configured and static.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param addr
+ *   Address to search.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
+{
+	struct mlx4_mr_ctrl *ctrl = &rxq->mr_ctrl;
+	uint32_t hit;
+
+	/*
+	 * Linear search on the MR cache array first; only a miss takes the
+	 * slower bottom-half (binary search).
+	 */
+	hit = mlx4_mr_lookup_cache(ctrl->cache, &ctrl->mru,
+				   MLX4_MR_CACHE_N, addr);
+	return likely(hit != UINT32_MAX) ? hit : mlx4_rx_addr2mr_bh(rxq, addr);
+}
+
+#define mlx4_rx_mb2mr(rxq, mb) mlx4_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
+/**
+ * Query LKey from a packet buffer for Tx. If not found, add the mempool.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param addr
+ *   Address to search.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
+{
+	struct mlx4_mr_ctrl *ctrl = &txq->mr_ctrl;
+	uint32_t hit;
+
+	/* A generation mismatch means MRs changed: flush the local cache. */
+	if (unlikely(*ctrl->dev_gen_ptr != ctrl->cur_gen))
+		mlx4_mr_flush_local_cache(ctrl);
+	/*
+	 * Linear search on the MR cache array first; only a miss takes the
+	 * slower bottom-half (binary search).
+	 */
+	hit = mlx4_mr_lookup_cache(ctrl->cache, &ctrl->mru,
+				   MLX4_MR_CACHE_N, addr);
+	return likely(hit != UINT32_MAX) ? hit : mlx4_tx_addr2mr_bh(txq, addr);
+}
+
+#define mlx4_tx_mb2mr(txq, mb) mlx4_tx_addr2mr(txq, (uintptr_t)((mb)->buf_addr))
+
+#endif /* MLX4_RXTX_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_txq.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_txq.c
new file mode 100644
index 00000000..9aa7440d
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_txq.c
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Tx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <inttypes.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_glue.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Release every mbuf still attached to the elements of a Tx queue.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ */
+static void
+mlx4_txq_free_elts(struct txq *txq)
+{
+	const unsigned int head = txq->elts_head;
+	const unsigned int mask = txq->elts_n - 1; /* Ring index mask. */
+	struct txq_elt (*ring)[txq->elts_n] = txq->elts;
+	unsigned int pos;
+
+	DEBUG("%p: freeing WRs", (void *)txq);
+	for (pos = txq->elts_tail; pos != head; ++pos) {
+		struct txq_elt *elt = &(*ring)[pos & mask];
+
+		assert(elt->buf != NULL);
+		rte_pktmbuf_free(elt->buf);
+		elt->buf = NULL;
+		elt->wqe = NULL;
+	}
+	txq->elts_tail = txq->elts_head;
+}
+
+/**
+ * Copy the direct-verbs QP/CQ layout into the Tx queue's SQ and CQ state.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param mlxdv
+ *   Pointer to device information for this Tx queue.
+ */
+static void
+mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
+{
+	struct mlx4dv_qp *dv_qp = mlxdv->qp.out;
+	struct mlx4dv_cq *dv_cq = mlxdv->cq.out;
+	struct mlx4_sq *sq = &txq->msq;
+	struct mlx4_cq *cq = &txq->mcq;
+	/* Continuous headroom size bytes must always stay freed. */
+	uint32_t reserved = 2048 + (1 << dv_qp->sq.wqe_shift);
+
+	/* Total length, including headroom and spare WQEs. */
+	sq->size = (uint32_t)dv_qp->rq.offset - (uint32_t)dv_qp->sq.offset;
+	sq->buf = (uint8_t *)dv_qp->buf.buf + dv_qp->sq.offset;
+	sq->eob = sq->buf + sq->size;
+	sq->remain_size = sq->size - reserved;
+	sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
+	sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
+				     (0u << MLX4_SQ_OWNER_BIT));
+	sq->db = dv_qp->sdb;
+	sq->doorbell_qpn = dv_qp->doorbell_qpn;
+	cq->buf = dv_cq->buf.buf;
+	cq->cqe_cnt = dv_cq->cqe_cnt;
+	cq->set_ci_db = dv_cq->set_ci_db;
+	cq->cqe_64 = !!(dv_cq->cqe_size & 64);
+}
+
+/**
+ * Report the Tx offloads supported by a port.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   Bit-mask of supported DEV_TX_OFFLOAD_* flags.
+ */
+uint64_t
+mlx4_get_tx_port_offloads(struct priv *priv)
+{
+	/* Multi-segment Tx is reported unconditionally. */
+	uint64_t caps = DEV_TX_OFFLOAD_MULTI_SEGS;
+
+	if (priv->hw_csum)
+		caps |= DEV_TX_OFFLOAD_IPV4_CKSUM |
+			DEV_TX_OFFLOAD_UDP_CKSUM |
+			DEV_TX_OFFLOAD_TCP_CKSUM;
+	if (priv->tso)
+		caps |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun)
+		caps |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+	/* Tunnel TSO requires both tunnel checksum and TSO support. */
+	if (priv->hw_csum_l2tun && priv->tso)
+		caps |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+			DEV_TX_OFFLOAD_GRE_TNL_TSO;
+	return caps;
+}
+
+/**
+ * DPDK callback to configure a Tx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue (rounded up to a power of 2).
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Tx queue configuration; only its offloads field is read here.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_txconf *conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_qp dv_qp;
+	struct mlx4dv_cq dv_cq;
+	struct txq_elt (*elts)[rte_align32pow2(desc)]; /* Pointer to VLA. */
+	struct ibv_qp_init_attr qp_init_attr;
+	struct txq *txq;
+	uint8_t *bounce_buf;
+	struct mlx4_malloc_vec vec[] = {
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*txq),
+			.addr = (void **)&txq,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*elts),
+			.addr = (void **)&elts,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = MLX4_MAX_WQE_SIZE,
+			.addr = (void **)&bounce_buf,
+		},
+	};
+	int ret;
+	uint64_t offloads;
+	/* Merge queue-level and port-level Tx offload requests. */
+	offloads = conf->offloads | dev->data->dev_conf.txmode.offloads;
+
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+
+	if (idx >= dev->data->nb_tx_queues) {
+		rte_errno = EOVERFLOW;
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, dev->data->nb_tx_queues);
+		return -rte_errno;
+	}
+	txq = dev->data->tx_queues[idx];
+	if (txq) {
+		rte_errno = EEXIST;
+		DEBUG("%p: Tx queue %u already configured, release it first",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	if (!desc) {
+		rte_errno = EINVAL;
+		ERROR("%p: invalid number of Tx descriptors", (void *)dev);
+		return -rte_errno;
+	}
+	if (desc != RTE_DIM(*elts)) {
+		desc = RTE_DIM(*elts); /* NOTE(review): 16-bit, can wrap? */
+		WARN("%p: increased number of descriptors in Tx queue %u"
+		     " to the next power of two (%u)",
+		     (void *)dev, idx, desc);
+	}
+	/* One zeroed allocation backs txq, elts and bounce_buf (see vec[]). */
+	mlx4_zmallocv_socket("TXQ", vec, RTE_DIM(vec), socket);
+	if (!txq) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	*txq = (struct txq){
+		.priv = priv,
+		.stats = {
+			.idx = idx,
+		},
+		.socket = socket,
+		.elts_n = desc,
+		.elts = elts,
+		.elts_head = 0,
+		.elts_tail = 0,
+		/*
+		 * Request send completion every MLX4_PMD_TX_PER_COMP_REQ
+		 * packets or at least 4 times per ring.
+		 */
+		.elts_comp_cd =
+			RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+		.elts_comp_cd_init =
+			RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+		.csum = priv->hw_csum &&
+			(offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+					   DEV_TX_OFFLOAD_UDP_CKSUM |
+					   DEV_TX_OFFLOAD_TCP_CKSUM)),
+		.csum_l2tun = priv->hw_csum_l2tun &&
+			      (offloads &
+			       DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM),
+		/* Enable Tx loopback for VF devices. */
+		.lb = !!priv->vf,
+		.bounce_buf = bounce_buf,
+	};
+	txq->cq = mlx4_glue->create_cq(priv->ctx, desc, NULL, NULL, 0);
+	if (!txq->cq) {
+		rte_errno = ENOMEM;
+		ERROR("%p: CQ creation failure: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	qp_init_attr = (struct ibv_qp_init_attr){
+		.send_cq = txq->cq,
+		.recv_cq = txq->cq,
+		.cap = {
+			.max_send_wr =
+				RTE_MIN(priv->device_attr.max_qp_wr, desc),
+			.max_send_sge = 1,
+			.max_inline_data = MLX4_PMD_MAX_INLINE,
+		},
+		.qp_type = IBV_QPT_RAW_PACKET,
+		/* No completion events must occur by default. */
+		.sq_sig_all = 0,
+	};
+	txq->qp = mlx4_glue->create_qp(priv->pd, &qp_init_attr);
+	if (!txq->qp) {
+		rte_errno = errno ? errno : EINVAL;
+		ERROR("%p: QP creation failure: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	txq->max_inline = qp_init_attr.cap.max_inline_data;
+	ret = mlx4_glue->modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_INIT,
+			.port_num = priv->port,
+		 },
+		 IBV_QP_STATE | IBV_QP_PORT);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	ret = mlx4_glue->modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTR,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	ret = mlx4_glue->modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTS,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = txq->cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.qp.in = txq->qp;
+	mlxdv.qp.out = &dv_qp;
+	ret = mlx4_glue->dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);
+	if (ret) {
+		rte_errno = EINVAL;
+		ERROR("%p: failed to obtain information needed for"
+		      " accessing the device queues", (void *)dev);
+		goto error;
+	}
+	mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
+	/* Save first wqe pointer in the first element. */
+	(&(*txq->elts)[0])->wqe =
+		(volatile struct mlx4_wqe_ctrl_seg *)txq->msq.buf;
+	if (mlx4_mr_btree_init(&txq->mr_ctrl.cache_bh,
+			       MLX4_MR_BTREE_CACHE_N, socket)) {
+		/* rte_errno is already set. */
+		goto error;
+	}
+	/* Save pointer of global generation number to check memory event. */
+	txq->mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
+	dev->data->tx_queues[idx] = txq;
+	return 0;
+error:
+	dev->data->tx_queues[idx] = NULL;
+	ret = rte_errno; /* Saved across the release call, restored below. */
+	mlx4_tx_queue_release(txq);
+	rte_errno = ret;
+	assert(rte_errno > 0);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Tx queue and the resources attached to it.
+ *
+ * @param dpdk_txq
+ *   Generic Tx queue pointer, NULL is accepted and ignored.
+ */
+void
+mlx4_tx_queue_release(void *dpdk_txq)
+{
+	struct txq *txq = dpdk_txq;
+	struct rte_eth_dev *dev;
+	unsigned int n;
+
+	if (!txq)
+		return;
+	dev = txq->priv->dev;
+	for (n = 0; n != dev->data->nb_tx_queues; ++n)
+		if (dev->data->tx_queues[n] == txq) {
+			DEBUG("%p: removing Tx queue %p from list",
+			      (void *)dev, (void *)txq);
+			dev->data->tx_queues[n] = NULL;
+			break;
+		}
+	mlx4_txq_free_elts(txq);
+	if (txq->qp)
+		claim_zero(mlx4_glue->destroy_qp(txq->qp));
+	if (txq->cq)
+		claim_zero(mlx4_glue->destroy_cq(txq->cq));
+	mlx4_mr_btree_free(&txq->mr_ctrl.cache_bh);
+	rte_free(txq);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.c b/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.c
new file mode 100644
index 00000000..a727d703
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.c
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+/**
+ * @file
+ * Utility functions used by the mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+
+#include "mlx4_utils.h"
+
+/**
+ * Switch a file descriptor to non-blocking mode (O_NONBLOCK).
+ *
+ * @param fd
+ *   File descriptor to alter.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_fd_set_non_blocking(int fd)
+{
+	int flags = fcntl(fd, F_GETFL);
+
+	if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
+		assert(errno);
+		return -(rte_errno = errno);
+	}
+	return 0;
+}
+
+/**
+ * Internal helper to allocate memory once for several disparate objects.
+ *
+ * The most restrictive alignment constraint for standard objects is assumed
+ * to be sizeof(double) and is used as a default value.
+ *
+ * C11 code would include stdalign.h and use alignof(max_align_t) however
+ * we'll stick with C99 for the time being.
+ */
+static inline size_t
+mlx4_mallocv_inline(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt, int zero, int socket)
+{
+	unsigned int i;
+	size_t size;
+	size_t least;
+	uint8_t *data = NULL;
+	int fill = !vec[0].addr; /* NULL first buffer: offsets only. */
+	/* First pass sizes the region; the "fill" pass stores addresses. */
+fill:
+	size = 0;
+	least = 0;
+	for (i = 0; i < cnt; ++i) {
+		size_t align = (uintptr_t)vec[i].align;
+
+		if (!align) {
+			align = sizeof(double);
+		} else if (!rte_is_power_of_2(align)) {
+			rte_errno = EINVAL;
+			goto error;
+		}
+		if (least < align)
+			least = align; /* Largest alignment seen so far. */
+		align = RTE_ALIGN_CEIL(size, align); /* Now the object offset. */
+		size = align + vec[i].size;
+		if (fill && vec[i].addr)
+			*vec[i].addr = data + align;
+	}
+	if (fill)
+		return size;
+	if (!zero)
+		data = rte_malloc_socket(type, size, least, socket);
+	else
+		data = rte_zmalloc_socket(type, size, least, socket);
+	if (data) {
+		fill = 1;
+		goto fill; /* Redo the walk, this time storing addresses. */
+	}
+	rte_errno = ENOMEM;
+error:
+	for (i = 0; i != cnt; ++i)
+		if (vec[i].addr)
+			*vec[i].addr = NULL;
+	return 0;
+}
+
+/**
+ * Allocate memory once for several disparate objects.
+ *
+ * This function adds iovec-like semantics (e.g. readv()) to rte_malloc().
+ * Memory is allocated once for several contiguous objects of nonuniform
+ * sizes and alignment constraints.
+ *
+ * Each entry of @p vec describes the size, alignment constraint and
+ * provides a buffer address where the resulting object pointer must be
+ * stored.
+ *
+ * The buffer of the first entry is guaranteed to point to the beginning of
+ * the allocated region and is safe to use with rte_free().
+ *
+ * NULL buffers are silently ignored.
+ *
+ * Providing a NULL buffer in the first entry prevents this function from
+ * allocating any memory but has otherwise no effect on its behavior. In
+ * this case, the contents of remaining non-NULL buffers are updated with
+ * addresses relative to zero (i.e. offsets that would have been used during
+ * the allocation).
+ *
+ * @param[in] type
+ *   A string identifying the type of allocated objects (useful for debug
+ *   purposes, such as identifying the cause of a memory leak). Can be NULL.
+ * @param[in, out] vec
+ *   Description of objects to allocate memory for.
+ * @param cnt
+ *   Number of entries in @p vec, at least 1 (vec[0] is always inspected).
+ *
+ * @return
+ *   Size in bytes of the allocated region including any padding. In case of
+ *   error, rte_errno is set, 0 is returned and NULL is stored in the
+ *   non-NULL buffers pointed by @p vec.
+ *
+ * @see struct mlx4_malloc_vec
+ * @see rte_malloc()
+ */
+size_t
+mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+	     unsigned int cnt)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 0, SOCKET_ID_ANY);
+}
+
+/**
+ * Same as mlx4_mallocv(), except memory comes from rte_zmalloc_socket().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_zmalloc()
+ */
+size_t
+mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+	      unsigned int cnt)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 1, SOCKET_ID_ANY);
+}
+
+/**
+ * Socket-aware version of mlx4_mallocv().
+ *
+ * It takes one additional parameter, forwarded to rte_malloc_socket().
+ *
+ * @param socket
+ *   NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this
+ *   function will behave the same as mlx4_mallocv().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_malloc_socket()
+ */
+size_t
+mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt, int socket)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 0, socket);
+}
+
+/**
+ * Combines the semantics of mlx4_mallocv_socket() with those of
+ * rte_zmalloc_socket().
+ *
+ * @see mlx4_mallocv_socket()
+ * @see rte_zmalloc_socket()
+ */
+size_t
+mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+		     unsigned int cnt, int socket)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 1, socket);
+}
diff --git a/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.h b/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.h
new file mode 100644
index 00000000..86abb3b7
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/mlx4_utils.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#ifndef MLX4_UTILS_H_
+#define MLX4_UTILS_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "mlx4.h"
+
+#ifndef NDEBUG
+
+/*
+ * When debugging is enabled (NDEBUG not defined), file, line and function
+ * information replace the driver name (MLX4_DRIVER_NAME) in log messages.
+ */
+
+/** Return what follows the last '/' in @p s, or @p s itself if none. */
+static inline const char *
+pmd_drv_log_basename(const char *s)
+{
+	const char *p;
+
+	for (p = s; *p != '\0'; ++p)
+		if (*p == '/')
+			s = p + 1;
+	return s;
+}
+
+#define PMD_DRV_LOG(level, ...) \
+	RTE_LOG(level, PMD, \
+		RTE_FMT("%s:%u: %s(): " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+			pmd_drv_log_basename(__FILE__), \
+			__LINE__, \
+			__func__, \
+			RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__)
+#define claim_zero(...) assert((__VA_ARGS__) == 0)
+
+#else /* NDEBUG */
+
+/*
+ * Like assert(), DEBUG() becomes a no-op when debugging is disabled.
+ * claim_zero() still evaluates its argument but no longer checks the result.
+ */
+
+#define PMD_DRV_LOG(level, ...) \
+	RTE_LOG(level, PMD, \
+		RTE_FMT(MLX4_DRIVER_NAME ": " \
+			RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+		RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) (void)0
+#define claim_zero(...) (__VA_ARGS__)
+
+#endif /* NDEBUG */
+
+#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__)
+#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__)
+#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__)
+
+/** Declare char array @p name and format into it; evaluates args twice. */
+#define MKSTR(name, ...) \
+	char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
+	\
+	snprintf(name, sizeof(name), __VA_ARGS__)
+
+/** Generate a string out of the provided arguments. */
+#define MLX4_STR(...) # __VA_ARGS__
+
+/** Similar to MLX4_STR() with enclosed macros expanded first. */
+#define MLX4_STR_EXPAND(...) MLX4_STR(__VA_ARGS__)
+
+/** Object description used with mlx4_mallocv() and similar functions. */
+struct mlx4_malloc_vec {
+	size_t align; /**< Alignment constraint (power of 2), 0 for default. */
+	size_t size; /**< Object size in bytes. */
+	void **addr; /**< Where to store the object address, may be NULL. */
+};
+
+/* mlx4_utils.c */
+
+int mlx4_fd_set_non_blocking(int fd);
+size_t mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt);
+size_t mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+		     unsigned int cnt);
+size_t mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+			   unsigned int cnt, int socket);
+size_t mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+			    unsigned int cnt, int socket);
+
+#endif /* MLX4_UTILS_H_ */
diff --git a/src/spdk/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map b/src/spdk/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map
new file mode 100644
index 00000000..ef353984
--- /dev/null
+++ b/src/spdk/dpdk/drivers/net/mlx4/rte_pmd_mlx4_version.map
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+	local: *;
+};
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/dpdk/drivers/net/mlx4
parent	Initial commit. (diff)
download	ceph-upstream.tar.xz ceph-upstream.zip