author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/dpdk/drivers/vdpa
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip

Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/dpdk/drivers/vdpa')
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/Makefile                            |   12
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/Makefile                        |   33
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c                    |  329
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h                    |  162
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h              |   52
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c                    | 1280
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/meson.build                     |    8
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map         |    3
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/meson.build                         |    9
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/Makefile                       |   56
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/meson.build                    |   37
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c                    |  626
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h                    |  355
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c              |  401
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c                 |  121
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c                |  347
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c              |  288
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h              |   20
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c              |  457
-rw-r--r--  src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map  |    3
20 files changed, 4599 insertions, 0 deletions
diff --git a/src/spdk/dpdk/drivers/vdpa/Makefile b/src/spdk/dpdk/drivers/vdpa/Makefile
new file mode 100644
index 000000000..6e8835948
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2019 Mellanox Technologies, Ltd
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+ifeq ($(CONFIG_RTE_EAL_VFIO),y)
+DIRS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifc
+endif
+
+DIRS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5
+
+include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/Makefile b/src/spdk/dpdk/drivers/vdpa/ifc/Makefile
new file mode 100644
index 000000000..b468bfdbd
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_ifc.a
+
+LDLIBS += -lpthread
+LDLIBS += -lrte_eal -lrte_pci -lrte_vhost -lrte_bus_pci
+LDLIBS += -lrte_kvargs
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+#
+# Add extra flags for base driver source files to disable warnings in them
+#
+BASE_DRIVER_OBJS=$(sort $(patsubst %.c,%.o,$(notdir $(wildcard $(SRCDIR)/base/*.c))))
+
+VPATH += $(SRCDIR)/base
+
+EXPORT_MAP := rte_pmd_ifc_version.map
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifcvf_vdpa.c
+SRCS-$(CONFIG_RTE_LIBRTE_IFC_PMD) += ifcvf.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c
new file mode 100644
index 000000000..3c0b2dff6
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.c
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include "ifcvf.h"
+#include "ifcvf_osdep.h"
+
+STATIC void *
+get_cap_addr(struct ifcvf_hw *hw, struct ifcvf_pci_cap *cap)
+{
+ u8 bar = cap->bar;
+ u32 length = cap->length;
+ u32 offset = cap->offset;
+
+ if (bar > IFCVF_PCI_MAX_RESOURCE - 1) {
+ DEBUGOUT("invalid bar: %u\n", bar);
+ return NULL;
+ }
+
+ if (offset + length < offset) {
+ DEBUGOUT("offset(%u) + length(%u) overflows\n",
+ offset, length);
+ return NULL;
+ }
+
+ if (offset + length > hw->mem_resource[cap->bar].len) {
+ DEBUGOUT("offset(%u) + length(%u) overflows bar length(%u)",
+ offset, length, (u32)hw->mem_resource[cap->bar].len);
+ return NULL;
+ }
+
+ return hw->mem_resource[bar].addr + offset;
+}
+
+int
+ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev)
+{
+ int ret;
+ u8 pos;
+ struct ifcvf_pci_cap cap;
+
+ ret = PCI_READ_CONFIG_BYTE(dev, &pos, PCI_CAPABILITY_LIST);
+ if (ret < 0) {
+ DEBUGOUT("failed to read pci capability list\n");
+ return -1;
+ }
+
+ while (pos) {
+ ret = PCI_READ_CONFIG_RANGE(dev, (u32 *)&cap,
+ sizeof(cap), pos);
+ if (ret < 0) {
+ DEBUGOUT("failed to read cap at pos: %x", pos);
+ break;
+ }
+
+ if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+ goto next;
+
+ DEBUGOUT("cfg type: %u, bar: %u, offset: %u, "
+ "len: %u\n", cap.cfg_type, cap.bar,
+ cap.offset, cap.length);
+
+ switch (cap.cfg_type) {
+ case IFCVF_PCI_CAP_COMMON_CFG:
+ hw->common_cfg = get_cap_addr(hw, &cap);
+ break;
+ case IFCVF_PCI_CAP_NOTIFY_CFG:
+ PCI_READ_CONFIG_DWORD(dev, &hw->notify_off_multiplier,
+ pos + sizeof(cap));
+ hw->notify_base = get_cap_addr(hw, &cap);
+ hw->notify_region = cap.bar;
+ break;
+ case IFCVF_PCI_CAP_ISR_CFG:
+ hw->isr = get_cap_addr(hw, &cap);
+ break;
+ case IFCVF_PCI_CAP_DEVICE_CFG:
+ hw->dev_cfg = get_cap_addr(hw, &cap);
+ break;
+ }
+next:
+ pos = cap.cap_next;
+ }
+
+ hw->lm_cfg = hw->mem_resource[4].addr;
+
+ if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+ hw->isr == NULL || hw->dev_cfg == NULL) {
+ DEBUGOUT("capability incomplete\n");
+ return -1;
+ }
+
+ DEBUGOUT("capability mapping:\ncommon cfg: %p\n"
+ "notify base: %p\nisr cfg: %p\ndevice cfg: %p\n"
+ "multiplier: %u\n",
+			hw->common_cfg, hw->notify_base,
+			hw->isr, hw->dev_cfg,
+ hw->notify_off_multiplier);
+
+ return 0;
+}
+
+STATIC u8
+ifcvf_get_status(struct ifcvf_hw *hw)
+{
+ return IFCVF_READ_REG8(&hw->common_cfg->device_status);
+}
+
+STATIC void
+ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+ IFCVF_WRITE_REG8(status, &hw->common_cfg->device_status);
+}
+
+STATIC void
+ifcvf_reset(struct ifcvf_hw *hw)
+{
+ ifcvf_set_status(hw, 0);
+
+ /* flush status write */
+ while (ifcvf_get_status(hw))
+ msec_delay(1);
+}
+
+STATIC void
+ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
+{
+ if (status != 0)
+ status |= ifcvf_get_status(hw);
+
+ ifcvf_set_status(hw, status);
+ ifcvf_get_status(hw);
+}
+
+u64
+ifcvf_get_features(struct ifcvf_hw *hw)
+{
+ u32 features_lo, features_hi;
+ struct ifcvf_pci_common_cfg *cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG32(0, &cfg->device_feature_select);
+ features_lo = IFCVF_READ_REG32(&cfg->device_feature);
+
+ IFCVF_WRITE_REG32(1, &cfg->device_feature_select);
+ features_hi = IFCVF_READ_REG32(&cfg->device_feature);
+
+ return ((u64)features_hi << 32) | features_lo;
+}
+
+STATIC void
+ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
+{
+ struct ifcvf_pci_common_cfg *cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG32(0, &cfg->guest_feature_select);
+ IFCVF_WRITE_REG32(features & ((1ULL << 32) - 1), &cfg->guest_feature);
+
+ IFCVF_WRITE_REG32(1, &cfg->guest_feature_select);
+ IFCVF_WRITE_REG32(features >> 32, &cfg->guest_feature);
+}
+
+STATIC int
+ifcvf_config_features(struct ifcvf_hw *hw)
+{
+ u64 host_features;
+
+ host_features = ifcvf_get_features(hw);
+ hw->req_features &= host_features;
+
+ ifcvf_set_features(hw, hw->req_features);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_FEATURES_OK);
+
+ if (!(ifcvf_get_status(hw) & IFCVF_CONFIG_STATUS_FEATURES_OK)) {
+ DEBUGOUT("failed to set FEATURES_OK status\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+STATIC void
+io_write64_twopart(u64 val, u32 *lo, u32 *hi)
+{
+ IFCVF_WRITE_REG32(val & ((1ULL << 32) - 1), lo);
+ IFCVF_WRITE_REG32(val >> 32, hi);
+}
+
+STATIC int
+ifcvf_hw_enable(struct ifcvf_hw *hw)
+{
+ struct ifcvf_pci_common_cfg *cfg;
+ u8 *lm_cfg;
+ u32 i;
+ u16 notify_off;
+
+ cfg = hw->common_cfg;
+ lm_cfg = hw->lm_cfg;
+
+ IFCVF_WRITE_REG16(0, &cfg->msix_config);
+ if (IFCVF_READ_REG16(&cfg->msix_config) == IFCVF_MSI_NO_VECTOR) {
+ DEBUGOUT("msix vec alloc failed for device config\n");
+ return -1;
+ }
+
+ for (i = 0; i < hw->nr_vring; i++) {
+ IFCVF_WRITE_REG16(i, &cfg->queue_select);
+ io_write64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
+ &cfg->queue_desc_hi);
+ io_write64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
+ &cfg->queue_avail_hi);
+ io_write64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
+ &cfg->queue_used_hi);
+ IFCVF_WRITE_REG16(hw->vring[i].size, &cfg->queue_size);
+
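+		/* The LM config BAR keeps one 32-bit ring-state word per
+		 * queue: last_avail_idx in the low 16 bits, last_used_idx
+		 * in the high 16 bits.
+		 */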
+ *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+ (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4) =
+ (u32)hw->vring[i].last_avail_idx |
+ ((u32)hw->vring[i].last_used_idx << 16);
+
+ IFCVF_WRITE_REG16(i + 1, &cfg->queue_msix_vector);
+ if (IFCVF_READ_REG16(&cfg->queue_msix_vector) ==
+ IFCVF_MSI_NO_VECTOR) {
+ DEBUGOUT("queue %u, msix vec alloc failed\n",
+ i);
+ return -1;
+ }
+
+ notify_off = IFCVF_READ_REG16(&cfg->queue_notify_off);
+ hw->notify_addr[i] = (void *)((u8 *)hw->notify_base +
+ notify_off * hw->notify_off_multiplier);
+ IFCVF_WRITE_REG16(1, &cfg->queue_enable);
+ }
+
+ return 0;
+}
+
+STATIC void
+ifcvf_hw_disable(struct ifcvf_hw *hw)
+{
+ u32 i;
+ struct ifcvf_pci_common_cfg *cfg;
+ u32 ring_state;
+
+ cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->msix_config);
+ for (i = 0; i < hw->nr_vring; i++) {
+ IFCVF_WRITE_REG16(i, &cfg->queue_select);
+ IFCVF_WRITE_REG16(0, &cfg->queue_enable);
+ IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->queue_msix_vector);
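+		/* Both indexes are restored from the used-index half of the
+		 * ring-state word; the device is assumed to keep the avail
+		 * index in sync with the used index once the queue is
+		 * disabled.
+		 */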
+ ring_state = *(u32 *)(hw->lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+ (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4);
+ hw->vring[i].last_avail_idx = (u16)(ring_state >> 16);
+ hw->vring[i].last_used_idx = (u16)(ring_state >> 16);
+ }
+}
+
+int
+ifcvf_start_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_reset(hw);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_ACK);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER);
+
+ if (ifcvf_config_features(hw) < 0)
+ return -1;
+
+ if (ifcvf_hw_enable(hw) < 0)
+ return -1;
+
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER_OK);
+ return 0;
+}
+
+void
+ifcvf_stop_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_hw_disable(hw);
+ ifcvf_reset(hw);
+}
+
+void
+ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size)
+{
+ u8 *lm_cfg;
+
+ lm_cfg = hw->lm_cfg;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_LOW) =
+ log_base & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_HIGH) =
+ (log_base >> 32) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_LOW) =
+ (log_base + log_size) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_HIGH) =
+ ((log_base + log_size) >> 32) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_ENABLE_VF;
+}
+
+void
+ifcvf_disable_logging(struct ifcvf_hw *hw)
+{
+ u8 *lm_cfg;
+
+ lm_cfg = hw->lm_cfg;
+ *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_DISABLE;
+}
+
+void
+ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+ IFCVF_WRITE_REG16(qid, hw->notify_addr[qid]);
+}
+
+u8
+ifcvf_get_notify_region(struct ifcvf_hw *hw)
+{
+ return hw->notify_region;
+}
+
+u64
+ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid)
+{
+ return (u8 *)hw->notify_addr[qid] -
+ (u8 *)hw->mem_resource[hw->notify_region].addr;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h
new file mode 100644
index 000000000..eb04a9406
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include "ifcvf_osdep.h"
+
+#define IFCVF_VENDOR_ID 0x1AF4
+#define IFCVF_DEVICE_ID 0x1041
+#define IFCVF_SUBSYS_VENDOR_ID 0x8086
+#define IFCVF_SUBSYS_DEVICE_ID 0x001A
+
+#define IFCVF_MAX_QUEUES 1
+#define VIRTIO_F_IOMMU_PLATFORM 33
+
+/* Common configuration */
+#define IFCVF_PCI_CAP_COMMON_CFG 1
+/* Notifications */
+#define IFCVF_PCI_CAP_NOTIFY_CFG 2
+/* ISR Status */
+#define IFCVF_PCI_CAP_ISR_CFG 3
+/* Device specific configuration */
+#define IFCVF_PCI_CAP_DEVICE_CFG 4
+/* PCI configuration access */
+#define IFCVF_PCI_CAP_PCI_CFG 5
+
+#define IFCVF_CONFIG_STATUS_RESET 0x00
+#define IFCVF_CONFIG_STATUS_ACK 0x01
+#define IFCVF_CONFIG_STATUS_DRIVER 0x02
+#define IFCVF_CONFIG_STATUS_DRIVER_OK 0x04
+#define IFCVF_CONFIG_STATUS_FEATURES_OK 0x08
+#define IFCVF_CONFIG_STATUS_FAILED 0x80
+
+#define IFCVF_MSI_NO_VECTOR 0xffff
+#define IFCVF_PCI_MAX_RESOURCE 6
+
+#define IFCVF_LM_CFG_SIZE 0x40
+#define IFCVF_LM_RING_STATE_OFFSET 0x20
+
+#define IFCVF_LM_LOGGING_CTRL 0x0
+
+#define IFCVF_LM_BASE_ADDR_LOW 0x10
+#define IFCVF_LM_BASE_ADDR_HIGH 0x14
+#define IFCVF_LM_END_ADDR_LOW 0x18
+#define IFCVF_LM_END_ADDR_HIGH 0x1c
+
+#define IFCVF_LM_DISABLE 0x0
+#define IFCVF_LM_ENABLE_VF 0x1
+#define IFCVF_LM_ENABLE_PF 0x3
+#define IFCVF_LOG_BASE 0x100000000000
+#define IFCVF_MEDIATED_VRING 0x200000000000
+
+#define IFCVF_32_BIT_MASK 0xffffffff
+
+
+struct ifcvf_pci_cap {
+ u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
+ u8 cap_next; /* Generic PCI field: next ptr. */
+ u8 cap_len; /* Generic PCI field: capability length */
+ u8 cfg_type; /* Identifies the structure. */
+ u8 bar; /* Where to find it. */
+ u8 padding[3]; /* Pad to full dword. */
+ u32 offset; /* Offset within bar. */
+ u32 length; /* Length of the structure, in bytes. */
+};
+
+struct ifcvf_pci_notify_cap {
+ struct ifcvf_pci_cap cap;
+ u32 notify_off_multiplier; /* Multiplier for queue_notify_off. */
+};
+
+struct ifcvf_pci_common_cfg {
+ /* About the whole device. */
+ u32 device_feature_select;
+ u32 device_feature;
+ u32 guest_feature_select;
+ u32 guest_feature;
+ u16 msix_config;
+ u16 num_queues;
+ u8 device_status;
+ u8 config_generation;
+
+ /* About a specific virtqueue. */
+ u16 queue_select;
+ u16 queue_size;
+ u16 queue_msix_vector;
+ u16 queue_enable;
+ u16 queue_notify_off;
+ u32 queue_desc_lo;
+ u32 queue_desc_hi;
+ u32 queue_avail_lo;
+ u32 queue_avail_hi;
+ u32 queue_used_lo;
+ u32 queue_used_hi;
+};
+
+struct ifcvf_net_config {
+ u8 mac[6];
+ u16 status;
+ u16 max_virtqueue_pairs;
+} __rte_packed;
+
+struct ifcvf_pci_mem_resource {
+ u64 phys_addr; /**< Physical address, 0 if not resource. */
+ u64 len; /**< Length of the resource. */
+ u8 *addr; /**< Virtual address, NULL when not mapped. */
+};
+
+struct vring_info {
+ u64 desc;
+ u64 avail;
+ u64 used;
+ u16 size;
+ u16 last_avail_idx;
+ u16 last_used_idx;
+};
+
+struct ifcvf_hw {
+ u64 req_features;
+ u8 notify_region;
+ u32 notify_off_multiplier;
+ struct ifcvf_pci_common_cfg *common_cfg;
+ struct ifcvf_net_config *dev_cfg;
+ u8 *isr;
+ u16 *notify_base;
+ u16 *notify_addr[IFCVF_MAX_QUEUES * 2];
+ u8 *lm_cfg;
+ struct vring_info vring[IFCVF_MAX_QUEUES * 2];
+ u8 nr_vring;
+ struct ifcvf_pci_mem_resource mem_resource[IFCVF_PCI_MAX_RESOURCE];
+};
+
+int
+ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev);
+
+u64
+ifcvf_get_features(struct ifcvf_hw *hw);
+
+int
+ifcvf_start_hw(struct ifcvf_hw *hw);
+
+void
+ifcvf_stop_hw(struct ifcvf_hw *hw);
+
+void
+ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size);
+
+void
+ifcvf_disable_logging(struct ifcvf_hw *hw);
+
+void
+ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+
+u8
+ifcvf_get_notify_region(struct ifcvf_hw *hw);
+
+u64
+ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
+
+#endif /* _IFCVF_H_ */
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h
new file mode 100644
index 000000000..6aef25ea4
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/base/ifcvf_osdep.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _IFCVF_OSDEP_H_
+#define _IFCVF_OSDEP_H_
+
+#include <stdint.h>
+#include <linux/pci_regs.h>
+
+#include <rte_cycles.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_log.h>
+#include <rte_io.h>
+
+#define DEBUGOUT(S, args...) RTE_LOG(DEBUG, PMD, S, ##args)
+#define STATIC static
+
+#define msec_delay(x) rte_delay_us_sleep(1000 * (x))
+
+#define IFCVF_READ_REG8(reg) rte_read8(reg)
+#define IFCVF_WRITE_REG8(val, reg) rte_write8((val), (reg))
+#define IFCVF_READ_REG16(reg) rte_read16(reg)
+#define IFCVF_WRITE_REG16(val, reg) rte_write16((val), (reg))
+#define IFCVF_READ_REG32(reg) rte_read32(reg)
+#define IFCVF_WRITE_REG32(val, reg) rte_write32((val), (reg))
+
+typedef struct rte_pci_device PCI_DEV;
+
+#define PCI_READ_CONFIG_BYTE(dev, val, where) \
+ rte_pci_read_config(dev, val, 1, where)
+
+#define PCI_READ_CONFIG_DWORD(dev, val, where) \
+ rte_pci_read_config(dev, val, 4, where)
+
+typedef uint8_t u8;
+typedef int8_t s8;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef int64_t s64;
+typedef uint64_t u64;
+
+static inline int
+PCI_READ_CONFIG_RANGE(PCI_DEV *dev, uint32_t *val, int size, int where)
+{
+ return rte_pci_read_config(dev, val, size, where);
+}
+
+#endif /* _IFCVF_OSDEP_H_ */
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c b/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c
new file mode 100644
index 000000000..ec97178dc
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -0,0 +1,1280 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <pthread.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <linux/virtio_net.h>
+#include <stdbool.h>
+
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_bus_pci.h>
+#include <rte_vhost.h>
+#include <rte_vdpa.h>
+#include <rte_vfio.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+#include <rte_kvargs.h>
+#include <rte_devargs.h>
+
+#include "base/ifcvf.h"
+
+#define DRV_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
+ "IFCVF %s(): " fmt "\n", __func__, ##args)
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
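+/* Used ring footprint: one vring_used_elem per descriptor plus the
+ * flags, idx and trailing event fields (three 16-bit words).
+ */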
+#define IFCVF_USED_RING_LEN(size) \
+ ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
+
+#define IFCVF_VDPA_MODE "vdpa"
+#define IFCVF_SW_FALLBACK_LM "sw-live-migration"
+
+static const char * const ifcvf_valid_arguments[] = {
+ IFCVF_VDPA_MODE,
+ IFCVF_SW_FALLBACK_LM,
+ NULL
+};
+
+static int ifcvf_vdpa_logtype;
+
+struct ifcvf_internal {
+ struct rte_vdpa_dev_addr dev_addr;
+ struct rte_pci_device *pdev;
+ struct ifcvf_hw hw;
+ int vfio_container_fd;
+ int vfio_group_fd;
+ int vfio_dev_fd;
+ pthread_t tid; /* thread for notify relay */
+ int epfd;
+ int vid;
+ int did;
+ uint16_t max_queues;
+ uint64_t features;
+ rte_atomic32_t started;
+ rte_atomic32_t dev_attached;
+ rte_atomic32_t running;
+ rte_spinlock_t lock;
+ bool sw_lm;
+ bool sw_fallback_running;
+ /* mediated vring for sw fallback */
+ struct vring m_vring[IFCVF_MAX_QUEUES * 2];
+ /* eventfd for used ring interrupt */
+ int intr_fd[IFCVF_MAX_QUEUES * 2];
+};
+
+struct internal_list {
+ TAILQ_ENTRY(internal_list) next;
+ struct ifcvf_internal *internal;
+};
+
+TAILQ_HEAD(internal_list_head, internal_list);
+static struct internal_list_head internal_list =
+ TAILQ_HEAD_INITIALIZER(internal_list);
+
+static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
+
+static struct internal_list *
+find_internal_resource_by_did(int did)
+{
+ int found = 0;
+ struct internal_list *list;
+
+ pthread_mutex_lock(&internal_list_lock);
+
+ TAILQ_FOREACH(list, &internal_list, next) {
+ if (did == list->internal->did) {
+ found = 1;
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&internal_list_lock);
+
+ if (!found)
+ return NULL;
+
+ return list;
+}
+
+static struct internal_list *
+find_internal_resource_by_dev(struct rte_pci_device *pdev)
+{
+ int found = 0;
+ struct internal_list *list;
+
+ pthread_mutex_lock(&internal_list_lock);
+
+ TAILQ_FOREACH(list, &internal_list, next) {
+ if (pdev == list->internal->pdev) {
+ found = 1;
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&internal_list_lock);
+
+ if (!found)
+ return NULL;
+
+ return list;
+}
+
+static int
+ifcvf_vfio_setup(struct ifcvf_internal *internal)
+{
+ struct rte_pci_device *dev = internal->pdev;
+ char devname[RTE_DEV_NAME_MAX_LEN] = {0};
+ int iommu_group_num;
+ int i, ret;
+
+ internal->vfio_dev_fd = -1;
+ internal->vfio_group_fd = -1;
+ internal->vfio_container_fd = -1;
+
+ rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
+ ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
+ &iommu_group_num);
+ if (ret <= 0) {
+ DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
+ return -1;
+ }
+
+ internal->vfio_container_fd = rte_vfio_container_create();
+ if (internal->vfio_container_fd < 0)
+ return -1;
+
+ internal->vfio_group_fd = rte_vfio_container_group_bind(
+ internal->vfio_container_fd, iommu_group_num);
+ if (internal->vfio_group_fd < 0)
+ goto err;
+
+ if (rte_pci_map_device(dev))
+ goto err;
+
+ internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
+
+ for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
+ i++) {
+ internal->hw.mem_resource[i].addr =
+ internal->pdev->mem_resource[i].addr;
+ internal->hw.mem_resource[i].phys_addr =
+ internal->pdev->mem_resource[i].phys_addr;
+ internal->hw.mem_resource[i].len =
+ internal->pdev->mem_resource[i].len;
+ }
+
+ return 0;
+
+err:
+ rte_vfio_container_destroy(internal->vfio_container_fd);
+ return -1;
+}
+
+static int
+ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
+{
+ uint32_t i;
+ int ret;
+ struct rte_vhost_memory *mem = NULL;
+ int vfio_container_fd;
+
+ ret = rte_vhost_get_mem_table(internal->vid, &mem);
+ if (ret < 0) {
+ DRV_LOG(ERR, "failed to get VM memory layout.");
+ goto exit;
+ }
+
+ vfio_container_fd = internal->vfio_container_fd;
+
+ for (i = 0; i < mem->nregions; i++) {
+ struct rte_vhost_mem_region *reg;
+
+ reg = &mem->regions[i];
+ DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
+ "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
+ do_map ? "DMA map" : "DMA unmap", i,
+ reg->host_user_addr, reg->guest_phys_addr, reg->size);
+
+ if (do_map) {
+ ret = rte_vfio_container_dma_map(vfio_container_fd,
+ reg->host_user_addr, reg->guest_phys_addr,
+ reg->size);
+ if (ret < 0) {
+ DRV_LOG(ERR, "DMA map failed.");
+ goto exit;
+ }
+ } else {
+ ret = rte_vfio_container_dma_unmap(vfio_container_fd,
+ reg->host_user_addr, reg->guest_phys_addr,
+ reg->size);
+ if (ret < 0) {
+ DRV_LOG(ERR, "DMA unmap failed.");
+ goto exit;
+ }
+ }
+ }
+
+exit:
+ if (mem)
+ free(mem);
+ return ret;
+}
+
+static uint64_t
+hva_to_gpa(int vid, uint64_t hva)
+{
+ struct rte_vhost_memory *mem = NULL;
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+ uint64_t gpa = 0;
+
+ if (rte_vhost_get_mem_table(vid, &mem) < 0)
+ goto exit;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+
+ if (hva >= reg->host_user_addr &&
+ hva < reg->host_user_addr + reg->size) {
+ gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
+ break;
+ }
+ }
+
+exit:
+ if (mem)
+ free(mem);
+ return gpa;
+}
+
+static int
+vdpa_ifcvf_start(struct ifcvf_internal *internal)
+{
+ struct ifcvf_hw *hw = &internal->hw;
+ int i, nr_vring;
+ int vid;
+ struct rte_vhost_vring vq;
+ uint64_t gpa;
+
+ vid = internal->vid;
+ nr_vring = rte_vhost_get_vring_num(vid);
+ rte_vhost_get_negotiated_features(vid, &hw->req_features);
+
+ for (i = 0; i < nr_vring; i++) {
+ rte_vhost_get_vhost_vring(vid, i, &vq);
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
+ if (gpa == 0) {
+			DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
+ return -1;
+ }
+ hw->vring[i].desc = gpa;
+
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
+ if (gpa == 0) {
+			DRV_LOG(ERR, "Failed to get GPA for available ring.");
+ return -1;
+ }
+ hw->vring[i].avail = gpa;
+
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
+ if (gpa == 0) {
+			DRV_LOG(ERR, "Failed to get GPA for used ring.");
+ return -1;
+ }
+ hw->vring[i].used = gpa;
+
+ hw->vring[i].size = vq.size;
+ rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
+ &hw->vring[i].last_used_idx);
+ }
+ hw->nr_vring = i;
+
+ return ifcvf_start_hw(&internal->hw);
+}
+
+static void
+vdpa_ifcvf_stop(struct ifcvf_internal *internal)
+{
+ struct ifcvf_hw *hw = &internal->hw;
+ uint32_t i;
+ int vid;
+ uint64_t features = 0;
+ uint64_t log_base = 0, log_size = 0;
+ uint64_t len;
+
+ vid = internal->vid;
+ ifcvf_stop_hw(hw);
+
+ for (i = 0; i < hw->nr_vring; i++)
+ rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+ hw->vring[i].last_used_idx);
+
+ if (internal->sw_lm)
+ return;
+
+ rte_vhost_get_negotiated_features(vid, &features);
+ if (RTE_VHOST_NEED_LOG(features)) {
+ ifcvf_disable_logging(hw);
+ rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
+ rte_vfio_container_dma_unmap(internal->vfio_container_fd,
+ log_base, IFCVF_LOG_BASE, log_size);
+ /*
+		 * IFCVF marks dirty memory pages only for the packet buffers;
+		 * software marks the used rings as dirty after the device stops.
+ */
+ for (i = 0; i < hw->nr_vring; i++) {
+ len = IFCVF_USED_RING_LEN(hw->vring[i].size);
+ rte_vhost_log_used_vring(vid, i, 0, len);
+ }
+ }
+}
+
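+/* VFIO irq_set buffer sized for one eventfd per vring plus one extra
+ * slot for the device config/misc interrupt.
+ */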
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+ sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
+static int
+vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
+{
+ int ret;
+ uint32_t i, nr_vring;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+ struct rte_vhost_vring vring;
+ int fd;
+
+ vring.callfd = -1;
+
+ nr_vring = rte_vhost_get_vring_num(internal->vid);
+
+ irq_set = (struct vfio_irq_set *)irq_set_buf;
+ irq_set->argsz = sizeof(irq_set_buf);
+ irq_set->count = nr_vring + 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *)&irq_set->data;
+ fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
+
+ for (i = 0; i < nr_vring; i++)
+ internal->intr_fd[i] = -1;
+
+ for (i = 0; i < nr_vring; i++) {
+ rte_vhost_get_vhost_vring(internal->vid, i, &vring);
+ fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
+ if ((i & 1) == 0 && m_rx == true) {
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+ DRV_LOG(ERR, "can't setup eventfd: %s",
+ strerror(errno));
+ return -1;
+ }
+ internal->intr_fd[i] = fd;
+ fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
+ }
+ }
+
+ ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ if (ret) {
+ DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
+{
+ int ret;
+ uint32_t i, nr_vring;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+
+ irq_set = (struct vfio_irq_set *)irq_set_buf;
+ irq_set->argsz = sizeof(irq_set_buf);
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ nr_vring = rte_vhost_get_vring_num(internal->vid);
+ for (i = 0; i < nr_vring; i++) {
+ if (internal->intr_fd[i] >= 0)
+ close(internal->intr_fd[i]);
+ internal->intr_fd[i] = -1;
+ }
+
+ ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ if (ret) {
+ DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static void *
+notify_relay(void *arg)
+{
+ int i, kickfd, epfd, nfds = 0;
+ uint32_t qid, q_num;
+ struct epoll_event events[IFCVF_MAX_QUEUES * 2];
+ struct epoll_event ev;
+ uint64_t buf;
+ int nbytes;
+ struct rte_vhost_vring vring;
+ struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
+ struct ifcvf_hw *hw = &internal->hw;
+
+ q_num = rte_vhost_get_vring_num(internal->vid);
+
+ epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
+ if (epfd < 0) {
+ DRV_LOG(ERR, "failed to create epoll instance.");
+ return NULL;
+ }
+ internal->epfd = epfd;
+
+ vring.kickfd = -1;
+ for (qid = 0; qid < q_num; qid++) {
+ ev.events = EPOLLIN | EPOLLPRI;
+ rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
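+		/* Encode the queue id in the low 32 bits and the kick eventfd
+		 * in the high 32 bits of the epoll user data.
+		 */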
+ ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+ DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+ return NULL;
+ }
+ }
+
+ for (;;) {
+ nfds = epoll_wait(epfd, events, q_num, -1);
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+			DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
+ return NULL;
+ }
+
+ for (i = 0; i < nfds; i++) {
+ qid = events[i].data.u32;
+ kickfd = (uint32_t)(events[i].data.u64 >> 32);
+ do {
+ nbytes = read(kickfd, &buf, 8);
+ if (nbytes < 0) {
+ if (errno == EINTR ||
+ errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ DRV_LOG(INFO, "Error reading "
+ "kickfd: %s",
+ strerror(errno));
+ }
+ break;
+ } while (1);
+
+ ifcvf_notify_queue(hw, qid);
+ }
+ }
+
+ return NULL;
+}
+
+static int
+setup_notify_relay(struct ifcvf_internal *internal)
+{
+ int ret;
+
+ ret = pthread_create(&internal->tid, NULL, notify_relay,
+ (void *)internal);
+ if (ret) {
+ DRV_LOG(ERR, "failed to create notify relay pthread.");
+ return -1;
+ }
+ return 0;
+}
+
+static int
+unset_notify_relay(struct ifcvf_internal *internal)
+{
+ void *status;
+
+ if (internal->tid) {
+ pthread_cancel(internal->tid);
+ pthread_join(internal->tid, &status);
+ }
+ internal->tid = 0;
+
+ if (internal->epfd >= 0)
+ close(internal->epfd);
+ internal->epfd = -1;
+
+ return 0;
+}
+
+static int
+update_datapath(struct ifcvf_internal *internal)
+{
+ int ret;
+
+ rte_spinlock_lock(&internal->lock);
+
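+	/* Bring the datapath up only when the device is started and a vhost
+	 * device is attached; tear it down once either condition goes away.
+	 */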
+ if (!rte_atomic32_read(&internal->running) &&
+ (rte_atomic32_read(&internal->started) &&
+ rte_atomic32_read(&internal->dev_attached))) {
+ ret = ifcvf_dma_map(internal, 1);
+ if (ret)
+ goto err;
+
+ ret = vdpa_enable_vfio_intr(internal, 0);
+ if (ret)
+ goto err;
+
+ ret = vdpa_ifcvf_start(internal);
+ if (ret)
+ goto err;
+
+ ret = setup_notify_relay(internal);
+ if (ret)
+ goto err;
+
+ rte_atomic32_set(&internal->running, 1);
+ } else if (rte_atomic32_read(&internal->running) &&
+ (!rte_atomic32_read(&internal->started) ||
+ !rte_atomic32_read(&internal->dev_attached))) {
+ ret = unset_notify_relay(internal);
+ if (ret)
+ goto err;
+
+ vdpa_ifcvf_stop(internal);
+
+ ret = vdpa_disable_vfio_intr(internal);
+ if (ret)
+ goto err;
+
+ ret = ifcvf_dma_map(internal, 0);
+ if (ret)
+ goto err;
+
+ rte_atomic32_set(&internal->running, 0);
+ }
+
+ rte_spinlock_unlock(&internal->lock);
+ return 0;
+err:
+ rte_spinlock_unlock(&internal->lock);
+ return ret;
+}
+
+static int
+m_ifcvf_start(struct ifcvf_internal *internal)
+{
+ struct ifcvf_hw *hw = &internal->hw;
+ uint32_t i, nr_vring;
+ int vid, ret;
+ struct rte_vhost_vring vq;
+ void *vring_buf;
+ uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
+ uint64_t size;
+ uint64_t gpa;
+
+ memset(&vq, 0, sizeof(vq));
+ vid = internal->vid;
+ nr_vring = rte_vhost_get_vring_num(vid);
+ rte_vhost_get_negotiated_features(vid, &hw->req_features);
+
+ for (i = 0; i < nr_vring; i++) {
+ rte_vhost_get_vhost_vring(vid, i, &vq);
+
+ size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
+ PAGE_SIZE);
+ vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
+ vring_init(&internal->m_vring[i], vq.size, vring_buf,
+ PAGE_SIZE);
+
+ ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
+ (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
+ if (ret < 0) {
+ DRV_LOG(ERR, "mediated vring DMA map failed.");
+ goto error;
+ }
+
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
+ if (gpa == 0) {
+			DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
+ return -1;
+ }
+ hw->vring[i].desc = gpa;
+
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
+ if (gpa == 0) {
+			DRV_LOG(ERR, "Failed to get GPA for available ring.");
+ return -1;
+ }
+ hw->vring[i].avail = gpa;
+
+ /* Direct I/O for Tx queue, relay for Rx queue */
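+		/* Odd ring index = Tx: the device writes the guest's used
+		 * ring directly. Even ring index = Rx: the device writes the
+		 * mediated used ring so the relay thread can copy entries
+		 * back to the guest and log dirty pages.
+		 */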
+ if (i & 1) {
+ gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
+ if (gpa == 0) {
+				DRV_LOG(ERR, "Failed to get GPA for used ring.");
+ return -1;
+ }
+ hw->vring[i].used = gpa;
+ } else {
+ hw->vring[i].used = m_vring_iova +
+ (char *)internal->m_vring[i].used -
+ (char *)internal->m_vring[i].desc;
+ }
+
+ hw->vring[i].size = vq.size;
+
+ rte_vhost_get_vring_base(vid, i,
+ &internal->m_vring[i].avail->idx,
+ &internal->m_vring[i].used->idx);
+
+ rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
+ &hw->vring[i].last_used_idx);
+
+ m_vring_iova += size;
+ }
+ hw->nr_vring = nr_vring;
+
+ return ifcvf_start_hw(&internal->hw);
+
+error:
+ for (i = 0; i < nr_vring; i++)
+ if (internal->m_vring[i].desc)
+ rte_free(internal->m_vring[i].desc);
+
+ return -1;
+}
+
+static int
+m_ifcvf_stop(struct ifcvf_internal *internal)
+{
+ int vid;
+ uint32_t i;
+ struct rte_vhost_vring vq;
+ struct ifcvf_hw *hw = &internal->hw;
+ uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
+ uint64_t size, len;
+
+ vid = internal->vid;
+ ifcvf_stop_hw(hw);
+
+ for (i = 0; i < hw->nr_vring; i++) {
+ /* synchronize remaining new used entries if any */
+ if ((i & 1) == 0)
+ update_used_ring(internal, i);
+
+ rte_vhost_get_vhost_vring(vid, i, &vq);
+ len = IFCVF_USED_RING_LEN(vq.size);
+ rte_vhost_log_used_vring(vid, i, 0, len);
+
+ size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
+ PAGE_SIZE);
+ rte_vfio_container_dma_unmap(internal->vfio_container_fd,
+ (uint64_t)(uintptr_t)internal->m_vring[i].desc,
+ m_vring_iova, size);
+
+ rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+ hw->vring[i].last_used_idx);
+ rte_free(internal->m_vring[i].desc);
+ m_vring_iova += size;
+ }
+
+ return 0;
+}
+
+static void
+update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
+{
+ rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
+ rte_vhost_vring_call(internal->vid, qid);
+}
+
+static void *
+vring_relay(void *arg)
+{
+ int i, vid, epfd, fd, nfds;
+ struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
+ struct rte_vhost_vring vring;
+ uint16_t qid, q_num;
+ struct epoll_event events[IFCVF_MAX_QUEUES * 4];
+ struct epoll_event ev;
+ int nbytes;
+ uint64_t buf;
+
+ vid = internal->vid;
+ q_num = rte_vhost_get_vring_num(vid);
+
+ /* add notify fd and interrupt fd to epoll */
+ epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
+ if (epfd < 0) {
+ DRV_LOG(ERR, "failed to create epoll instance.");
+ return NULL;
+ }
+ internal->epfd = epfd;
+
+ vring.kickfd = -1;
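+	/* epoll user data layout: bit 0 marks an interrupt eventfd (vs. a
+	 * guest kick fd), the remaining low bits carry qid << 1, and the
+	 * high 32 bits carry the fd to drain.
+	 */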
+ for (qid = 0; qid < q_num; qid++) {
+ ev.events = EPOLLIN | EPOLLPRI;
+ rte_vhost_get_vhost_vring(vid, qid, &vring);
+ ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+ DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+ return NULL;
+ }
+ }
+
+ for (qid = 0; qid < q_num; qid += 2) {
+ ev.events = EPOLLIN | EPOLLPRI;
+		/* set bit 0 as a flag to mark this entry as an interrupt fd */
+ ev.data.u64 = 1 | qid << 1 |
+ (uint64_t)internal->intr_fd[qid] << 32;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
+ < 0) {
+ DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+ return NULL;
+ }
+ update_used_ring(internal, qid);
+ }
+
+ /* start relay with a first kick */
+ for (qid = 0; qid < q_num; qid++)
+ ifcvf_notify_queue(&internal->hw, qid);
+
+ /* listen to the events and react accordingly */
+ for (;;) {
+ nfds = epoll_wait(epfd, events, q_num * 2, -1);
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+			DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
+ return NULL;
+ }
+
+ for (i = 0; i < nfds; i++) {
+ fd = (uint32_t)(events[i].data.u64 >> 32);
+ do {
+ nbytes = read(fd, &buf, 8);
+ if (nbytes < 0) {
+ if (errno == EINTR ||
+ errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ DRV_LOG(INFO, "Error reading "
+ "kickfd: %s",
+ strerror(errno));
+ }
+ break;
+ } while (1);
+
+ qid = events[i].data.u32 >> 1;
+
+ if (events[i].data.u32 & 1)
+ update_used_ring(internal, qid);
+ else
+ ifcvf_notify_queue(&internal->hw, qid);
+ }
+ }
+
+ return NULL;
+}
+
+static int
+setup_vring_relay(struct ifcvf_internal *internal)
+{
+ int ret;
+
+ ret = pthread_create(&internal->tid, NULL, vring_relay,
+ (void *)internal);
+ if (ret) {
+ DRV_LOG(ERR, "failed to create ring relay pthread.");
+ return -1;
+ }
+ return 0;
+}
+
+static int
+unset_vring_relay(struct ifcvf_internal *internal)
+{
+ void *status;
+
+ if (internal->tid) {
+ pthread_cancel(internal->tid);
+ pthread_join(internal->tid, &status);
+ }
+ internal->tid = 0;
+
+ if (internal->epfd >= 0)
+ close(internal->epfd);
+ internal->epfd = -1;
+
+ return 0;
+}
+
+static int
+ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
+{
+ int ret;
+ int vid = internal->vid;
+
+ /* stop the direct IO data path */
+ unset_notify_relay(internal);
+ vdpa_ifcvf_stop(internal);
+ vdpa_disable_vfio_intr(internal);
+
+ ret = rte_vhost_host_notifier_ctrl(vid, false);
+ if (ret && ret != -ENOTSUP)
+ goto error;
+
+ /* set up interrupt for interrupt relay */
+ ret = vdpa_enable_vfio_intr(internal, 1);
+ if (ret)
+ goto unmap;
+
+ /* config the VF */
+ ret = m_ifcvf_start(internal);
+ if (ret)
+ goto unset_intr;
+
+ /* set up vring relay thread */
+ ret = setup_vring_relay(internal);
+ if (ret)
+ goto stop_vf;
+
+ rte_vhost_host_notifier_ctrl(vid, true);
+
+ internal->sw_fallback_running = true;
+
+ return 0;
+
+stop_vf:
+ m_ifcvf_stop(internal);
+unset_intr:
+ vdpa_disable_vfio_intr(internal);
+unmap:
+ ifcvf_dma_map(internal, 0);
+error:
+ return -1;
+}
+
+static int
+ifcvf_dev_config(int vid)
+{
+ int did;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ internal = list->internal;
+ internal->vid = vid;
+ rte_atomic32_set(&internal->dev_attached, 1);
+ update_datapath(internal);
+
+ if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
+ DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);
+
+ return 0;
+}
+
+static int
+ifcvf_dev_close(int vid)
+{
+ int did;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ internal = list->internal;
+
+ if (internal->sw_fallback_running) {
+ /* unset ring relay */
+ unset_vring_relay(internal);
+
+ /* reset VF */
+ m_ifcvf_stop(internal);
+
+ /* remove interrupt setting */
+ vdpa_disable_vfio_intr(internal);
+
+ /* unset DMA map for guest memory */
+ ifcvf_dma_map(internal, 0);
+
+ internal->sw_fallback_running = false;
+ } else {
+ rte_atomic32_set(&internal->dev_attached, 0);
+ update_datapath(internal);
+ }
+
+ return 0;
+}
+
+static int
+ifcvf_set_features(int vid)
+{
+ uint64_t features = 0;
+ int did;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+ uint64_t log_base = 0, log_size = 0;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ internal = list->internal;
+ rte_vhost_get_negotiated_features(vid, &features);
+
+ if (!RTE_VHOST_NEED_LOG(features))
+ return 0;
+
+ if (internal->sw_lm) {
+ ifcvf_sw_fallback_switchover(internal);
+ } else {
+ rte_vhost_get_log_base(vid, &log_base, &log_size);
+ rte_vfio_container_dma_map(internal->vfio_container_fd,
+ log_base, IFCVF_LOG_BASE, log_size);
+ ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
+ }
+
+ return 0;
+}
+
+static int
+ifcvf_get_vfio_group_fd(int vid)
+{
+ int did;
+ struct internal_list *list;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ return list->internal->vfio_group_fd;
+}
+
+static int
+ifcvf_get_vfio_device_fd(int vid)
+{
+ int did;
+ struct internal_list *list;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ return list->internal->vfio_dev_fd;
+}
+
+static int
+ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
+{
+ int did;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+ struct vfio_region_info reg = { .argsz = sizeof(reg) };
+ int ret;
+
+ did = rte_vhost_get_vdpa_device_id(vid);
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ internal = list->internal;
+
+ reg.index = ifcvf_get_notify_region(&internal->hw);
+ ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+ if (ret) {
+		DRV_LOG(ERR, "Failed to get device region info: %s",
+ strerror(errno));
+ return -1;
+ }
+
+ *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
+ *size = 0x1000;
+
+ return 0;
+}
+
+static int
+ifcvf_get_queue_num(int did, uint32_t *queue_num)
+{
+ struct internal_list *list;
+
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ *queue_num = list->internal->max_queues;
+
+ return 0;
+}
+
+static int
+ifcvf_get_vdpa_features(int did, uint64_t *features)
+{
+ struct internal_list *list;
+
+ list = find_internal_resource_by_did(did);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d", did);
+ return -1;
+ }
+
+ *features = list->internal->features;
+
+ return 0;
+}
+
+#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
+ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
+ 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
+ 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
+ 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
+ 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
+static int
+ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
+{
+ *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
+ return 0;
+}
+
+static struct rte_vdpa_dev_ops ifcvf_ops = {
+ .get_queue_num = ifcvf_get_queue_num,
+ .get_features = ifcvf_get_vdpa_features,
+ .get_protocol_features = ifcvf_get_protocol_features,
+ .dev_conf = ifcvf_dev_config,
+ .dev_close = ifcvf_dev_close,
+ .set_vring_state = NULL,
+ .set_features = ifcvf_set_features,
+ .migration_done = NULL,
+ .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
+ .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
+ .get_notify_area = ifcvf_get_notify_area,
+};
+
+static inline int
+open_int(const char *key __rte_unused, const char *value, void *extra_args)
+{
+ uint16_t *n = extra_args;
+
+ if (value == NULL || extra_args == NULL)
+ return -EINVAL;
+
+ *n = (uint16_t)strtoul(value, NULL, 0);
+ if (*n == USHRT_MAX && errno == ERANGE)
+ return -1;
+
+ return 0;
+}
+
+static int
+ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ uint64_t features;
+ struct ifcvf_internal *internal = NULL;
+ struct internal_list *list = NULL;
+ int vdpa_mode = 0;
+ int sw_fallback_lm = 0;
+ struct rte_kvargs *kvlist = NULL;
+ int ret = 0;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return 0;
+
+ if (!pci_dev->device.devargs)
+ return 1;
+
+ kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
+ ifcvf_valid_arguments);
+ if (kvlist == NULL)
+ return 1;
+
+ /* probe only when vdpa mode is specified */
+ if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
+ rte_kvargs_free(kvlist);
+ return 1;
+ }
+
+ ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
+ &vdpa_mode);
+ if (ret < 0 || vdpa_mode == 0) {
+ rte_kvargs_free(kvlist);
+ return 1;
+ }
+
+ list = rte_zmalloc("ifcvf", sizeof(*list), 0);
+ if (list == NULL)
+ goto error;
+
+ internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
+ if (internal == NULL)
+ goto error;
+
+ internal->pdev = pci_dev;
+ rte_spinlock_init(&internal->lock);
+
+ if (ifcvf_vfio_setup(internal) < 0) {
+ DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
+ goto error;
+ }
+
+ if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
+ DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
+ goto error;
+ }
+
+ internal->max_queues = IFCVF_MAX_QUEUES;
+ features = ifcvf_get_features(&internal->hw);
+ internal->features = (features &
+ ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) |
+ (1ULL << VIRTIO_NET_F_STATUS) |
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
+ (1ULL << VHOST_F_LOG_ALL);
+
+ internal->dev_addr.pci_addr = pci_dev->addr;
+ internal->dev_addr.type = VDPA_ADDR_PCI;
+ list->internal = internal;
+
+ if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
+ ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
+ &open_int, &sw_fallback_lm);
+ if (ret < 0)
+ goto error;
+ }
+ internal->sw_lm = sw_fallback_lm;
+
+ internal->did = rte_vdpa_register_device(&internal->dev_addr,
+ &ifcvf_ops);
+ if (internal->did < 0) {
+ DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
+ goto error;
+ }
+
+ pthread_mutex_lock(&internal_list_lock);
+ TAILQ_INSERT_TAIL(&internal_list, list, next);
+ pthread_mutex_unlock(&internal_list_lock);
+
+ rte_atomic32_set(&internal->started, 1);
+ update_datapath(internal);
+
+ rte_kvargs_free(kvlist);
+ return 0;
+
+error:
+ rte_kvargs_free(kvlist);
+ rte_free(list);
+ rte_free(internal);
+ return -1;
+}
+
+static int
+ifcvf_pci_remove(struct rte_pci_device *pci_dev)
+{
+ struct ifcvf_internal *internal;
+ struct internal_list *list;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return 0;
+
+ list = find_internal_resource_by_dev(pci_dev);
+ if (list == NULL) {
+ DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
+ return -1;
+ }
+
+ internal = list->internal;
+ rte_atomic32_set(&internal->started, 0);
+ update_datapath(internal);
+
+ rte_pci_unmap_device(internal->pdev);
+ rte_vfio_container_destroy(internal->vfio_container_fd);
+ rte_vdpa_unregister_device(internal->did);
+
+ pthread_mutex_lock(&internal_list_lock);
+ TAILQ_REMOVE(&internal_list, list, next);
+ pthread_mutex_unlock(&internal_list_lock);
+
+ rte_free(list);
+ rte_free(internal);
+
+ return 0;
+}
+
+/*
+ * IFCVF has the same vendor ID and device ID as the virtio-net PCI
+ * device, but uses its own subsystem vendor ID and device ID.
+ */
+static const struct rte_pci_id pci_id_ifcvf_map[] = {
+ { .class_id = RTE_CLASS_ANY_ID,
+ .vendor_id = IFCVF_VENDOR_ID,
+ .device_id = IFCVF_DEVICE_ID,
+ .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
+ .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
+ },
+
+ { .vendor_id = 0, /* sentinel */
+ },
+};
+
+static struct rte_pci_driver rte_ifcvf_vdpa = {
+ .id_table = pci_id_ifcvf_map,
+ .drv_flags = 0,
+ .probe = ifcvf_pci_probe,
+ .remove = ifcvf_pci_remove,
+};
+
+RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
+RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
+
+RTE_INIT(ifcvf_vdpa_init_log)
+{
+ ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
+ if (ifcvf_vdpa_logtype >= 0)
+ rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/meson.build b/src/spdk/dpdk/drivers/vdpa/ifc/meson.build
new file mode 100644
index 000000000..b179987f9
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+build = dpdk_conf.has('RTE_LIBRTE_VHOST')
+reason = 'missing dependency, DPDK vhost library'
+sources = files('ifcvf_vdpa.c', 'base/ifcvf.c')
+includes += include_directories('base')
+deps += 'vhost'
diff --git a/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map b/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map
new file mode 100644
index 000000000..f9f17e4f6
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/ifc/rte_pmd_ifc_version.map
@@ -0,0 +1,3 @@
+DPDK_20.0 {
+ local: *;
+};
diff --git a/src/spdk/dpdk/drivers/vdpa/meson.build b/src/spdk/dpdk/drivers/vdpa/meson.build
new file mode 100644
index 000000000..e3ed54a25
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/meson.build
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2019 Mellanox Technologies, Ltd
+
+drivers = ['ifc',
+ 'mlx5',]
+std_deps = ['bus_pci', 'kvargs']
+std_deps += ['vhost']
+config_flag_fmt = 'RTE_LIBRTE_@0@_PMD'
+driver_name_fmt = 'rte_pmd_@0@'
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile b/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile
new file mode 100644
index 000000000..ef34c0b88
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/Makefile
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2019 Mellanox Technologies, Ltd
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# Library name.
+LIB = librte_pmd_mlx5_vdpa.a
+
+# Sources.
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_virtq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_steer.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_lm.c
+
+
+# Basic CFLAGS.
+CFLAGS += -O3
+CFLAGS += -std=c11 -Wall -Wextra
+CFLAGS += -g
+CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5
+CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5_vdpa
+CFLAGS += -I$(RTE_SDK)/lib/librte_sched
+CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5
+CFLAGS += -D_BSD_SOURCE
+CFLAGS += -D_DEFAULT_SOURCE
+CFLAGS += -D_XOPEN_SOURCE=600
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -Wno-strict-prototypes
+LDLIBS += -lrte_common_mlx5
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched
+
+# A few warnings cannot be avoided in external headers.
+CFLAGS += -Wno-error=cast-qual
+
+EXPORT_MAP := rte_pmd_mlx5_vdpa_version.map
+
+# DEBUG, which is usually provided on the command line, may enable
+# CONFIG_RTE_LIBRTE_MLX5_DEBUG.
+ifeq ($(DEBUG),1)
+CONFIG_RTE_LIBRTE_MLX5_DEBUG := y
+endif
+
+# User-defined CFLAGS.
+ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DEBUG),y)
+CFLAGS += -pedantic
+ifneq ($(CONFIG_RTE_TOOLCHAIN_ICC),y)
+CFLAGS += -DPEDANTIC
+endif
+AUTO_CONFIG_CFLAGS += -Wno-pedantic
+else
+CFLAGS += -UPEDANTIC
+endif
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build b/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build
new file mode 100644
index 000000000..2963aad71
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/meson.build
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2019 Mellanox Technologies, Ltd
+
+if not is_linux
+ build = false
+ reason = 'only supported on Linux'
+ subdir_done()
+endif
+
+fmt_name = 'mlx5_vdpa'
+deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched']
+sources = files(
+ 'mlx5_vdpa.c',
+ 'mlx5_vdpa_mem.c',
+ 'mlx5_vdpa_event.c',
+ 'mlx5_vdpa_virtq.c',
+ 'mlx5_vdpa_steer.c',
+ 'mlx5_vdpa_lm.c',
+)
+cflags_options = [
+ '-std=c11',
+ '-Wno-strict-prototypes',
+ '-D_BSD_SOURCE',
+ '-D_DEFAULT_SOURCE',
+ '-D_XOPEN_SOURCE=600'
+]
+foreach option:cflags_options
+ if cc.has_argument(option)
+ cflags += option
+ endif
+endforeach
+
+if get_option('buildtype').contains('debug')
+ cflags += [ '-pedantic', '-DPEDANTIC' ]
+else
+ cflags += [ '-UPEDANTIC' ]
+endif
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c
new file mode 100644
index 000000000..1113d6cef
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -0,0 +1,626 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_bus_pci.h>
+#include <rte_pci.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_common.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+#include <mlx5_nl.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+
+#define MLX5_VDPA_DEFAULT_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_F_ANY_LAYOUT) | \
+ (1ULL << VIRTIO_NET_F_MQ) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
+ (1ULL << VHOST_F_LOG_ALL))
+
+#define MLX5_VDPA_PROTOCOL_FEATURES \
+ ((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_MQ))
+
+TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
+ TAILQ_HEAD_INITIALIZER(priv_list);
+static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER;
+int mlx5_vdpa_logtype;
+
+static struct mlx5_vdpa_priv *
+mlx5_vdpa_find_priv_resource_by_did(int did)
+{
+ struct mlx5_vdpa_priv *priv;
+ int found = 0;
+
+ pthread_mutex_lock(&priv_list_lock);
+ TAILQ_FOREACH(priv, &priv_list, next) {
+ if (did == priv->id) {
+ found = 1;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&priv_list_lock);
+ if (!found) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ return priv;
+}
+
+static int
+mlx5_vdpa_get_queue_num(int did, uint32_t *queue_num)
+{
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -1;
+ }
+ *queue_num = priv->caps.max_num_virtio_queues;
+ return 0;
+}
+
+static int
+mlx5_vdpa_get_vdpa_features(int did, uint64_t *features)
+{
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -1;
+ }
+ *features = MLX5_VDPA_DEFAULT_FEATURES;
+ if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED))
+ *features |= (1ULL << VIRTIO_F_RING_PACKED);
+ if (priv->caps.tso_ipv4)
+ *features |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
+ if (priv->caps.tso_ipv6)
+ *features |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
+ if (priv->caps.tx_csum)
+ *features |= (1ULL << VIRTIO_NET_F_CSUM);
+ if (priv->caps.rx_csum)
+ *features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
+ if (priv->caps.virtio_version_1_0)
+ *features |= (1ULL << VIRTIO_F_VERSION_1);
+ return 0;
+}
+
+static int
+mlx5_vdpa_get_protocol_features(int did, uint64_t *features)
+{
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -1;
+ }
+ *features = MLX5_VDPA_PROTOCOL_FEATURES;
+ return 0;
+}
+
+static int
+mlx5_vdpa_set_vring_state(int vid, int vring, int state)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -EINVAL;
+ }
+ if (vring >= (int)priv->caps.max_num_virtio_queues * 2) {
+ DRV_LOG(ERR, "Too big vring id: %d.", vring);
+ return -E2BIG;
+ }
+ return mlx5_vdpa_virtq_enable(priv, vring, state);
+}
+
+static int
+mlx5_vdpa_direct_db_prepare(struct mlx5_vdpa_priv *priv)
+{
+ int ret;
+
+ if (priv->direct_notifier) {
+ ret = rte_vhost_host_notifier_ctrl(priv->vid, false);
+ if (ret != 0) {
+ DRV_LOG(INFO, "Direct HW notifier FD cannot be "
+ "destroyed for device %d: %d.", priv->vid, ret);
+ return -1;
+ }
+ priv->direct_notifier = 0;
+ }
+ ret = rte_vhost_host_notifier_ctrl(priv->vid, true);
+ if (ret != 0)
+ DRV_LOG(INFO, "Direct HW notifier FD cannot be configured for"
+ " device %d: %d.", priv->vid, ret);
+ else
+ priv->direct_notifier = 1;
+ return 0;
+}
+
+static int
+mlx5_vdpa_features_set(int vid)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+ uint64_t log_base, log_size;
+ uint64_t features;
+ int ret;
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -EINVAL;
+ }
+ ret = rte_vhost_get_negotiated_features(vid, &features);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to get negotiated features.");
+ return ret;
+ }
+ if (RTE_VHOST_NEED_LOG(features)) {
+ ret = rte_vhost_get_log_base(vid, &log_base, &log_size);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to get log base.");
+ return ret;
+ }
+ ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to set dirty bitmap.");
+ return ret;
+ }
+ DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging...");
+ ret = mlx5_vdpa_logging_enable(priv, 1);
+ if (ret) {
+			DRV_LOG(ERR, "Failed to enable dirty logging.");
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int
+mlx5_vdpa_dev_close(int vid)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+ int ret = 0;
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -1;
+ }
+ if (priv->configured)
+ ret |= mlx5_vdpa_lm_log(priv);
+ mlx5_vdpa_cqe_event_unset(priv);
+ mlx5_vdpa_steer_unset(priv);
+ mlx5_vdpa_virtqs_release(priv);
+ mlx5_vdpa_event_qp_global_release(priv);
+ mlx5_vdpa_mem_dereg(priv);
+ priv->configured = 0;
+ priv->vid = 0;
+ DRV_LOG(INFO, "vDPA device %d was closed.", vid);
+ return ret;
+}
+
+static int
+mlx5_vdpa_dev_config(int vid)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -EINVAL;
+ }
+ if (priv->configured && mlx5_vdpa_dev_close(vid)) {
+ DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid);
+ return -1;
+ }
+ priv->vid = vid;
+ if (mlx5_vdpa_mem_register(priv) || mlx5_vdpa_direct_db_prepare(priv) ||
+ mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
+ mlx5_vdpa_cqe_event_setup(priv)) {
+ mlx5_vdpa_dev_close(vid);
+ return -1;
+ }
+ priv->configured = 1;
+ DRV_LOG(INFO, "vDPA device %d was configured.", vid);
+ return 0;
+}
+
+static int
+mlx5_vdpa_get_device_fd(int vid)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -EINVAL;
+ }
+ return priv->ctx->cmd_fd;
+}
+
+static int
+mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
+{
+ int did = rte_vhost_get_vdpa_device_id(vid);
+ struct mlx5_vdpa_priv *priv = mlx5_vdpa_find_priv_resource_by_did(did);
+
+ RTE_SET_USED(qid);
+ if (priv == NULL) {
+ DRV_LOG(ERR, "Invalid device id: %d.", did);
+ return -EINVAL;
+ }
+ if (!priv->var) {
+		DRV_LOG(ERR, "VAR was not created for device %d, is the device"
+			" configured?", did);
+ return -EINVAL;
+ }
+ *offset = priv->var->mmap_off;
+ *size = priv->var->length;
+ return 0;
+}
+
+static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
+ .get_queue_num = mlx5_vdpa_get_queue_num,
+ .get_features = mlx5_vdpa_get_vdpa_features,
+ .get_protocol_features = mlx5_vdpa_get_protocol_features,
+ .dev_conf = mlx5_vdpa_dev_config,
+ .dev_close = mlx5_vdpa_dev_close,
+ .set_vring_state = mlx5_vdpa_set_vring_state,
+ .set_features = mlx5_vdpa_features_set,
+ .migration_done = NULL,
+ .get_vfio_group_fd = NULL,
+ .get_vfio_device_fd = mlx5_vdpa_get_device_fd,
+ .get_notify_area = mlx5_vdpa_get_notify_area,
+};
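+/*
+ * Note: these callbacks are invoked by the vhost library once the device is
+ * registered through rte_vdpa_register_device() in the probe routine below.
+ */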
+
+static struct ibv_device *
+mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
+{
+ int n;
+ struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
+ struct ibv_device *ibv_match = NULL;
+
+ if (!ibv_list) {
+ rte_errno = ENOSYS;
+ return NULL;
+ }
+ while (n-- > 0) {
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
+ if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
+ continue;
+ if (rte_pci_addr_cmp(addr, &pci_addr))
+ continue;
+ ibv_match = ibv_list[n];
+ break;
+ }
+ if (!ibv_match)
+ rte_errno = ENOENT;
+ mlx5_glue->free_device_list(ibv_list);
+ return ibv_match;
+}
+
+/* Try to disable ROCE by Netlink/Devlink. */
+static int
+mlx5_vdpa_nl_roce_disable(const char *addr)
+{
+ int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
+ int devlink_id;
+ int enable;
+ int ret;
+
+ if (nlsk_fd < 0)
+ return nlsk_fd;
+ devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
+ if (devlink_id < 0) {
+ ret = devlink_id;
+ DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
+ " Netlink.");
+ goto close;
+ }
+ ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
+ if (ret) {
+ DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
+ ret);
+ goto close;
+ } else if (!enable) {
+		DRV_LOG(INFO, "ROCE is already disabled (Netlink).");
+ goto close;
+ }
+ ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
+ if (ret)
+ DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
+ else
+ DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
+close:
+ close(nlsk_fd);
+ return ret;
+}
+
+/* Try to disable ROCE by sysfs. */
+static int
+mlx5_vdpa_sys_roce_disable(const char *addr)
+{
+ FILE *file_o;
+ int enable;
+ int ret;
+
+ MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
+ file_o = fopen(file_p, "rb");
+ if (!file_o) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ ret = fscanf(file_o, "%d", &enable);
+ if (ret != 1) {
+ rte_errno = EINVAL;
+ ret = EINVAL;
+ goto close;
+ } else if (!enable) {
+ ret = 0;
+		DRV_LOG(INFO, "ROCE is already disabled (sysfs).");
+ goto close;
+ }
+ fclose(file_o);
+ file_o = fopen(file_p, "wb");
+ if (!file_o) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ fprintf(file_o, "0\n");
+ ret = 0;
+close:
+ if (ret)
+ DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
+ else
+ DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
+ fclose(file_o);
+ return ret;
+}
+
+#define MLX5_VDPA_MAX_RETRIES 20
+#define MLX5_VDPA_USEC 1000
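+/*
+ * Illustrative note: with the values above, the loop below retries up to 20
+ * times with a 1000 usec sleep between attempts, i.e. it waits roughly 20 ms
+ * in total for the IB device to reappear after the ROCE reload.
+ */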
+static int
+mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
+{
+ char addr_name[64] = {0};
+
+ rte_pci_device_name(addr, addr_name, sizeof(addr_name));
+	/* First try to disable ROCE by Netlink and fall back to sysfs. */
+ if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
+ mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
+		/*
+		 * ROCE was disabled successfully, wait for the IB device to
+		 * appear again after reload.
+		 */
+ int r;
+ struct ibv_device *ibv_new;
+
+ for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
+ ibv_new = mlx5_vdpa_get_ib_device_match(addr);
+ if (ibv_new) {
+ *ibv = ibv_new;
+ return 0;
+ }
+ usleep(MLX5_VDPA_USEC);
+ }
+		DRV_LOG(ERR, "Cannot match device %s after ROCE disable, "
+			"retries exceeded %d.", addr_name, MLX5_VDPA_MAX_RETRIES);
+ rte_errno = EAGAIN;
+ }
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function spawns vdpa device out of a given PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_vdpa_driver).
+ * @param[in] pci_dev
+ * PCI device information.
+ *
+ * @return
+ * 0 on success, 1 to skip this driver, a negative errno value otherwise
+ * and rte_errno is set.
+ */
+static int
+mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev __rte_unused)
+{
+ struct ibv_device *ibv;
+ struct mlx5_vdpa_priv *priv = NULL;
+ struct ibv_context *ctx = NULL;
+ struct mlx5_hca_attr attr;
+ int ret;
+
+ if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_VDPA) {
+ DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
+ " driver.");
+ return 1;
+ }
+ ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
+ if (!ibv) {
+ DRV_LOG(ERR, "No matching IB device for PCI slot "
+ PCI_PRI_FMT ".", pci_dev->addr.domain,
+ pci_dev->addr.bus, pci_dev->addr.devid,
+ pci_dev->addr.function);
+ return -rte_errno;
+ } else {
+ DRV_LOG(INFO, "PCI information matches for device \"%s\".",
+ ibv->name);
+ }
+ if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
+ DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
+ ibv->name);
+ return -rte_errno;
+ }
+ ctx = mlx5_glue->dv_open_device(ibv);
+ if (!ctx) {
+ DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
+ rte_errno = ENODEV;
+ return -rte_errno;
+ }
+ ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr);
+ if (ret) {
+ DRV_LOG(ERR, "Unable to read HCA capabilities.");
+ rte_errno = ENOTSUP;
+ goto error;
+ } else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) {
+ DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe "
+ "old FW/OFED version?");
+ rte_errno = ENOTSUP;
+ goto error;
+ }
+ priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) +
+ sizeof(struct mlx5_vdpa_virtq) *
+ attr.vdpa.max_num_virtio_queues * 2,
+ RTE_CACHE_LINE_SIZE);
+ if (!priv) {
+ DRV_LOG(ERR, "Failed to allocate private memory.");
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ priv->caps = attr.vdpa;
+ priv->log_max_rqt_size = attr.log_max_rqt_size;
+ priv->ctx = ctx;
+ priv->dev_addr.pci_addr = pci_dev->addr;
+ priv->dev_addr.type = VDPA_ADDR_PCI;
+ priv->var = mlx5_glue->dv_alloc_var(ctx, 0);
+ if (!priv->var) {
+		DRV_LOG(ERR, "Failed to allocate VAR, errno %d.", errno);
+ goto error;
+ }
+ priv->id = rte_vdpa_register_device(&priv->dev_addr, &mlx5_vdpa_ops);
+ if (priv->id < 0) {
+ DRV_LOG(ERR, "Failed to register vDPA device.");
+ rte_errno = rte_errno ? rte_errno : EINVAL;
+ goto error;
+ }
+ SLIST_INIT(&priv->mr_list);
+ pthread_mutex_lock(&priv_list_lock);
+ TAILQ_INSERT_TAIL(&priv_list, priv, next);
+ pthread_mutex_unlock(&priv_list_lock);
+ return 0;
+
+error:
+ if (priv) {
+ if (priv->var)
+ mlx5_glue->dv_free_var(priv->var);
+ rte_free(priv);
+ }
+ if (ctx)
+ mlx5_glue->close_device(ctx);
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback to remove a PCI device.
+ *
+ * This function removes all vDPA devices belonging to a given PCI device.
+ *
+ * @param[in] pci_dev
+ * Pointer to the PCI device.
+ *
+ * @return
+ * 0 on success, the function cannot fail.
+ */
+static int
+mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev)
+{
+ struct mlx5_vdpa_priv *priv = NULL;
+ int found = 0;
+
+ pthread_mutex_lock(&priv_list_lock);
+ TAILQ_FOREACH(priv, &priv_list, next) {
+ if (memcmp(&priv->dev_addr.pci_addr, &pci_dev->addr,
+ sizeof(pci_dev->addr)) == 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ TAILQ_REMOVE(&priv_list, priv, next);
+ pthread_mutex_unlock(&priv_list_lock);
+ if (found) {
+ if (priv->configured)
+ mlx5_vdpa_dev_close(priv->vid);
+ if (priv->var) {
+ mlx5_glue->dv_free_var(priv->var);
+ priv->var = NULL;
+ }
+ mlx5_glue->close_device(priv->ctx);
+ rte_free(priv);
+ }
+ return 0;
+}
+
+static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = {
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
+ },
+ {
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
+ },
+ {
+ .vendor_id = 0
+ }
+};
+
+static struct rte_pci_driver mlx5_vdpa_driver = {
+ .driver = {
+ .name = "mlx5_vdpa",
+ },
+ .id_table = mlx5_vdpa_pci_id_map,
+ .probe = mlx5_vdpa_pci_probe,
+ .remove = mlx5_vdpa_pci_remove,
+ .drv_flags = 0,
+};
+
+/**
+ * Driver initialization routine.
+ */
+RTE_INIT(rte_mlx5_vdpa_init)
+{
+ /* Initialize common log type. */
+ mlx5_vdpa_logtype = rte_log_register("pmd.vdpa.mlx5");
+ if (mlx5_vdpa_logtype >= 0)
+ rte_log_set_level(mlx5_vdpa_logtype, RTE_LOG_NOTICE);
+ if (mlx5_glue)
+ rte_pci_register(&mlx5_vdpa_driver);
+}
+
+RTE_PMD_EXPORT_NAME(net_mlx5_vdpa, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mlx5_vdpa, mlx5_vdpa_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mlx5_vdpa, "* ib_uverbs & mlx5_core & mlx5_ib");
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h
new file mode 100644
index 000000000..fcc216ac7
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VDPA_H_
+#define RTE_PMD_MLX5_VDPA_H_
+
+#include <linux/virtio_net.h>
+#include <sys/queue.h>
+
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <rte_vdpa.h>
+#include <rte_vhost.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+#include <rte_spinlock.h>
+#include <rte_interrupts.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+
+
+#define MLX5_VDPA_INTR_RETRIES 256
+#define MLX5_VDPA_INTR_RETRIES_USEC 1000
+
+#ifndef VIRTIO_F_ORDER_PLATFORM
+#define VIRTIO_F_ORDER_PLATFORM 36
+#endif
+
+#ifndef VIRTIO_F_RING_PACKED
+#define VIRTIO_F_RING_PACKED 34
+#endif
+
+struct mlx5_vdpa_cq {
+ uint16_t log_desc_n;
+ uint32_t cq_ci:24;
+ uint32_t arm_sn:2;
+ int callfd;
+ rte_spinlock_t sl;
+ struct mlx5_devx_obj *cq;
+ struct mlx5dv_devx_umem *umem_obj;
+ union {
+ volatile void *umem_buf;
+ volatile struct mlx5_cqe *cqes;
+ };
+ volatile uint32_t *db_rec;
+ uint64_t errors;
+};
+
+struct mlx5_vdpa_event_qp {
+ struct mlx5_vdpa_cq cq;
+ struct mlx5_devx_obj *fw_qp;
+ struct mlx5_devx_obj *sw_qp;
+ struct mlx5dv_devx_umem *umem_obj;
+ void *umem_buf;
+ volatile uint32_t *db_rec;
+};
+
+struct mlx5_vdpa_query_mr {
+ SLIST_ENTRY(mlx5_vdpa_query_mr) next;
+ void *addr;
+ uint64_t length;
+ struct mlx5dv_devx_umem *umem;
+ struct mlx5_devx_obj *mkey;
+ int is_indirect;
+};
+
+struct mlx5_vdpa_virtq {
+ SLIST_ENTRY(mlx5_vdpa_virtq) next;
+ uint8_t enable;
+ uint16_t index;
+ uint16_t vq_size;
+ struct mlx5_vdpa_priv *priv;
+ struct mlx5_devx_obj *virtq;
+ struct mlx5_vdpa_event_qp eqp;
+ struct {
+ struct mlx5dv_devx_umem *obj;
+ void *buf;
+ uint32_t size;
+ } umems[3];
+ struct rte_intr_handle intr_handle;
+};
+
+struct mlx5_vdpa_steer {
+ struct mlx5_devx_obj *rqt;
+ void *domain;
+ void *tbl;
+ struct {
+ struct mlx5dv_flow_matcher *matcher;
+ struct mlx5_devx_obj *tir;
+ void *tir_action;
+ void *flow;
+ } rss[7];
+};
+
+struct mlx5_vdpa_priv {
+ TAILQ_ENTRY(mlx5_vdpa_priv) next;
+ uint8_t configured;
+ uint8_t direct_notifier; /* Whether direct notifier is on or off. */
+ int id; /* vDPA device id. */
+ int vid; /* vhost device id. */
+ struct ibv_context *ctx; /* Device context. */
+ struct rte_vdpa_dev_addr dev_addr;
+ struct mlx5_hca_vdpa_attr caps;
+ uint32_t pdn; /* Protection Domain number. */
+ struct ibv_pd *pd;
+ uint32_t gpa_mkey_index;
+ struct ibv_mr *null_mr;
+ struct rte_vhost_memory *vmem;
+ uint32_t eqn;
+ struct mlx5dv_devx_event_channel *eventc;
+ struct mlx5dv_devx_uar *uar;
+ struct rte_intr_handle intr_handle;
+ struct mlx5_devx_obj *td;
+ struct mlx5_devx_obj *tis;
+ uint16_t nr_virtqs;
+ uint64_t features; /* Negotiated features. */
+ uint16_t log_max_rqt_size;
+ struct mlx5_vdpa_steer steer;
+ struct mlx5dv_var *var;
+ void *virtq_db_addr;
+ SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;
+ struct mlx5_vdpa_virtq virtqs[];
+};
+
+/*
+ * Check whether virtq is for traffic receive.
+ * According to the VIRTIO_NET spec, the virtqueue index identifies its type:
+ * 0 receiveq1
+ * 1 transmitq1
+ * ...
+ * 2(N-1) receiveqN
+ * 2(N-1)+1 transmitqN
+ * 2N controlq
+ */
+static inline uint8_t
+is_virtq_recvq(int virtq_index, int nr_vring)
+{
+ if (virtq_index % 2 == 0 && virtq_index != nr_vring - 1)
+ return 1;
+ return 0;
+}
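+/*
+ * Example (illustrative only): for a device with 2 queue pairs and a control
+ * queue, nr_vring is 5; indices 0 and 2 are receive queues, indices 1 and 3
+ * are transmit queues, and index 4 (nr_vring - 1) is the control queue, for
+ * which is_virtq_recvq() returns 0.
+ */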
+
+/**
+ * Release all the prepared memory regions and all their related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Register all the memory regions of the virtio device to the HW and allocate
+ * all their related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
+
+
+/**
+ * Create an event QP and all its related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ * @param[in] desc_n
+ * Number of descriptors.
+ * @param[in] callfd
+ * The guest notification file descriptor.
+ * @param[in/out] eqp
+ * Pointer to the event QP structure.
+ *
+ * @return
+ * 0 on success, -1 otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
+ int callfd, struct mlx5_vdpa_event_qp *eqp);
+
+/**
+ * Destroy an event QP and all its related resources.
+ *
+ * @param[in/out] eqp
+ * Pointer to the event QP structure.
+ */
+void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp);
+
+/**
+ * Release all the event global resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Setup CQE event.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset CQE event.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Release a virtq and all its related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Create all the HW virtqs resources and all their related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_virtqs_prepare(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Enable/disable a virtq.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ * @param[in] index
+ * The virtq index.
+ * @param[in] enable
+ * Set to enable, otherwise disable.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_enable(struct mlx5_vdpa_priv *priv, int index, int enable);
+
+/**
+ * Unset steering and release all its related resources - stop traffic.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_steer_unset(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Update steering according to the received queues status.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_steer_update(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Setup steering and all its related resources to enable RSS traffic from the
+ * device to all the Rx host queues.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_steer_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Enable/disable live migration logging.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ * @param[in] enable
+ * Set for enable, unset for disable.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_logging_enable(struct mlx5_vdpa_priv *priv, int enable);
+
+/**
+ * Set dirty bitmap logging to allow live migration.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ * @param[in] log_base
+ * Vhost log base.
+ * @param[in] log_size
+ * Vhost log size.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_dirty_bitmap_set(struct mlx5_vdpa_priv *priv, uint64_t log_base,
+ uint64_t log_size);
+
+/**
+ * Log the information of all virtqs for live migration.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_lm_log(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Modify virtq state to be ready or suspend.
+ *
+ * @param[in] virtq
+ * The vdpa driver private virtq structure.
+ * @param[in] state
+ * Set for ready, otherwise suspend.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state);
+
+/**
+ * Stop virtq before destroying it.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ * @param[in] index
+ * The virtq index.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index);
+
+#endif /* RTE_PMD_MLX5_VDPA_H_ */
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c
new file mode 100644
index 000000000..dd60150fe
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -0,0 +1,401 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <unistd.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <sys/eventfd.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_common.h>
+#include <rte_io.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+
+void
+mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
+{
+ if (priv->uar) {
+ mlx5_glue->devx_free_uar(priv->uar);
+ priv->uar = NULL;
+ }
+ if (priv->eventc) {
+ mlx5_glue->devx_destroy_event_channel(priv->eventc);
+ priv->eventc = NULL;
+ }
+ priv->eqn = 0;
+}
+
+/* Prepare all the global resources for all the event objects. */
+static int
+mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv *priv)
+{
+ uint32_t lcore;
+
+ if (priv->eventc)
+ return 0;
+ lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
+ if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) {
+ rte_errno = errno;
+ DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
+ return -1;
+ }
+ priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx,
+ MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
+ if (!priv->eventc) {
+ rte_errno = errno;
+ DRV_LOG(ERR, "Failed to create event channel %d.",
+ rte_errno);
+ goto error;
+ }
+ priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0);
+ if (!priv->uar) {
+ rte_errno = errno;
+ DRV_LOG(ERR, "Failed to allocate UAR.");
+ goto error;
+ }
+ return 0;
+error:
+ mlx5_vdpa_event_qp_global_release(priv);
+ return -1;
+}
+
+static void
+mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq)
+{
+ if (cq->cq)
+ claim_zero(mlx5_devx_cmd_destroy(cq->cq));
+ if (cq->umem_obj)
+ claim_zero(mlx5_glue->devx_umem_dereg(cq->umem_obj));
+ if (cq->umem_buf)
+ rte_free((void *)(uintptr_t)cq->umem_buf);
+ memset(cq, 0, sizeof(*cq));
+}
+
+static inline void
+mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq)
+{
+ uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
+ uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK;
+ uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
+ uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id;
+ uint64_t db_be = rte_cpu_to_be_64(doorbell);
+ uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr, MLX5_CQ_DOORBELL);
+
+ rte_io_wmb();
+ cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
+ rte_wmb();
+#ifdef RTE_ARCH_64
+ *(uint64_t *)addr = db_be;
+#else
+ *(uint32_t *)addr = db_be;
+ rte_io_wmb();
+ *((uint32_t *)addr + 1) = db_be >> 32;
+#endif
+ cq->arm_sn++;
+}
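+/*
+ * Note (derived from the arming sequence above): the 64-bit doorbell value
+ * carries the arm sequence number, the ALL command and the CQ consumer index
+ * in its upper 32 bits and the CQ number in its lower 32 bits; the doorbell
+ * record is updated before the UAR write so HW sees a consistent index.
+ */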
+
+static int
+mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
+ int callfd, struct mlx5_vdpa_cq *cq)
+{
+ struct mlx5_devx_cq_attr attr;
+ size_t pgsize = sysconf(_SC_PAGESIZE);
+ uint32_t umem_size;
+ int ret;
+ uint16_t event_nums[1] = {0};
+
+ cq->log_desc_n = log_desc_n;
+ umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) +
+ sizeof(*cq->db_rec) * 2;
+ cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
+ if (!cq->umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for CQ.");
+ rte_errno = ENOMEM;
+ return -ENOMEM;
+ }
+ cq->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
+ (void *)(uintptr_t)cq->umem_buf,
+ umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!cq->umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for CQ.");
+ goto error;
+ }
+ attr.q_umem_valid = 1;
+ attr.db_umem_valid = 1;
+ attr.use_first_only = 0;
+ attr.overrun_ignore = 0;
+ attr.uar_page_id = priv->uar->page_id;
+ attr.q_umem_id = cq->umem_obj->umem_id;
+ attr.q_umem_offset = 0;
+ attr.db_umem_id = cq->umem_obj->umem_id;
+ attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n);
+ attr.eqn = priv->eqn;
+ attr.log_cq_size = log_desc_n;
+ attr.log_page_size = rte_log2_u32(pgsize);
+ cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr);
+ if (!cq->cq)
+ goto error;
+ cq->db_rec = RTE_PTR_ADD(cq->umem_buf, (uintptr_t)attr.db_umem_offset);
+ cq->cq_ci = 0;
+ rte_spinlock_init(&cq->sl);
+ /* Subscribe CQ event to the event channel controlled by the driver. */
+ ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq->obj,
+ sizeof(event_nums),
+ event_nums,
+ (uint64_t)(uintptr_t)cq);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to subscribe CQE event.");
+ rte_errno = errno;
+ goto error;
+ }
+ cq->callfd = callfd;
+	/* Init CQ to all ones so it starts under HW ownership. */
+ memset((void *)(uintptr_t)cq->umem_buf, 0xFF, attr.db_umem_offset);
+ /* First arming. */
+ mlx5_vdpa_cq_arm(priv, cq);
+ return 0;
+error:
+ mlx5_vdpa_cq_destroy(cq);
+ return -1;
+}
+
+static inline void __rte_unused
+mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
+ struct mlx5_vdpa_cq *cq)
+{
+ struct mlx5_vdpa_event_qp *eqp =
+ container_of(cq, struct mlx5_vdpa_event_qp, cq);
+ const unsigned int cq_size = 1 << cq->log_desc_n;
+ const unsigned int cq_mask = cq_size - 1;
+ int ret;
+
+ do {
+ volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
+ cq_mask);
+
+ ret = check_cqe(cqe, cq_size, cq->cq_ci);
+ switch (ret) {
+ case MLX5_CQE_STATUS_ERR:
+ cq->errors++;
+ /*fall-through*/
+ case MLX5_CQE_STATUS_SW_OWN:
+ cq->cq_ci++;
+ break;
+ case MLX5_CQE_STATUS_HW_OWN:
+ default:
+ break;
+ }
+ } while (ret != MLX5_CQE_STATUS_HW_OWN);
+ rte_io_wmb();
+ /* Ring CQ doorbell record. */
+ cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
+ rte_io_wmb();
+ /* Ring SW QP doorbell record. */
+ eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cq_size);
+}
+
+static void
+mlx5_vdpa_interrupt_handler(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_EVENT
+ (void)cb_arg;
+ return;
+#else
+ struct mlx5_vdpa_priv *priv = cb_arg;
+ union {
+ struct mlx5dv_devx_async_event_hdr event_resp;
+ uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+ } out;
+
+ while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp,
+ sizeof(out.buf)) >=
+ (ssize_t)sizeof(out.event_resp.cookie)) {
+ struct mlx5_vdpa_cq *cq = (struct mlx5_vdpa_cq *)
+ (uintptr_t)out.event_resp.cookie;
+ rte_spinlock_lock(&cq->sl);
+ mlx5_vdpa_cq_poll(priv, cq);
+ mlx5_vdpa_cq_arm(priv, cq);
+ if (cq->callfd != -1)
+ /* Notify guest for descriptors consuming. */
+ eventfd_write(cq->callfd, (eventfd_t)1);
+ rte_spinlock_unlock(&cq->sl);
+ DRV_LOG(DEBUG, "CQ %d event: new cq_ci = %u.", cq->cq->id,
+ cq->cq_ci);
+ }
+#endif /* HAVE_IBV_DEVX_EVENT */
+}
+
+int
+mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
+{
+ int flags;
+ int ret;
+
+ if (!priv->eventc)
+ /* All virtqs are in poll mode. */
+ return 0;
+ flags = fcntl(priv->eventc->fd, F_GETFL);
+ ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to change event channel FD.");
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ priv->intr_handle.fd = priv->eventc->fd;
+ priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
+ if (rte_intr_callback_register(&priv->intr_handle,
+ mlx5_vdpa_interrupt_handler, priv)) {
+ priv->intr_handle.fd = 0;
+ DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
+ return -rte_errno;
+ }
+ return 0;
+}
+
+void
+mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv)
+{
+ int retries = MLX5_VDPA_INTR_RETRIES;
+ int ret = -EAGAIN;
+
+ if (priv->intr_handle.fd) {
+ while (retries-- && ret == -EAGAIN) {
+ ret = rte_intr_callback_unregister(&priv->intr_handle,
+ mlx5_vdpa_interrupt_handler,
+ priv);
+ if (ret == -EAGAIN) {
+ DRV_LOG(DEBUG, "Try again to unregister fd %d "
+ "of CQ interrupt, retries = %d.",
+ priv->intr_handle.fd, retries);
+ usleep(MLX5_VDPA_INTR_RETRIES_USEC);
+ }
+ }
+ memset(&priv->intr_handle, 0, sizeof(priv->intr_handle));
+ }
+}
+
+void
+mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp)
+{
+ if (eqp->sw_qp)
+ claim_zero(mlx5_devx_cmd_destroy(eqp->sw_qp));
+ if (eqp->umem_obj)
+ claim_zero(mlx5_glue->devx_umem_dereg(eqp->umem_obj));
+ if (eqp->umem_buf)
+ rte_free(eqp->umem_buf);
+ if (eqp->fw_qp)
+ claim_zero(mlx5_devx_cmd_destroy(eqp->fw_qp));
+ mlx5_vdpa_cq_destroy(&eqp->cq);
+ memset(eqp, 0, sizeof(*eqp));
+}
+
+static int
+mlx5_vdpa_qps2rts(struct mlx5_vdpa_event_qp *eqp)
+{
+ if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RST2INIT_QP,
+ eqp->sw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify FW QP to INIT state(%u).",
+ rte_errno);
+ return -1;
+ }
+ if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RST2INIT_QP,
+ eqp->fw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify SW QP to INIT state(%u).",
+ rte_errno);
+ return -1;
+ }
+ if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_INIT2RTR_QP,
+ eqp->sw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify FW QP to RTR state(%u).",
+ rte_errno);
+ return -1;
+ }
+ if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_INIT2RTR_QP,
+ eqp->fw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify SW QP to RTR state(%u).",
+ rte_errno);
+ return -1;
+ }
+ if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RTR2RTS_QP,
+ eqp->sw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify FW QP to RTS state(%u).",
+ rte_errno);
+ return -1;
+ }
+ if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RTR2RTS_QP,
+ eqp->fw_qp->id)) {
+ DRV_LOG(ERR, "Failed to modify SW QP to RTS state(%u).",
+ rte_errno);
+ return -1;
+ }
+ return 0;
+}
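+/*
+ * Note: the helper above walks both the FW QP and the SW QP through the
+ * RST -> INIT -> RTR -> RTS sequence, each side using the peer QP number,
+ * so both ends of the loopback connection reach RTS together.
+ */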
+
+int
+mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
+ int callfd, struct mlx5_vdpa_event_qp *eqp)
+{
+ struct mlx5_devx_qp_attr attr = {0};
+ uint16_t log_desc_n = rte_log2_u32(desc_n);
+ uint32_t umem_size = (1 << log_desc_n) * MLX5_WSEG_SIZE +
+ sizeof(*eqp->db_rec) * 2;
+
+ if (mlx5_vdpa_event_qp_global_prepare(priv))
+ return -1;
+ if (mlx5_vdpa_cq_create(priv, log_desc_n, callfd, &eqp->cq))
+ return -1;
+ attr.pd = priv->pdn;
+ eqp->fw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
+ if (!eqp->fw_qp) {
+ DRV_LOG(ERR, "Failed to create FW QP(%u).", rte_errno);
+ goto error;
+ }
+ eqp->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
+ if (!eqp->umem_buf) {
+ DRV_LOG(ERR, "Failed to allocate memory for SW QP.");
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ eqp->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
+ (void *)(uintptr_t)eqp->umem_buf,
+ umem_size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!eqp->umem_obj) {
+ DRV_LOG(ERR, "Failed to register umem for SW QP.");
+ goto error;
+ }
+ attr.uar_index = priv->uar->page_id;
+ attr.cqn = eqp->cq.cq->id;
+ attr.log_page_size = rte_log2_u32(sysconf(_SC_PAGESIZE));
+ attr.rq_size = 1 << log_desc_n;
+ attr.log_rq_stride = rte_log2_u32(MLX5_WSEG_SIZE);
+	attr.sq_size = 0; /* No SQ is needed. */
+ attr.dbr_umem_valid = 1;
+ attr.wq_umem_id = eqp->umem_obj->umem_id;
+ attr.wq_umem_offset = 0;
+ attr.dbr_umem_id = eqp->umem_obj->umem_id;
+ attr.dbr_address = (1 << log_desc_n) * MLX5_WSEG_SIZE;
+ eqp->sw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
+ if (!eqp->sw_qp) {
+ DRV_LOG(ERR, "Failed to create SW QP(%u).", rte_errno);
+ goto error;
+ }
+ eqp->db_rec = RTE_PTR_ADD(eqp->umem_buf, (uintptr_t)attr.dbr_address);
+ if (mlx5_vdpa_qps2rts(eqp))
+ goto error;
+ /* First ringing. */
+ rte_write32(rte_cpu_to_be_32(1 << log_desc_n), &eqp->db_rec[0]);
+ return 0;
+error:
+ mlx5_vdpa_event_qp_destroy(eqp);
+ return -1;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c
new file mode 100644
index 000000000..460e01d80
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_lm.c
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <rte_malloc.h>
+#include <rte_errno.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+
+int
+mlx5_vdpa_logging_enable(struct mlx5_vdpa_priv *priv, int enable)
+{
+ struct mlx5_devx_virtq_attr attr = {
+ .type = MLX5_VIRTQ_MODIFY_TYPE_DIRTY_BITMAP_DUMP_ENABLE,
+ .dirty_bitmap_dump_enable = enable,
+ };
+ int i;
+
+ for (i = 0; i < priv->nr_virtqs; ++i) {
+ attr.queue_index = i;
+ if (!priv->virtqs[i].virtq ||
+ mlx5_devx_cmd_modify_virtq(priv->virtqs[i].virtq, &attr)) {
+ DRV_LOG(ERR, "Failed to modify virtq %d logging.", i);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int
+mlx5_vdpa_dirty_bitmap_set(struct mlx5_vdpa_priv *priv, uint64_t log_base,
+ uint64_t log_size)
+{
+ struct mlx5_devx_mkey_attr mkey_attr = {
+ .addr = (uintptr_t)log_base,
+ .size = log_size,
+ .pd = priv->pdn,
+ .pg_access = 1,
+ .klm_array = NULL,
+ .klm_num = 0,
+ .relaxed_ordering = 0,
+ };
+ struct mlx5_devx_virtq_attr attr = {
+ .type = MLX5_VIRTQ_MODIFY_TYPE_DIRTY_BITMAP_PARAMS,
+ .dirty_bitmap_addr = log_base,
+ .dirty_bitmap_size = log_size,
+ };
+ struct mlx5_vdpa_query_mr *mr = rte_malloc(__func__, sizeof(*mr), 0);
+ int i;
+
+ if (!mr) {
+ DRV_LOG(ERR, "Failed to allocate mem for lm mr.");
+ return -1;
+ }
+ mr->umem = mlx5_glue->devx_umem_reg(priv->ctx,
+ (void *)(uintptr_t)log_base,
+ log_size, IBV_ACCESS_LOCAL_WRITE);
+ if (!mr->umem) {
+ DRV_LOG(ERR, "Failed to register umem for lm mr.");
+ goto err;
+ }
+ mkey_attr.umem_id = mr->umem->umem_id;
+ mr->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+ if (!mr->mkey) {
+ DRV_LOG(ERR, "Failed to create Mkey for lm.");
+ goto err;
+ }
+ attr.dirty_bitmap_mkey = mr->mkey->id;
+ for (i = 0; i < priv->nr_virtqs; ++i) {
+ attr.queue_index = i;
+ if (!priv->virtqs[i].virtq ||
+ mlx5_devx_cmd_modify_virtq(priv->virtqs[i].virtq, &attr)) {
+ DRV_LOG(ERR, "Failed to modify virtq %d for lm.", i);
+ goto err;
+ }
+ }
+ mr->is_indirect = 0;
+ SLIST_INSERT_HEAD(&priv->mr_list, mr, next);
+ return 0;
+err:
+ if (mr->mkey)
+ mlx5_devx_cmd_destroy(mr->mkey);
+ if (mr->umem)
+ mlx5_glue->devx_umem_dereg(mr->umem);
+ rte_free(mr);
+ return -1;
+}
+
+#define MLX5_VDPA_USED_RING_LEN(size) \
+ ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
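+/*
+ * Note: the length above follows the virtio split used ring layout, i.e. one
+ * vring_used_elem per descriptor plus three uint16_t fields (flags, idx and
+ * the trailing event index field).
+ */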
+
+int
+mlx5_vdpa_lm_log(struct mlx5_vdpa_priv *priv)
+{
+ uint64_t features;
+ int ret = rte_vhost_get_negotiated_features(priv->vid, &features);
+ int i;
+
+ if (ret) {
+ DRV_LOG(ERR, "Failed to get negotiated features.");
+ return -1;
+ }
+ if (!RTE_VHOST_NEED_LOG(features))
+ return 0;
+ for (i = 0; i < priv->nr_virtqs; ++i) {
+ if (priv->virtqs[i].virtq) {
+ ret = mlx5_vdpa_virtq_stop(priv, i);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to stop virtq %d.", i);
+ return -1;
+ }
+ } else {
+ DRV_LOG(ERR, "virtq %d is not created.", i);
+ return -1;
+ }
+ rte_vhost_log_used_vring(priv->vid, i, 0,
+ MLX5_VDPA_USED_RING_LEN(priv->virtqs[i].vq_size));
+ }
+ return 0;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
new file mode 100644
index 000000000..da31b47ec
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <stdlib.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_common.h>
+#include <rte_sched_common.h>
+
+#include <mlx5_prm.h>
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+static int
+mlx5_vdpa_pd_prepare(struct mlx5_vdpa_priv *priv)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (priv->pd)
+ return 0;
+ priv->pd = mlx5_glue->alloc_pd(priv->ctx);
+ if (priv->pd == NULL) {
+ DRV_LOG(ERR, "Failed to allocate PD.");
+ return errno ? -errno : -ENOMEM;
+ }
+ struct mlx5dv_obj obj;
+ struct mlx5dv_pd pd_info;
+ int ret = 0;
+
+ obj.pd.in = priv->pd;
+ obj.pd.out = &pd_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+ if (ret) {
+ DRV_LOG(ERR, "Fail to get PD object info.");
+ mlx5_glue->dealloc_pd(priv->pd);
+ priv->pd = NULL;
+ return -errno;
+ }
+ priv->pdn = pd_info.pdn;
+ return 0;
+#else
+ (void)priv;
+ DRV_LOG(ERR, "Cannot get pdn - no DV support.");
+ return -ENOTSUP;
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+}
+
+void
+mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
+{
+ struct mlx5_vdpa_query_mr *entry;
+ struct mlx5_vdpa_query_mr *next;
+
+ entry = SLIST_FIRST(&priv->mr_list);
+ while (entry) {
+ next = SLIST_NEXT(entry, next);
+ claim_zero(mlx5_devx_cmd_destroy(entry->mkey));
+ if (!entry->is_indirect)
+ claim_zero(mlx5_glue->devx_umem_dereg(entry->umem));
+ SLIST_REMOVE(&priv->mr_list, entry, mlx5_vdpa_query_mr, next);
+ rte_free(entry);
+ entry = next;
+ }
+ SLIST_INIT(&priv->mr_list);
+ if (priv->null_mr) {
+ claim_zero(mlx5_glue->dereg_mr(priv->null_mr));
+ priv->null_mr = NULL;
+ }
+ if (priv->pd) {
+ claim_zero(mlx5_glue->dealloc_pd(priv->pd));
+ priv->pd = NULL;
+ }
+ if (priv->vmem) {
+ free(priv->vmem);
+ priv->vmem = NULL;
+ }
+}
+
+static int
+mlx5_vdpa_regions_addr_cmp(const void *a, const void *b)
+{
+ const struct rte_vhost_mem_region *region_a = a;
+ const struct rte_vhost_mem_region *region_b = b;
+
+ if (region_a->guest_phys_addr < region_b->guest_phys_addr)
+ return -1;
+ if (region_a->guest_phys_addr > region_b->guest_phys_addr)
+ return 1;
+ return 0;
+}
+
+#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \
+ MLX5_MAX_KLM_BYTE_COUNT)
+
+/*
+ * Allocate and sort the region list and choose indirect mkey mode:
+ * 1. Calculate GCD, guest memory size and indirect mkey entries num per mode.
+ * 2. Align GCD to the maximum allowed size (2G) and to be a power of 2.
+ * 3. Decide the indirect mkey mode according to the following rules:
+ *    a. If both the KLM_FBS entries number and the KLM entries number are
+ *       bigger than the maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES) - error.
+ *    b. KLM mode if the KLM_FBS entries number is bigger than the maximum
+ *       allowed (MLX5_DEVX_MAX_KLM_ENTRIES).
+ *    c. KLM mode if the GCD is smaller than the minimum allowed (4K).
+ * d. KLM mode if the total size of KLM entries is in one cache line
+ * and the total size of KLM_FBS entries is not in one cache line.
+ * e. Otherwise, KLM_FBS mode.
+ */
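+/*
+ * Illustrative example (hypothetical numbers): for a 1G region followed by a
+ * 2G hole and a 4K region, the GCD of all the sizes is 4K, so a KLM_FBS table
+ * would need mem_size / 4K entries while plain KLM needs only one entry per
+ * region and per hole (split at 2G); small unaligned regions therefore tend
+ * to push the selection towards KLM mode.
+ */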
+static struct rte_vhost_memory *
+mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size,
+ uint64_t *gcd, uint32_t *entries_num)
+{
+ struct rte_vhost_memory *mem;
+ uint64_t size;
+ uint64_t klm_entries_num = 0;
+ uint64_t klm_fbs_entries_num;
+ uint32_t i;
+ int ret = rte_vhost_get_mem_table(vid, &mem);
+
+ if (ret < 0) {
+		DRV_LOG(ERR, "Failed to get VM memory layout for vid %d.", vid);
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]),
+ mlx5_vdpa_regions_addr_cmp);
+ *mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) +
+ (mem->regions[(mem->nregions - 1)].size) -
+ (mem->regions[0].guest_phys_addr);
+ *gcd = 0;
+ for (i = 0; i < mem->nregions; ++i) {
+ DRV_LOG(INFO, "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64
+ ", size 0x%" PRIx64 ".", i,
+ mem->regions[i].host_user_addr,
+ mem->regions[i].guest_phys_addr, mem->regions[i].size);
+ if (i > 0) {
+			/* Handle the hole between adjacent regions. */
+ size = mem->regions[i].guest_phys_addr -
+ (mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size);
+ *gcd = rte_get_gcd(*gcd, size);
+ klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+ }
+ size = mem->regions[i].size;
+ *gcd = rte_get_gcd(*gcd, size);
+ klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+ }
+ if (*gcd > MLX5_MAX_KLM_BYTE_COUNT)
+ *gcd = rte_get_gcd(*gcd, MLX5_MAX_KLM_BYTE_COUNT);
+ if (!RTE_IS_POWER_OF_2(*gcd)) {
+ uint64_t candidate_gcd = rte_align64prevpow2(*gcd);
+
+ while (candidate_gcd > 1 && (*gcd % candidate_gcd))
+ candidate_gcd /= 2;
+ DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted "
+ "GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd);
+ *gcd = candidate_gcd;
+ }
+ klm_fbs_entries_num = *mem_size / *gcd;
+ if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE || klm_fbs_entries_num >
+ MLX5_DEVX_MAX_KLM_ENTRIES ||
+ ((klm_entries_num * sizeof(struct mlx5_klm)) <=
+ RTE_CACHE_LINE_SIZE && (klm_fbs_entries_num *
+ sizeof(struct mlx5_klm)) >
+ RTE_CACHE_LINE_SIZE)) {
+ *mode = MLX5_MKC_ACCESS_MODE_KLM;
+ *entries_num = klm_entries_num;
+ DRV_LOG(INFO, "Indirect mkey mode is KLM.");
+ } else {
+ *mode = MLX5_MKC_ACCESS_MODE_KLM_FBS;
+ *entries_num = klm_fbs_entries_num;
+ DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size.");
+ }
+ DRV_LOG(DEBUG, "Memory registration information: nregions = %u, "
+ "mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64
+ ", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%"
+ PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num,
+ klm_entries_num);
+ if (*entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) {
+ DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is "
+ "too fragmented.", vid);
+ free(mem);
+ return NULL;
+ }
+ return mem;
+}
+
+#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
+ MLX5_MAX_KLM_BYTE_COUNT : (sz))
+
+/*
+ * The target here is to group all the physical memory regions of the
+ * virtio device in one indirect mkey.
+ * For KLM Fixed Buffer Size mode (HW finds the translation entry in one
+ *  read according to the guest physical address):
+ *  All the sub-direct mkeys must be the same size, hence each of them
+ *  should be the GCD size of all the virtio memory regions and the
+ *  holes between them.
+ * For KLM mode (each entry may be a different size so HW must iterate
+ *  the entries):
+ *  Each virtio memory region and each hole between them has one entry;
+ *  only the maximum allowed size (2G) needs to be covered, by splitting
+ *  entries whose associated memory regions are bigger than 2G.
+ * It means that each virtio memory region may be mapped to more than
+ * one direct mkey in the 2 modes.
+ * All the holes of invalid memory between the virtio memory regions
+ * will be mapped to the null memory region for security.
+ */
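+/*
+ * Sketch of the resulting klm_array (illustrative): the entries of region 0
+ * are followed by entries mapping the hole to the null MR lkey, then the
+ * entries of region 1, and so on; the final indirect mkey covers the whole
+ * range starting at regions[0].guest_phys_addr with size mem_size.
+ */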
+int
+mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
+{
+ struct mlx5_devx_mkey_attr mkey_attr;
+ struct mlx5_vdpa_query_mr *entry = NULL;
+ struct rte_vhost_mem_region *reg = NULL;
+ uint8_t mode;
+ uint32_t entries_num = 0;
+ uint32_t i;
+ uint64_t gcd;
+ uint64_t klm_size;
+ uint64_t mem_size;
+ uint64_t k;
+ int klm_index = 0;
+ int ret;
+ struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
+ (priv->vid, &mode, &mem_size, &gcd, &entries_num);
+ struct mlx5_klm klm_array[entries_num];
+
+ if (!mem)
+ return -rte_errno;
+ priv->vmem = mem;
+ ret = mlx5_vdpa_pd_prepare(priv);
+ if (ret)
+ goto error;
+ priv->null_mr = mlx5_glue->alloc_null_mr(priv->pd);
+ if (!priv->null_mr) {
+ DRV_LOG(ERR, "Failed to allocate null MR.");
+ ret = -errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "Dump fill Mkey = %u.", priv->null_mr->lkey);
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+ if (!entry) {
+ ret = -ENOMEM;
+ DRV_LOG(ERR, "Failed to allocate mem entry memory.");
+ goto error;
+ }
+ entry->umem = mlx5_glue->devx_umem_reg(priv->ctx,
+ (void *)(uintptr_t)reg->host_user_addr,
+ reg->size, IBV_ACCESS_LOCAL_WRITE);
+ if (!entry->umem) {
+ DRV_LOG(ERR, "Failed to register Umem by Devx.");
+ ret = -errno;
+ goto error;
+ }
+ mkey_attr.addr = (uintptr_t)(reg->guest_phys_addr);
+ mkey_attr.size = reg->size;
+ mkey_attr.umem_id = entry->umem->umem_id;
+ mkey_attr.pd = priv->pdn;
+ mkey_attr.pg_access = 1;
+ mkey_attr.klm_array = NULL;
+ mkey_attr.klm_num = 0;
+ mkey_attr.relaxed_ordering = 0;
+ entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+ if (!entry->mkey) {
+ DRV_LOG(ERR, "Failed to create direct Mkey.");
+ ret = -rte_errno;
+ goto error;
+ }
+ entry->addr = (void *)(uintptr_t)(reg->host_user_addr);
+ entry->length = reg->size;
+ entry->is_indirect = 0;
+ if (i > 0) {
+ uint64_t sadd;
+ uint64_t empty_region_sz = reg->guest_phys_addr -
+ (mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size);
+
+ if (empty_region_sz > 0) {
+ sadd = mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size;
+ klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+ KLM_SIZE_MAX_ALIGN(empty_region_sz) : gcd;
+ for (k = 0; k < empty_region_sz;
+ k += klm_size) {
+ klm_array[klm_index].byte_count =
+ k + klm_size > empty_region_sz ?
+ empty_region_sz - k : klm_size;
+ klm_array[klm_index].mkey =
+ priv->null_mr->lkey;
+ klm_array[klm_index].address = sadd + k;
+ klm_index++;
+ }
+ }
+ }
+ klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+ KLM_SIZE_MAX_ALIGN(reg->size) : gcd;
+ for (k = 0; k < reg->size; k += klm_size) {
+ klm_array[klm_index].byte_count = k + klm_size >
+ reg->size ? reg->size - k : klm_size;
+ klm_array[klm_index].mkey = entry->mkey->id;
+ klm_array[klm_index].address = reg->guest_phys_addr + k;
+ klm_index++;
+ }
+ SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+ }
+ mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
+ mkey_attr.size = mem_size;
+ mkey_attr.pd = priv->pdn;
+ mkey_attr.umem_id = 0;
+ /* Must be zero for KLM mode. */
+ mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ?
+ rte_log2_u64(gcd) : 0;
+ mkey_attr.pg_access = 0;
+ mkey_attr.klm_array = klm_array;
+ mkey_attr.klm_num = klm_index;
+ entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+ if (!entry) {
+ DRV_LOG(ERR, "Failed to allocate memory for indirect entry.");
+ ret = -ENOMEM;
+ goto error;
+ }
+ entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+ if (!entry->mkey) {
+ DRV_LOG(ERR, "Failed to create indirect Mkey.");
+ ret = -rte_errno;
+ goto error;
+ }
+ entry->is_indirect = 1;
+ SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+ priv->gpa_mkey_index = entry->mkey->id;
+ return 0;
+error:
+ if (entry) {
+ if (entry->mkey)
+ mlx5_devx_cmd_destroy(entry->mkey);
+ if (entry->umem)
+ mlx5_glue->devx_umem_dereg(entry->umem);
+ rte_free(entry);
+ }
+ mlx5_vdpa_mem_dereg(priv);
+ rte_errno = -ret;
+ return ret;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c
new file mode 100644
index 000000000..406c7be17
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_steer.c
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <netinet/in.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_common.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+static void
+mlx5_vdpa_rss_flows_destroy(struct mlx5_vdpa_priv *priv)
+{
+ unsigned i;
+
+ for (i = 0; i < RTE_DIM(priv->steer.rss); ++i) {
+ if (priv->steer.rss[i].flow) {
+ claim_zero(mlx5_glue->dv_destroy_flow
+ (priv->steer.rss[i].flow));
+ priv->steer.rss[i].flow = NULL;
+ }
+ if (priv->steer.rss[i].tir_action) {
+ claim_zero(mlx5_glue->destroy_flow_action
+ (priv->steer.rss[i].tir_action));
+ priv->steer.rss[i].tir_action = NULL;
+ }
+ if (priv->steer.rss[i].tir) {
+ claim_zero(mlx5_devx_cmd_destroy
+ (priv->steer.rss[i].tir));
+ priv->steer.rss[i].tir = NULL;
+ }
+ if (priv->steer.rss[i].matcher) {
+ claim_zero(mlx5_glue->dv_destroy_flow_matcher
+ (priv->steer.rss[i].matcher));
+ priv->steer.rss[i].matcher = NULL;
+ }
+ }
+}
+
+void
+mlx5_vdpa_steer_unset(struct mlx5_vdpa_priv *priv)
+{
+ mlx5_vdpa_rss_flows_destroy(priv);
+ if (priv->steer.tbl) {
+ claim_zero(mlx5_glue->dr_destroy_flow_tbl(priv->steer.tbl));
+ priv->steer.tbl = NULL;
+ }
+ if (priv->steer.domain) {
+ claim_zero(mlx5_glue->dr_destroy_domain(priv->steer.domain));
+ priv->steer.domain = NULL;
+ }
+ if (priv->steer.rqt) {
+ claim_zero(mlx5_devx_cmd_destroy(priv->steer.rqt));
+ priv->steer.rqt = NULL;
+ }
+}
+
+#define MLX5_VDPA_DEFAULT_RQT_SIZE 512
+/*
+ * Return the number of queues configured to the table on success, otherwise
+ * -1 on error.
+ */
+static int
+mlx5_vdpa_rqt_prepare(struct mlx5_vdpa_priv *priv)
+{
+ int i;
+ uint32_t rqt_n = RTE_MIN(MLX5_VDPA_DEFAULT_RQT_SIZE,
+ 1 << priv->log_max_rqt_size);
+ struct mlx5_devx_rqt_attr *attr = rte_zmalloc(__func__, sizeof(*attr)
+ + rqt_n *
+ sizeof(uint32_t), 0);
+ uint32_t k = 0, j;
+ int ret = 0, num;
+
+ if (!attr) {
+ DRV_LOG(ERR, "Failed to allocate RQT attributes memory.");
+ rte_errno = ENOMEM;
+ return -ENOMEM;
+ }
+ for (i = 0; i < priv->nr_virtqs; i++) {
+ if (is_virtq_recvq(i, priv->nr_virtqs) &&
+ priv->virtqs[i].enable && priv->virtqs[i].virtq) {
+ attr->rq_list[k] = priv->virtqs[i].virtq->id;
+ k++;
+ }
+ }
+	if (k == 0) {
+		/* No enabled RQ to configure for RSS. */
+		rte_free(attr);
+		return 0;
+	}
+ num = (int)k;
+ for (j = 0; k != rqt_n; ++k, ++j)
+ attr->rq_list[k] = attr->rq_list[j];
+ attr->rq_type = MLX5_INLINE_Q_TYPE_VIRTQ;
+ attr->rqt_max_size = rqt_n;
+ attr->rqt_actual_size = rqt_n;
+ if (!priv->steer.rqt) {
+ priv->steer.rqt = mlx5_devx_cmd_create_rqt(priv->ctx, attr);
+ if (!priv->steer.rqt) {
+ DRV_LOG(ERR, "Failed to create RQT.");
+ ret = -rte_errno;
+ }
+ } else {
+ ret = mlx5_devx_cmd_modify_rqt(priv->steer.rqt, attr);
+ if (ret)
+ DRV_LOG(ERR, "Failed to modify RQT.");
+ }
+ rte_free(attr);
+ return ret ? -1 : num;
+}
+
+static int __rte_unused
+mlx5_vdpa_rss_flows_create(struct mlx5_vdpa_priv *priv)
+{
+#ifdef HAVE_MLX5DV_DR
+ struct mlx5_devx_tir_attr tir_att = {
+ .disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT,
+ .rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ,
+ .transport_domain = priv->td->id,
+ .indirect_table = priv->steer.rqt->id,
+ .rx_hash_symmetric = 1,
+ .rx_hash_toeplitz_key = { 0x2c, 0xc6, 0x81, 0xd1,
+ 0x5b, 0xdb, 0xf4, 0xf7,
+ 0xfc, 0xa2, 0x83, 0x19,
+ 0xdb, 0x1a, 0x3e, 0x94,
+ 0x6b, 0x9e, 0x38, 0xd9,
+ 0x2c, 0x9c, 0x03, 0xd1,
+ 0xad, 0x99, 0x44, 0xa7,
+ 0xd9, 0x56, 0x3d, 0x59,
+ 0x06, 0x3c, 0x25, 0xf3,
+ 0xfc, 0x1f, 0xdc, 0x2a },
+ };
+ struct {
+ size_t size;
+ /**< Size of match value. Do NOT split size and key! */
+ uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)];
+ /**< Matcher value. This value is used as the mask or a key. */
+ } matcher_mask = {
+ .size = sizeof(matcher_mask.buf),
+ },
+ matcher_value = {
+ .size = sizeof(matcher_value.buf),
+ };
+ struct mlx5dv_flow_matcher_attr dv_attr = {
+ .type = IBV_FLOW_ATTR_NORMAL,
+ .match_mask = (void *)&matcher_mask,
+ };
+ void *match_m = matcher_mask.buf;
+ void *match_v = matcher_value.buf;
+ void *headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers);
+ void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers);
+ void *actions[1];
+ const uint8_t l3_hash =
+ (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) |
+ (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP);
+ const uint8_t l4_hash =
+ (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT) |
+ (1 << MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT);
+ enum { PRIO, CRITERIA, IP_VER_M, IP_VER_V, IP_PROT_M, IP_PROT_V, L3_BIT,
+ L4_BIT, HASH, END};
+ const uint8_t vars[RTE_DIM(priv->steer.rss)][END] = {
+ { 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 6, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0, 0,
+ MLX5_L3_PROT_TYPE_IPV4, 0, l3_hash },
+ { 6, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0, 0,
+ MLX5_L3_PROT_TYPE_IPV6, 0, l3_hash },
+ { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0xff,
+ IPPROTO_UDP, MLX5_L3_PROT_TYPE_IPV4, MLX5_L4_PROT_TYPE_UDP,
+ l3_hash | l4_hash },
+ { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 4, 0xff,
+ IPPROTO_TCP, MLX5_L3_PROT_TYPE_IPV4, MLX5_L4_PROT_TYPE_TCP,
+ l3_hash | l4_hash },
+ { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0xff,
+ IPPROTO_UDP, MLX5_L3_PROT_TYPE_IPV6, MLX5_L4_PROT_TYPE_UDP,
+ l3_hash | l4_hash },
+ { 5, 1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, 0xf, 6, 0xff,
+ IPPROTO_TCP, MLX5_L3_PROT_TYPE_IPV6, MLX5_L4_PROT_TYPE_TCP,
+ l3_hash | l4_hash },
+ };
+ unsigned i;
+
+ for (i = 0; i < RTE_DIM(priv->steer.rss); ++i) {
+ dv_attr.priority = vars[i][PRIO];
+ dv_attr.match_criteria_enable = vars[i][CRITERIA];
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version,
+ vars[i][IP_VER_M]);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version,
+ vars[i][IP_VER_V]);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol,
+ vars[i][IP_PROT_M]);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+ vars[i][IP_PROT_V]);
+ tir_att.rx_hash_field_selector_outer.l3_prot_type =
+ vars[i][L3_BIT];
+ tir_att.rx_hash_field_selector_outer.l4_prot_type =
+ vars[i][L4_BIT];
+ tir_att.rx_hash_field_selector_outer.selected_fields =
+ vars[i][HASH];
+ priv->steer.rss[i].matcher = mlx5_glue->dv_create_flow_matcher
+ (priv->ctx, &dv_attr, priv->steer.tbl);
+ if (!priv->steer.rss[i].matcher) {
+ DRV_LOG(ERR, "Failed to create matcher %d.", i);
+ goto error;
+ }
+ priv->steer.rss[i].tir = mlx5_devx_cmd_create_tir(priv->ctx,
+ &tir_att);
+ if (!priv->steer.rss[i].tir) {
+ DRV_LOG(ERR, "Failed to create TIR %d.", i);
+ goto error;
+ }
+ priv->steer.rss[i].tir_action =
+ mlx5_glue->dv_create_flow_action_dest_devx_tir
+ (priv->steer.rss[i].tir->obj);
+ if (!priv->steer.rss[i].tir_action) {
+ DRV_LOG(ERR, "Failed to create TIR action %d.", i);
+ goto error;
+ }
+ actions[0] = priv->steer.rss[i].tir_action;
+ priv->steer.rss[i].flow = mlx5_glue->dv_create_flow
+ (priv->steer.rss[i].matcher,
+ (void *)&matcher_value, 1, actions);
+ if (!priv->steer.rss[i].flow) {
+ DRV_LOG(ERR, "Failed to create flow %d.", i);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ /* Resources will be freed by the caller. */
+ return -1;
+#else
+ (void)priv;
+ return -ENOTSUP;
+#endif /* HAVE_MLX5DV_DR */
+}
+
+int
+mlx5_vdpa_steer_update(struct mlx5_vdpa_priv *priv)
+{
+ int ret = mlx5_vdpa_rqt_prepare(priv);
+
+ if (ret == 0) {
+ mlx5_vdpa_rss_flows_destroy(priv);
+ if (priv->steer.rqt) {
+ claim_zero(mlx5_devx_cmd_destroy(priv->steer.rqt));
+ priv->steer.rqt = NULL;
+ }
+ } else if (ret < 0) {
+ return ret;
+ } else if (!priv->steer.rss[0].flow) {
+ ret = mlx5_vdpa_rss_flows_create(priv);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot create RSS flows.");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int
+mlx5_vdpa_steer_setup(struct mlx5_vdpa_priv *priv)
+{
+#ifdef HAVE_MLX5DV_DR
+ priv->steer.domain = mlx5_glue->dr_create_domain(priv->ctx,
+ MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+ if (!priv->steer.domain) {
+ DRV_LOG(ERR, "Failed to create Rx domain.");
+ goto error;
+ }
+ priv->steer.tbl = mlx5_glue->dr_create_flow_tbl(priv->steer.domain, 0);
+ if (!priv->steer.tbl) {
+ DRV_LOG(ERR, "Failed to create table 0 with Rx domain.");
+ goto error;
+ }
+ if (mlx5_vdpa_steer_update(priv))
+ goto error;
+ return 0;
+error:
+ mlx5_vdpa_steer_unset(priv);
+ return -1;
+#else
+ (void)priv;
+ return -ENOTSUP;
+#endif /* HAVE_MLX5DV_DR */
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h
new file mode 100644
index 000000000..a239df9a5
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_utils.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VDPA_UTILS_H_
+#define RTE_PMD_MLX5_VDPA_UTILS_H_
+
+#include <mlx5_common.h>
+
+
+extern int mlx5_vdpa_logtype;
+
+#define MLX5_VDPA_LOG_PREFIX "mlx5_vdpa"
+/* Generic printf()-like logging macro with automatic line feed. */
+#define DRV_LOG(level, ...) \
+ PMD_DRV_LOG_(level, mlx5_vdpa_logtype, MLX5_VDPA_LOG_PREFIX, \
+ __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \
+ PMD_DRV_LOG_CPAREN)
+
+#endif /* RTE_PMD_MLX5_VDPA_UTILS_H_ */
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
new file mode 100644
index 000000000..bd48460b5
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_io.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+
+static void
+mlx5_vdpa_virtq_handler(void *cb_arg)
+{
+ struct mlx5_vdpa_virtq *virtq = cb_arg;
+ struct mlx5_vdpa_priv *priv = virtq->priv;
+ uint64_t buf;
+ int nbytes;
+
+ do {
+ nbytes = read(virtq->intr_handle.fd, &buf, 8);
+ if (nbytes < 0) {
+ if (errno == EINTR ||
+ errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ DRV_LOG(ERR, "Failed to read kickfd of virtq %d: %s",
+ virtq->index, strerror(errno));
+ }
+ break;
+ } while (1);
+ rte_write32(virtq->index, priv->virtq_db_addr);
+ DRV_LOG(DEBUG, "Ring virtq %u doorbell.", virtq->index);
+}
+
+static int
+mlx5_vdpa_virtq_unset(struct mlx5_vdpa_virtq *virtq)
+{
+ unsigned int i;
+ int retries = MLX5_VDPA_INTR_RETRIES;
+ int ret = -EAGAIN;
+
+ if (virtq->intr_handle.fd != -1) {
+ while (retries-- && ret == -EAGAIN) {
+ ret = rte_intr_callback_unregister(&virtq->intr_handle,
+ mlx5_vdpa_virtq_handler,
+ virtq);
+ if (ret == -EAGAIN) {
+ DRV_LOG(DEBUG, "Try again to unregister fd %d "
+ "of virtq %d interrupt, retries = %d.",
+ virtq->intr_handle.fd,
+ (int)virtq->index, retries);
+ usleep(MLX5_VDPA_INTR_RETRIES_USEC);
+ }
+ }
+ virtq->intr_handle.fd = -1;
+ }
+ if (virtq->virtq)
+ claim_zero(mlx5_devx_cmd_destroy(virtq->virtq));
+ virtq->virtq = NULL;
+ for (i = 0; i < RTE_DIM(virtq->umems); ++i) {
+ if (virtq->umems[i].obj)
+ claim_zero(mlx5_glue->devx_umem_dereg
+ (virtq->umems[i].obj));
+ if (virtq->umems[i].buf)
+ rte_free(virtq->umems[i].buf);
+ }
+ memset(&virtq->umems, 0, sizeof(virtq->umems));
+ if (virtq->eqp.fw_qp)
+ mlx5_vdpa_event_qp_destroy(&virtq->eqp);
+ return 0;
+}
+
+void
+mlx5_vdpa_virtqs_release(struct mlx5_vdpa_priv *priv)
+{
+ int i;
+
+ for (i = 0; i < priv->nr_virtqs; i++) {
+ mlx5_vdpa_virtq_unset(&priv->virtqs[i]);
+ priv->virtqs[i].enable = 0;
+ }
+ if (priv->tis) {
+ claim_zero(mlx5_devx_cmd_destroy(priv->tis));
+ priv->tis = NULL;
+ }
+ if (priv->td) {
+ claim_zero(mlx5_devx_cmd_destroy(priv->td));
+ priv->td = NULL;
+ }
+ if (priv->virtq_db_addr) {
+ claim_zero(munmap(priv->virtq_db_addr, priv->var->length));
+ priv->virtq_db_addr = NULL;
+ }
+ priv->features = 0;
+ priv->nr_virtqs = 0;
+}
+
+int
+mlx5_vdpa_virtq_modify(struct mlx5_vdpa_virtq *virtq, int state)
+{
+ struct mlx5_devx_virtq_attr attr = {
+ .type = MLX5_VIRTQ_MODIFY_TYPE_STATE,
+ .state = state ? MLX5_VIRTQ_STATE_RDY :
+ MLX5_VIRTQ_STATE_SUSPEND,
+ .queue_index = virtq->index,
+ };
+
+ return mlx5_devx_cmd_modify_virtq(virtq->virtq, &attr);
+}
+
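+/*
+ * Suspend the virtq in hardware, then read back the hardware available/used
+ * indexes and store them as the vring base in the vhost library.
+ */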
+int
+mlx5_vdpa_virtq_stop(struct mlx5_vdpa_priv *priv, int index)
+{
+ struct mlx5_devx_virtq_attr attr = {0};
+ struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+ int ret = mlx5_vdpa_virtq_modify(virtq, 0);
+
+ if (ret)
+ return -1;
+ if (mlx5_devx_cmd_query_virtq(virtq->virtq, &attr)) {
+ DRV_LOG(ERR, "Failed to query virtq %d.", index);
+ return -1;
+ }
+ DRV_LOG(INFO, "Query vid %d vring %d: hw_available_idx=%d, "
+ "hw_used_index=%d", priv->vid, index,
+ attr.hw_available_index, attr.hw_used_index);
+ ret = rte_vhost_set_vring_base(priv->vid, index,
+ attr.hw_available_index,
+ attr.hw_used_index);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to set virtq %d base.", index);
+ return -1;
+ }
+ return 0;
+}
+
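+/*
+ * Translate a host virtual address to a guest physical address by scanning
+ * the vhost memory regions; returns 0 when no region covers the address.
+ */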
+static uint64_t
+mlx5_vdpa_hva_to_gpa(struct rte_vhost_memory *mem, uint64_t hva)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+ uint64_t gpa = 0;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ if (hva >= reg->host_user_addr &&
+ hva < reg->host_user_addr + reg->size) {
+ gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
+ break;
+ }
+ }
+ return gpa;
+}
+
+static int
+mlx5_vdpa_virtq_setup(struct mlx5_vdpa_priv *priv, int index)
+{
+ struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+ struct rte_vhost_vring vq;
+ struct mlx5_devx_virtq_attr attr = {0};
+ uint64_t gpa;
+ int ret;
+ unsigned int i;
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+
+ ret = rte_vhost_get_vhost_vring(priv->vid, index, &vq);
+ if (ret)
+ return -1;
+ virtq->index = index;
+ virtq->vq_size = vq.size;
+ attr.tso_ipv4 = !!(priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO4));
+ attr.tso_ipv6 = !!(priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO6));
+ attr.tx_csum = !!(priv->features & (1ULL << VIRTIO_NET_F_CSUM));
+ attr.rx_csum = !!(priv->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM));
+ attr.virtio_version_1_0 = !!(priv->features & (1ULL <<
+ VIRTIO_F_VERSION_1));
+ attr.type = (priv->features & (1ULL << VIRTIO_F_RING_PACKED)) ?
+ MLX5_VIRTQ_TYPE_PACKED : MLX5_VIRTQ_TYPE_SPLIT;
+	/*
+	 * Event QPs are not needed when the guest works in poll mode and the
+	 * device supports the NO_MSIX event mode.
+	 */
+ attr.event_mode = vq.callfd != -1 || !(priv->caps.event_mode & (1 <<
+ MLX5_VIRTQ_EVENT_MODE_NO_MSIX)) ?
+ MLX5_VIRTQ_EVENT_MODE_QP :
+ MLX5_VIRTQ_EVENT_MODE_NO_MSIX;
+ if (attr.event_mode == MLX5_VIRTQ_EVENT_MODE_QP) {
+ ret = mlx5_vdpa_event_qp_create(priv, vq.size, vq.callfd,
+ &virtq->eqp);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to create event QPs for virtq %d.",
+ index);
+ return -1;
+ }
+ attr.qp_id = virtq->eqp.fw_qp->id;
+ } else {
+ DRV_LOG(INFO, "Virtq %d is, for sure, working by poll mode, no"
+ " need event QPs and event mechanism.", index);
+ }
+ /* Setup 3 UMEMs for each virtq. */
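+	/* The required size of each UMEM is a linear function of the queue size, taken from the queried device capabilities: a * vq.size + b. */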
+ for (i = 0; i < RTE_DIM(virtq->umems); ++i) {
+ virtq->umems[i].size = priv->caps.umems[i].a * vq.size +
+ priv->caps.umems[i].b;
+ virtq->umems[i].buf = rte_zmalloc(__func__,
+ virtq->umems[i].size, 4096);
+ if (!virtq->umems[i].buf) {
+ DRV_LOG(ERR, "Cannot allocate umem %d memory for virtq"
+ " %u.", i, index);
+ goto error;
+ }
+ virtq->umems[i].obj = mlx5_glue->devx_umem_reg(priv->ctx,
+ virtq->umems[i].buf,
+ virtq->umems[i].size,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (!virtq->umems[i].obj) {
+ DRV_LOG(ERR, "Failed to register umem %d for virtq %u.",
+ i, index);
+ goto error;
+ }
+ attr.umems[i].id = virtq->umems[i].obj->umem_id;
+ attr.umems[i].offset = 0;
+ attr.umems[i].size = virtq->umems[i].size;
+ }
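+	/* Split rings are addressed by guest physical addresses, so translate the HVAs of the descriptor, used and available rings. */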
+ if (attr.type == MLX5_VIRTQ_TYPE_SPLIT) {
+ gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+ (uint64_t)(uintptr_t)vq.desc);
+ if (!gpa) {
+ DRV_LOG(ERR, "Failed to get descriptor ring GPA.");
+ goto error;
+ }
+ attr.desc_addr = gpa;
+ gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+ (uint64_t)(uintptr_t)vq.used);
+ if (!gpa) {
+ DRV_LOG(ERR, "Failed to get GPA for used ring.");
+ goto error;
+ }
+ attr.used_addr = gpa;
+ gpa = mlx5_vdpa_hva_to_gpa(priv->vmem,
+ (uint64_t)(uintptr_t)vq.avail);
+ if (!gpa) {
+ DRV_LOG(ERR, "Failed to get GPA for available ring.");
+ goto error;
+ }
+ attr.available_addr = gpa;
+ }
+ ret = rte_vhost_get_vring_base(priv->vid, index, &last_avail_idx,
+ &last_used_idx);
+ if (ret) {
+ last_avail_idx = 0;
+ last_used_idx = 0;
+ DRV_LOG(WARNING, "Couldn't get vring base, idx are set to 0");
+ } else {
+ DRV_LOG(INFO, "vid %d: Init last_avail_idx=%d, last_used_idx=%d for "
+ "virtq %d.", priv->vid, last_avail_idx,
+ last_used_idx, index);
+ }
+ attr.hw_available_index = last_avail_idx;
+ attr.hw_used_index = last_used_idx;
+ attr.q_size = vq.size;
+ attr.mkey = priv->gpa_mkey_index;
+ attr.tis_id = priv->tis->id;
+ attr.queue_index = index;
+ virtq->virtq = mlx5_devx_cmd_create_virtq(priv->ctx, &attr);
+ virtq->priv = priv;
+ if (!virtq->virtq)
+ goto error;
+ if (mlx5_vdpa_virtq_modify(virtq, 1))
+ goto error;
+ rte_write32(virtq->index, priv->virtq_db_addr);
+ /* Setup doorbell mapping. */
+ virtq->intr_handle.fd = vq.kickfd;
+ if (virtq->intr_handle.fd == -1) {
+ DRV_LOG(WARNING, "Virtq %d kickfd is invalid.", index);
+ if (!priv->direct_notifier) {
+ DRV_LOG(ERR, "Virtq %d cannot be notified.", index);
+ goto error;
+ }
+ } else {
+ virtq->intr_handle.type = RTE_INTR_HANDLE_EXT;
+ if (rte_intr_callback_register(&virtq->intr_handle,
+ mlx5_vdpa_virtq_handler,
+ virtq)) {
+ virtq->intr_handle.fd = -1;
+ DRV_LOG(ERR, "Failed to register virtq %d interrupt.",
+ index);
+ goto error;
+ } else {
+ DRV_LOG(DEBUG, "Register fd %d interrupt for virtq %d.",
+ virtq->intr_handle.fd, index);
+ }
+ }
+ return 0;
+error:
+ mlx5_vdpa_virtq_unset(virtq);
+ return -1;
+}
+
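+/*
+ * Check that every negotiated virtio feature which needs device support is
+ * present in the queried device capabilities.
+ */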
+static int
+mlx5_vdpa_features_validate(struct mlx5_vdpa_priv *priv)
+{
+ if (priv->features & (1ULL << VIRTIO_F_RING_PACKED)) {
+ if (!(priv->caps.virtio_queue_type & (1 <<
+ MLX5_VIRTQ_TYPE_PACKED))) {
+ DRV_LOG(ERR, "Failed to configur PACKED mode for vdev "
+ "%d - it was not reported by HW/driver"
+ " capability.", priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ if (priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO4)) {
+ if (!priv->caps.tso_ipv4) {
+ DRV_LOG(ERR, "Failed to enable TSO4 for vdev %d - TSO4"
+ " was not reported by HW/driver capability.",
+ priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ if (priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO6)) {
+ if (!priv->caps.tso_ipv6) {
+ DRV_LOG(ERR, "Failed to enable TSO6 for vdev %d - TSO6"
+ " was not reported by HW/driver capability.",
+ priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ if (priv->features & (1ULL << VIRTIO_NET_F_CSUM)) {
+ if (!priv->caps.tx_csum) {
+ DRV_LOG(ERR, "Failed to enable CSUM for vdev %d - CSUM"
+ " was not reported by HW/driver capability.",
+ priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ if (priv->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
+ if (!priv->caps.rx_csum) {
+ DRV_LOG(ERR, "Failed to enable GUEST CSUM for vdev %d"
+ " GUEST CSUM was not reported by HW/driver "
+ "capability.", priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ if (priv->features & (1ULL << VIRTIO_F_VERSION_1)) {
+ if (!priv->caps.virtio_version_1_0) {
+ DRV_LOG(ERR, "Failed to enable version 1 for vdev %d "
+ "version 1 was not reported by HW/driver"
+ " capability.", priv->vid);
+ return -ENOTSUP;
+ }
+ }
+ return 0;
+}
+
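+/*
+ * Create all virtq resources: map the doorbell page, create the transport
+ * domain and TIS, and set up one virtq object per negotiated vring.
+ */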
+int
+mlx5_vdpa_virtqs_prepare(struct mlx5_vdpa_priv *priv)
+{
+ struct mlx5_devx_tis_attr tis_attr = {0};
+ uint32_t i;
+ uint16_t nr_vring = rte_vhost_get_vring_num(priv->vid);
+ int ret = rte_vhost_get_negotiated_features(priv->vid, &priv->features);
+
+ if (ret || mlx5_vdpa_features_validate(priv)) {
+ DRV_LOG(ERR, "Failed to configure negotiated features.");
+ return -1;
+ }
+ if (nr_vring > priv->caps.max_num_virtio_queues * 2) {
+ DRV_LOG(ERR, "Do not support more than %d virtqs(%d).",
+ (int)priv->caps.max_num_virtio_queues * 2,
+ (int)nr_vring);
+ return -1;
+ }
+ /* Always map the entire page. */
+ priv->virtq_db_addr = mmap(NULL, priv->var->length, PROT_READ |
+ PROT_WRITE, MAP_SHARED, priv->ctx->cmd_fd,
+ priv->var->mmap_off);
+ if (priv->virtq_db_addr == MAP_FAILED) {
+ DRV_LOG(ERR, "Failed to map doorbell page %u.", errno);
+ priv->virtq_db_addr = NULL;
+ goto error;
+ } else {
+ DRV_LOG(DEBUG, "VAR address of doorbell mapping is %p.",
+ priv->virtq_db_addr);
+ }
+ priv->td = mlx5_devx_cmd_create_td(priv->ctx);
+ if (!priv->td) {
+ DRV_LOG(ERR, "Failed to create transport domain.");
+ return -rte_errno;
+ }
+ tis_attr.transport_domain = priv->td->id;
+ priv->tis = mlx5_devx_cmd_create_tis(priv->ctx, &tis_attr);
+ if (!priv->tis) {
+ DRV_LOG(ERR, "Failed to create TIS.");
+ goto error;
+ }
+ priv->nr_virtqs = nr_vring;
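+	/* Enable guest notification and create a virtq object for each vring. */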
+ for (i = 0; i < nr_vring; i++) {
+ claim_zero(rte_vhost_enable_guest_notification(priv->vid, i,
+ 1));
+ if (mlx5_vdpa_virtq_setup(priv, i))
+ goto error;
+ }
+ return 0;
+error:
+ mlx5_vdpa_virtqs_release(priv);
+ return -1;
+}
+
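+/*
+ * Enable or disable a single virtq. When the device is already configured,
+ * enabling re-creates the virtq object, and for receive queues the steering
+ * RQT is updated.
+ */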
+int
+mlx5_vdpa_virtq_enable(struct mlx5_vdpa_priv *priv, int index, int enable)
+{
+ struct mlx5_vdpa_virtq *virtq = &priv->virtqs[index];
+ int ret;
+
+ DRV_LOG(INFO, "Update virtq %d status %sable -> %sable.", index,
+ virtq->enable ? "en" : "dis", enable ? "en" : "dis");
+ if (virtq->enable == !!enable)
+ return 0;
+ if (!priv->configured) {
+ virtq->enable = !!enable;
+ return 0;
+ }
+ if (enable) {
+ /* Configuration might have been updated - reconfigure virtq. */
+ if (virtq->virtq) {
+ ret = mlx5_vdpa_virtq_stop(priv, index);
+ if (ret)
+ DRV_LOG(WARNING, "Failed to stop virtq %d.",
+ index);
+ mlx5_vdpa_virtq_unset(virtq);
+ }
+ ret = mlx5_vdpa_virtq_setup(priv, index);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to setup virtq %d.", index);
+ return ret;
+ /* The only case virtq can stay invalid. */
+ }
+ }
+ virtq->enable = !!enable;
+ if (is_virtq_recvq(virtq->index, priv->nr_virtqs)) {
+ /* Need to add received virtq to the RQT table of the TIRs. */
+ ret = mlx5_vdpa_steer_update(priv);
+ if (ret) {
+ virtq->enable = !enable;
+ return ret;
+ }
+ }
+ return 0;
+}
diff --git a/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map b/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map
new file mode 100644
index 000000000..4a76d1d52
--- /dev/null
+++ b/src/spdk/dpdk/drivers/vdpa/mlx5/rte_pmd_mlx5_vdpa_version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+ local: *;
+};