Adding upstream version 5.10.209.upstream/5.10.209 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 10:05:51 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 10:05:51 +0000
commit: 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
tree: a94efe259b9009378be6d90eb30d2b019d95c194 /drivers/vdpa
parent: Initial commit. (diff)
download: linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.tar.xz
linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.zip
17 files changed, 5221 insertions, 0 deletions
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
new file mode 100644
index 000000000..6caf53909
--- /dev/null
+++ b/drivers/vdpa/Kconfig
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig VDPA
+	tristate "vDPA drivers"
+	help
+	  Enable this module to support vDPA device that uses a
+	  datapath which complies with virtio specifications with
+	  vendor specific control path.
+
+if VDPA
+
+config VDPA_SIM
+	tristate "vDPA device simulator"
+	depends on RUNTIME_TESTING_MENU && HAS_DMA
+	select DMA_OPS
+	select VHOST_RING
+	select GENERIC_NET_UTILS
+	default n
+	help
+	  vDPA networking device simulator which loop TX traffic back
+	  to RX. This device is used for testing, prototyping and
+	  development of vDPA.
+
+config IFCVF
+	tristate "Intel IFC VF vDPA driver"
+	depends on PCI_MSI
+	default n
+	help
+	  This kernel module can drive Intel IFC VF NIC to offload
+	  virtio dataplane traffic to hardware.
+	  To compile this driver as a module, choose M here: the module will
+	  be called ifcvf.
+
+config MLX5_VDPA
+	bool
+	select VHOST_IOTLB
+	help
+	  Support library for Mellanox VDPA drivers. Provides code that is
+	  common for all types of VDPA drivers. The following drivers are planned:
+	  net, block.
+
+config MLX5_VDPA_NET
+	tristate "vDPA driver for ConnectX devices"
+	select MLX5_VDPA
+	depends on MLX5_CORE
+	default n
+	help
+	  VDPA network driver for ConnectX6 and newer. Provides offloading
+	  of virtio net datapath such that descriptors put on the ring will
+	  be executed by the hardware. It also supports a variety of stateless
+	  offloads depending on the actual device used and firmware version.
+
+endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
new file mode 100644
index 000000000..d160e9b63
--- /dev/null
+++ b/drivers/vdpa/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA) += vdpa.o
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_IFCVF)    += ifcvf/
+obj-$(CONFIG_MLX5_VDPA) += mlx5/
diff --git a/drivers/vdpa/ifcvf/Makefile b/drivers/vdpa/ifcvf/Makefile
new file mode 100644
index 000000000..d70991599
--- /dev/null
+++ b/drivers/vdpa/ifcvf/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IFCVF) += ifcvf.o
+ifcvf-$(CONFIG_IFCVF) += ifcvf_main.o ifcvf_base.o
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
new file mode 100644
index 000000000..f2a128e56
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include "ifcvf_base.h"
+
+static inline u8 ifc_ioread8(u8 __iomem *addr)
+{
+	return ioread8(addr);
+}
+static inline u16 ifc_ioread16 (__le16 __iomem *addr)
+{
+	return ioread16(addr);
+}
+
+static inline u32 ifc_ioread32(__le32 __iomem *addr)
+{
+	return ioread32(addr);
+}
+
+static inline void ifc_iowrite8(u8 value, u8 __iomem *addr)
+{
+	iowrite8(value, addr);
+}
+
+static inline void ifc_iowrite16(u16 value, __le16 __iomem *addr)
+{
+	iowrite16(value, addr);
+}
+
+static inline void ifc_iowrite32(u32 value, __le32 __iomem *addr)
+{
+	iowrite32(value, addr);
+}
+
+static void ifc_iowrite64_twopart(u64 val,
+				  __le32 __iomem *lo, __le32 __iomem *hi)
+{
+	ifc_iowrite32((u32)val, lo);
+	ifc_iowrite32(val >> 32, hi);
+}
+
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw)
+{
+	return container_of(hw, struct ifcvf_adapter, vf);
+}
+
+static void __iomem *get_cap_addr(struct ifcvf_hw *hw,
+				  struct virtio_pci_cap *cap)
+{
+	struct ifcvf_adapter *ifcvf;
+	struct pci_dev *pdev;
+	u32 length, offset;
+	u8 bar;
+
+	length = le32_to_cpu(cap->length);
+	offset = le32_to_cpu(cap->offset);
+	bar = cap->bar;
+
+	ifcvf= vf_to_adapter(hw);
+	pdev = ifcvf->pdev;
+
+	if (bar >= IFCVF_PCI_MAX_RESOURCE) {
+		IFCVF_DBG(pdev,
+			  "Invalid bar number %u to get capabilities\n", bar);
+		return NULL;
+	}
+
+	if (offset + length > pci_resource_len(pdev, bar)) {
+		IFCVF_DBG(pdev,
+			  "offset(%u) + len(%u) overflows bar%u's capability\n",
+			  offset, length, bar);
+		return NULL;
+	}
+
+	return hw->base[bar] + offset;
+}
+
+static int ifcvf_read_config_range(struct pci_dev *dev,
+				   uint32_t *val, int size, int where)
+{
+	int ret, i;
+
+	for (i = 0; i < size; i += 4) {
+		ret = pci_read_config_dword(dev, where + i, val + i / 4);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
+{
+	struct virtio_pci_cap cap;
+	u16 notify_off;
+	int ret;
+	u8 pos;
+	u32 i;
+
+	ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
+	if (ret < 0) {
+		IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
+		return -EIO;
+	}
+
+	while (pos) {
+		ret = ifcvf_read_config_range(pdev, (u32 *)&cap,
+					      sizeof(cap), pos);
+		if (ret < 0) {
+			IFCVF_ERR(pdev,
+				  "Failed to get PCI capability at %x\n", pos);
+			break;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+			goto next;
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->common_cfg = %p\n",
+				  hw->common_cfg);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			pci_read_config_dword(pdev, pos + sizeof(cap),
+					      &hw->notify_off_multiplier);
+			hw->notify_bar = cap.bar;
+			hw->notify_base = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->notify_base = %p\n",
+				  hw->notify_base);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->isr = %p\n", hw->isr);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->net_cfg = get_cap_addr(hw, &cap);
+			IFCVF_DBG(pdev, "hw->net_cfg = %p\n", hw->net_cfg);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->isr == NULL || hw->net_cfg == NULL) {
+		IFCVF_ERR(pdev, "Incomplete PCI capabilities\n");
+		return -EIO;
+	}
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		ifc_iowrite16(i, &hw->common_cfg->queue_select);
+		notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off);
+		hw->vring[i].notify_addr = hw->notify_base +
+			notify_off * hw->notify_off_multiplier;
+	}
+
+	hw->lm_cfg = hw->base[IFCVF_LM_BAR];
+
+	IFCVF_DBG(pdev,
+		  "PCI capability mapping: common cfg: %p, notify base: %p\n, isr cfg: %p, device cfg: %p, multiplier: %u\n",
+		  hw->common_cfg, hw->notify_base, hw->isr,
+		  hw->net_cfg, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+u8 ifcvf_get_status(struct ifcvf_hw *hw)
+{
+	return ifc_ioread8(&hw->common_cfg->device_status);
+}
+
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+	ifc_iowrite8(status, &hw->common_cfg->device_status);
+}
+
+void ifcvf_reset(struct ifcvf_hw *hw)
+{
+	hw->config_cb.callback = NULL;
+	hw->config_cb.private = NULL;
+
+	ifcvf_set_status(hw, 0);
+	/* flush set_status, make sure VF is stopped, reset */
+	ifcvf_get_status(hw);
+}
+
+static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
+{
+	if (status != 0)
+		status |= ifcvf_get_status(hw);
+
+	ifcvf_set_status(hw, status);
+	ifcvf_get_status(hw);
+}
+
+u64 ifcvf_get_features(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+	u32 features_lo, features_hi;
+
+	ifc_iowrite32(0, &cfg->device_feature_select);
+	features_lo = ifc_ioread32(&cfg->device_feature);
+
+	ifc_iowrite32(1, &cfg->device_feature_select);
+	features_hi = ifc_ioread32(&cfg->device_feature);
+
+	return ((u64)features_hi << 32) | features_lo;
+}
+
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+			   void *dst, int length)
+{
+	u8 old_gen, new_gen, *p;
+	int i;
+
+	WARN_ON(offset + length > sizeof(struct virtio_net_config));
+	do {
+		old_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+		p = dst;
+		for (i = 0; i < length; i++)
+			*p++ = ifc_ioread8(hw->net_cfg + offset + i);
+
+		new_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+			    const void *src, int length)
+{
+	const u8 *p;
+	int i;
+
+	p = src;
+	WARN_ON(offset + length > sizeof(struct virtio_net_config));
+	for (i = 0; i < length; i++)
+		ifc_iowrite8(*p++, hw->net_cfg + offset + i);
+}
+
+static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+
+	ifc_iowrite32(0, &cfg->guest_feature_select);
+	ifc_iowrite32((u32)features, &cfg->guest_feature);
+
+	ifc_iowrite32(1, &cfg->guest_feature_select);
+	ifc_iowrite32(features >> 32, &cfg->guest_feature);
+}
+
+static int ifcvf_config_features(struct ifcvf_hw *hw)
+{
+	struct ifcvf_adapter *ifcvf;
+
+	ifcvf = vf_to_adapter(hw);
+	ifcvf_set_features(hw, hw->req_features);
+	ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK);
+
+	if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+		IFCVF_ERR(ifcvf->pdev, "Failed to set FEATURES_OK status\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
+{
+	struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+	void __iomem *avail_idx_addr;
+	u16 last_avail_idx;
+	u32 q_pair_id;
+
+	ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+	q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+	avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+	last_avail_idx = ifc_ioread16(avail_idx_addr);
+
+	return last_avail_idx;
+}
+
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
+{
+	struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+	void __iomem *avail_idx_addr;
+	u32 q_pair_id;
+
+	ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+	q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+	avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+	hw->vring[qid].last_avail_idx = num;
+	ifc_iowrite16(num, avail_idx_addr);
+
+	return 0;
+}
+
+static int ifcvf_hw_enable(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *cfg;
+	struct ifcvf_adapter *ifcvf;
+	u32 i;
+
+	ifcvf = vf_to_adapter(hw);
+	cfg = hw->common_cfg;
+	ifc_iowrite16(IFCVF_MSI_CONFIG_OFF, &cfg->msix_config);
+
+	if (ifc_ioread16(&cfg->msix_config) == VIRTIO_MSI_NO_VECTOR) {
+		IFCVF_ERR(ifcvf->pdev, "No msix vector for device config\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < hw->nr_vring; i++) {
+		if (!hw->vring[i].ready)
+			break;
+
+		ifc_iowrite16(i, &cfg->queue_select);
+		ifc_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
+				     &cfg->queue_desc_hi);
+		ifc_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
+				      &cfg->queue_avail_hi);
+		ifc_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
+				     &cfg->queue_used_hi);
+		ifc_iowrite16(hw->vring[i].size, &cfg->queue_size);
+		ifc_iowrite16(i + IFCVF_MSI_QUEUE_OFF, &cfg->queue_msix_vector);
+
+		if (ifc_ioread16(&cfg->queue_msix_vector) ==
+		    VIRTIO_MSI_NO_VECTOR) {
+			IFCVF_ERR(ifcvf->pdev,
+				  "No msix vector for queue %u\n", i);
+			return -EINVAL;
+		}
+
+		ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx);
+		ifc_iowrite16(1, &cfg->queue_enable);
+	}
+
+	return 0;
+}
+
+static void ifcvf_hw_disable(struct ifcvf_hw *hw)
+{
+	struct virtio_pci_common_cfg __iomem *cfg;
+	u32 i;
+
+	cfg = hw->common_cfg;
+	ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->msix_config);
+
+	for (i = 0; i < hw->nr_vring; i++) {
+		ifc_iowrite16(i, &cfg->queue_select);
+		ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->queue_msix_vector);
+	}
+
+	ifc_ioread16(&cfg->queue_msix_vector);
+}
+
+int ifcvf_start_hw(struct ifcvf_hw *hw)
+{
+	ifcvf_reset(hw);
+	ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
+
+	if (ifcvf_config_features(hw) < 0)
+		return -EINVAL;
+
+	if (ifcvf_hw_enable(hw) < 0)
+		return -EINVAL;
+
+	ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
+
+	return 0;
+}
+
+void ifcvf_stop_hw(struct ifcvf_hw *hw)
+{
+	ifcvf_hw_disable(hw);
+	ifcvf_reset(hw);
+}
+
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+	ifc_iowrite16(qid, hw->vring[qid].notify_addr);
+}
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
new file mode 100644
index 000000000..64696d63f
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+
+#define IFCVF_VENDOR_ID		0x1AF4
+#define IFCVF_DEVICE_ID		0x1041
+#define IFCVF_SUBSYS_VENDOR_ID	0x8086
+#define IFCVF_SUBSYS_DEVICE_ID	0x001A
+
+#define IFCVF_SUPPORTED_FEATURES \
+		((1ULL << VIRTIO_NET_F_MAC)			| \
+		 (1ULL << VIRTIO_F_ANY_LAYOUT)			| \
+		 (1ULL << VIRTIO_F_VERSION_1)			| \
+		 (1ULL << VIRTIO_NET_F_STATUS)			| \
+		 (1ULL << VIRTIO_F_ORDER_PLATFORM)		| \
+		 (1ULL << VIRTIO_F_ACCESS_PLATFORM)		| \
+		 (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+
+/* Only one queue pair for now. */
+#define IFCVF_MAX_QUEUE_PAIRS	1
+
+#define IFCVF_QUEUE_ALIGNMENT	PAGE_SIZE
+#define IFCVF_QUEUE_MAX		32768
+#define IFCVF_MSI_CONFIG_OFF	0
+#define IFCVF_MSI_QUEUE_OFF	1
+#define IFCVF_PCI_MAX_RESOURCE	6
+
+#define IFCVF_LM_CFG_SIZE		0x40
+#define IFCVF_LM_RING_STATE_OFFSET	0x20
+#define IFCVF_LM_BAR			4
+
+#define IFCVF_ERR(pdev, fmt, ...)	dev_err(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_DBG(pdev, fmt, ...)	dev_dbg(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_INFO(pdev, fmt, ...)	dev_info(&pdev->dev, fmt, ##__VA_ARGS__)
+
+#define ifcvf_private_to_vf(adapter) \
+	(&((struct ifcvf_adapter *)adapter)->vf)
+
+#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1)
+
+struct vring_info {
+	u64 desc;
+	u64 avail;
+	u64 used;
+	u16 size;
+	u16 last_avail_idx;
+	bool ready;
+	void __iomem *notify_addr;
+	u32 irq;
+	struct vdpa_callback cb;
+	char msix_name[256];
+};
+
+struct ifcvf_hw {
+	u8 __iomem *isr;
+	/* Live migration */
+	u8 __iomem *lm_cfg;
+	u16 nr_vring;
+	/* Notification bar number */
+	u8 notify_bar;
+	/* Notificaiton bar address */
+	void __iomem *notify_base;
+	u32 notify_off_multiplier;
+	u64 req_features;
+	struct virtio_pci_common_cfg __iomem *common_cfg;
+	void __iomem *net_cfg;
+	struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
+	void __iomem * const *base;
+	char config_msix_name[256];
+	struct vdpa_callback config_cb;
+	unsigned int config_irq;
+};
+
+struct ifcvf_adapter {
+	struct vdpa_device vdpa;
+	struct pci_dev *pdev;
+	struct ifcvf_hw vf;
+};
+
+struct ifcvf_vring_lm_cfg {
+	u32 idx_addr[2];
+	u8 reserved[IFCVF_LM_CFG_SIZE - 8];
+};
+
+struct ifcvf_lm_cfg {
+	u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
+	struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS];
+};
+
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
+int ifcvf_start_hw(struct ifcvf_hw *hw);
+void ifcvf_stop_hw(struct ifcvf_hw *hw);
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+			   void *dst, int length);
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+			    const void *src, int length);
+u8 ifcvf_get_status(struct ifcvf_hw *hw);
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
+void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
+void ifcvf_reset(struct ifcvf_hw *hw);
+u64 ifcvf_get_features(struct ifcvf_hw *hw);
+u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num);
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw);
+#endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
new file mode 100644
index 000000000..8b4028556
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -0,0 +1,505 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+#include "ifcvf_base.h"
+
+#define VERSION_STRING  "0.1"
+#define DRIVER_AUTHOR   "Intel Corporation"
+#define IFCVF_DRIVER_NAME       "ifcvf"
+
+static irqreturn_t ifcvf_config_changed(int irq, void *arg)
+{
+	struct ifcvf_hw *vf = arg;
+
+	if (vf->config_cb.callback)
+		return vf->config_cb.callback(vf->config_cb.private);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ifcvf_intr_handler(int irq, void *arg)
+{
+	struct vring_info *vring = arg;
+
+	if (vring->cb.callback)
+		return vring->cb.callback(vring->cb.private);
+
+	return IRQ_HANDLED;
+}
+
+static void ifcvf_free_irq_vectors(void *data)
+{
+	pci_free_irq_vectors(data);
+}
+
+static void ifcvf_free_irq(struct ifcvf_adapter *adapter, int queues)
+{
+	struct pci_dev *pdev = adapter->pdev;
+	struct ifcvf_hw *vf = &adapter->vf;
+	int i;
+
+
+	for (i = 0; i < queues; i++) {
+		devm_free_irq(&pdev->dev, vf->vring[i].irq, &vf->vring[i]);
+		vf->vring[i].irq = -EINVAL;
+	}
+
+	devm_free_irq(&pdev->dev, vf->config_irq, vf);
+	ifcvf_free_irq_vectors(pdev);
+}
+
+static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+	struct ifcvf_hw *vf = &adapter->vf;
+	int vector, i, ret, irq;
+
+	ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
+				    IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n");
+		return ret;
+	}
+
+	snprintf(vf->config_msix_name, 256, "ifcvf[%s]-config\n",
+		 pci_name(pdev));
+	vector = 0;
+	vf->config_irq = pci_irq_vector(pdev, vector);
+	ret = devm_request_irq(&pdev->dev, vf->config_irq,
+			       ifcvf_config_changed, 0,
+			       vf->config_msix_name, vf);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request config irq\n");
+		return ret;
+	}
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n",
+			 pci_name(pdev), i);
+		vector = i + IFCVF_MSI_QUEUE_OFF;
+		irq = pci_irq_vector(pdev, vector);
+		ret = devm_request_irq(&pdev->dev, irq,
+				       ifcvf_intr_handler, 0,
+				       vf->vring[i].msix_name,
+				       &vf->vring[i]);
+		if (ret) {
+			IFCVF_ERR(pdev,
+				  "Failed to request irq for vq %d\n", i);
+			ifcvf_free_irq(adapter, i);
+
+			return ret;
+		}
+
+		vf->vring[i].irq = irq;
+	}
+
+	return 0;
+}
+
+static int ifcvf_start_datapath(void *private)
+{
+	struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+	u8 status;
+	int ret;
+
+	vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2;
+	ret = ifcvf_start_hw(vf);
+	if (ret < 0) {
+		status = ifcvf_get_status(vf);
+		status |= VIRTIO_CONFIG_S_FAILED;
+		ifcvf_set_status(vf, status);
+	}
+
+	return ret;
+}
+
+static int ifcvf_stop_datapath(void *private)
+{
+	struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+	int i;
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+		vf->vring[i].cb.callback = NULL;
+
+	ifcvf_stop_hw(vf);
+
+	return 0;
+}
+
+static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
+{
+	struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter);
+	int i;
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+		vf->vring[i].last_avail_idx = 0;
+		vf->vring[i].desc = 0;
+		vf->vring[i].avail = 0;
+		vf->vring[i].used = 0;
+		vf->vring[i].ready = 0;
+		vf->vring[i].cb.callback = NULL;
+		vf->vring[i].cb.private = NULL;
+	}
+
+	ifcvf_reset(vf);
+}
+
+static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev)
+{
+	return container_of(vdpa_dev, struct ifcvf_adapter, vdpa);
+}
+
+static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev)
+{
+	struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
+
+	return &adapter->vf;
+}
+
+static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+	u64 features;
+
+	features = ifcvf_get_features(vf) & IFCVF_SUPPORTED_FEATURES;
+
+	return features;
+}
+
+static int ifcvf_vdpa_set_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->req_features = features;
+
+	return 0;
+}
+
+static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	return ifcvf_get_status(vf);
+}
+
+static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+	struct ifcvf_adapter *adapter;
+	struct ifcvf_hw *vf;
+	u8 status_old;
+	int ret;
+
+	vf  = vdpa_to_vf(vdpa_dev);
+	adapter = dev_get_drvdata(vdpa_dev->dev.parent);
+	status_old = ifcvf_get_status(vf);
+
+	if (status_old == status)
+		return;
+
+	if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		ifcvf_stop_datapath(adapter);
+		ifcvf_free_irq(adapter, IFCVF_MAX_QUEUE_PAIRS * 2);
+	}
+
+	if (status == 0) {
+		ifcvf_reset_vring(adapter);
+		return;
+	}
+
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		ret = ifcvf_request_irq(adapter);
+		if (ret) {
+			status = ifcvf_get_status(vf);
+			status |= VIRTIO_CONFIG_S_FAILED;
+			ifcvf_set_status(vf, status);
+			return;
+		}
+
+		if (ifcvf_start_datapath(adapter) < 0)
+			IFCVF_ERR(adapter->pdev,
+				  "Failed to set ifcvf vdpa  status %u\n",
+				  status);
+	}
+
+	ifcvf_set_status(vf, status);
+}
+
+static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+	return IFCVF_QUEUE_MAX;
+}
+
+static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   struct vdpa_vq_state *state)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	state->avail_index = ifcvf_get_vq_state(vf, qid);
+	return 0;
+}
+
+static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+				   const struct vdpa_vq_state *state)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	return ifcvf_set_vq_state(vf, qid, state->avail_index);
+}
+
+static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
+				 struct vdpa_callback *cb)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->vring[qid].cb = *cb;
+}
+
+static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev,
+				    u16 qid, bool ready)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->vring[qid].ready = ready;
+}
+
+static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	return vf->vring[qid].ready;
+}
+
+static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid,
+				  u32 num)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->vring[qid].size = num;
+}
+
+static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
+				     u64 desc_area, u64 driver_area,
+				     u64 device_area)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->vring[qid].desc = desc_area;
+	vf->vring[qid].avail = driver_area;
+	vf->vring[qid].used = device_area;
+
+	return 0;
+}
+
+static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	ifcvf_notify_queue(vf, qid);
+}
+
+static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	return ioread8(&vf->common_cfg->config_generation);
+}
+
+static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+	return VIRTIO_ID_NET;
+}
+
+static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+	return IFCVF_SUBSYS_VENDOR_ID;
+}
+
+static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+	return IFCVF_QUEUE_ALIGNMENT;
+}
+
+static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
+				  unsigned int offset,
+				  void *buf, unsigned int len)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	WARN_ON(offset + len > sizeof(struct virtio_net_config));
+	ifcvf_read_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
+				  unsigned int offset, const void *buf,
+				  unsigned int len)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	WARN_ON(offset + len > sizeof(struct virtio_net_config));
+	ifcvf_write_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
+				     struct vdpa_callback *cb)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	vf->config_cb.callback = cb->callback;
+	vf->config_cb.private = cb->private;
+}
+
+static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev,
+				 u16 qid)
+{
+	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+	return vf->vring[qid].irq;
+}
+
+/*
+ * IFCVF currently does't have on-chip IOMMU, so not
+ * implemented set_map()/dma_map()/dma_unmap()
+ */
+static const struct vdpa_config_ops ifc_vdpa_ops = {
+	.get_features	= ifcvf_vdpa_get_features,
+	.set_features	= ifcvf_vdpa_set_features,
+	.get_status	= ifcvf_vdpa_get_status,
+	.set_status	= ifcvf_vdpa_set_status,
+	.get_vq_num_max	= ifcvf_vdpa_get_vq_num_max,
+	.get_vq_state	= ifcvf_vdpa_get_vq_state,
+	.set_vq_state	= ifcvf_vdpa_set_vq_state,
+	.set_vq_cb	= ifcvf_vdpa_set_vq_cb,
+	.set_vq_ready	= ifcvf_vdpa_set_vq_ready,
+	.get_vq_ready	= ifcvf_vdpa_get_vq_ready,
+	.set_vq_num	= ifcvf_vdpa_set_vq_num,
+	.set_vq_address	= ifcvf_vdpa_set_vq_address,
+	.get_vq_irq	= ifcvf_vdpa_get_vq_irq,
+	.kick_vq	= ifcvf_vdpa_kick_vq,
+	.get_generation	= ifcvf_vdpa_get_generation,
+	.get_device_id	= ifcvf_vdpa_get_device_id,
+	.get_vendor_id	= ifcvf_vdpa_get_vendor_id,
+	.get_vq_align	= ifcvf_vdpa_get_vq_align,
+	.get_config	= ifcvf_vdpa_get_config,
+	.set_config	= ifcvf_vdpa_set_config,
+	.set_config_cb  = ifcvf_vdpa_set_config_cb,
+};
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct ifcvf_adapter *adapter;
+	struct ifcvf_hw *vf;
+	int ret, i;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to enable device\n");
+		return ret;
+	}
+
+	ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+				 IFCVF_DRIVER_NAME);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+		return ret;
+	}
+
+	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		IFCVF_ERR(pdev, "No usable DMA confiugration\n");
+		return ret;
+	}
+
+	ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		IFCVF_ERR(pdev,
+			  "No usable coherent DMA confiugration\n");
+		return ret;
+	}
+
+	ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+	if (ret) {
+		IFCVF_ERR(pdev,
+			  "Failed for adding devres for freeing irq vectors\n");
+		return ret;
+	}
+
+	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+				    dev, &ifc_vdpa_ops,
+				    IFCVF_MAX_QUEUE_PAIRS * 2);
+	if (adapter == NULL) {
+		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
+		return -ENOMEM;
+	}
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, adapter);
+
+	vf = &adapter->vf;
+	vf->base = pcim_iomap_table(pdev);
+
+	adapter->pdev = pdev;
+	adapter->vdpa.dma_dev = &pdev->dev;
+
+	ret = ifcvf_init_hw(vf, pdev);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
+		goto err;
+	}
+
+	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+		vf->vring[i].irq = -EINVAL;
+
+	ret = vdpa_register_device(&adapter->vdpa);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	put_device(&adapter->vdpa.dev);
+	return ret;
+}
+
+static void ifcvf_remove(struct pci_dev *pdev)
+{
+	struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+
+	vdpa_unregister_device(&adapter->vdpa);
+}
+
+static struct pci_device_id ifcvf_pci_ids[] = {
+	{ PCI_DEVICE_SUB(IFCVF_VENDOR_ID,
+		IFCVF_DEVICE_ID,
+		IFCVF_SUBSYS_VENDOR_ID,
+		IFCVF_SUBSYS_DEVICE_ID) },
+	{ 0 },
+};
+MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids);
+
+static struct pci_driver ifcvf_driver = {
+	.name     = IFCVF_DRIVER_NAME,
+	.id_table = ifcvf_pci_ids,
+	.probe    = ifcvf_probe,
+	.remove   = ifcvf_remove,
+};
+
+module_pci_driver(ifcvf_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(VERSION_STRING);
diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
new file mode 100644
index 000000000..89a5beded
--- /dev/null
+++ b/drivers/vdpa/mlx5/Makefile
@@ -0,0 +1,4 @@
+subdir-ccflags-y += -I$(srctree)/drivers/vdpa/mlx5/core
+
+obj-$(CONFIG_MLX5_VDPA_NET) += mlx5_vdpa.o
+mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/main.o net/mlx5_vnet.o core/resources.o core/mr.o
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
new file mode 100644
index 000000000..b6cc53ba9
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#ifndef __MLX5_VDPA_H__
+#define __MLX5_VDPA_H__
+
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/vdpa.h>
+#include <linux/mlx5/driver.h>
+
+#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+
+struct mlx5_vdpa_direct_mr {
+	u64 start;
+	u64 end;
+	u32 perm;
+	struct mlx5_core_mkey mr;
+	struct sg_table sg_head;
+	int log_size;
+	int nsg;
+	int nent;
+	struct list_head list;
+	u64 offset;
+};
+
+struct mlx5_vdpa_mr {
+	struct mlx5_core_mkey mkey;
+
+	/* list of direct MRs descendants of this indirect mr */
+	struct list_head head;
+	unsigned long num_directs;
+	unsigned long num_klms;
+	bool initialized;
+
+	/* serialize mkey creation and destruction */
+	struct mutex mkey_mtx;
+};
+
+struct mlx5_vdpa_resources {
+	u32 pdn;
+	struct mlx5_uars_page *uar;
+	void __iomem *kick_addr;
+	u16 uid;
+	u32 null_mkey;
+	bool valid;
+};
+
+struct mlx5_vdpa_dev {
+	struct vdpa_device vdev;
+	struct mlx5_core_dev *mdev;
+	struct mlx5_vdpa_resources res;
+
+	u64 mlx_features;
+	u64 actual_features;
+	u8 status;
+	u32 max_vqs;
+	u32 generation;
+
+	struct mlx5_vdpa_mr mr;
+};
+
+int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
+int mlx5_vdpa_dealloc_pd(struct mlx5_vdpa_dev *dev, u32 pdn, u16 uid);
+int mlx5_vdpa_get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey);
+int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
+void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn);
+int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn);
+void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn);
+int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn);
+void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn);
+int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn);
+void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn);
+int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev);
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey, u32 *in,
+			  int inlen);
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey);
+int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+			     bool *change_map);
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb);
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev);
+
+#define mlx5_vdpa_warn(__dev, format, ...)                                                         \
+	dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__,     \
+		 current->pid, ##__VA_ARGS__)
+
+#define mlx5_vdpa_info(__dev, format, ...)                                                         \
+	dev_info((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__,             \
+		 current->pid, ##__VA_ARGS__)
+
+#define mlx5_vdpa_dbg(__dev, format, ...)                                                          \
+	dev_debug((__dev)->mdev->device, "%s:%d:(pid %d): " format, __func__, __LINE__,            \
+		  current->pid, ##__VA_ARGS__)
+
+#endif /* __MLX5_VDPA_H__ */
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h b/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h
new file mode 100644
index 000000000..f6f57a29b
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#ifndef __MLX5_VDPA_IFC_H_
+#define __MLX5_VDPA_IFC_H_
+
+#include <linux/mlx5/mlx5_ifc.h>
+
+enum {
+	MLX5_VIRTIO_Q_EVENT_MODE_NO_MSIX_MODE  = 0x0,
+	MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE       = 0x1,
+	MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE     = 0x2,
+};
+
+enum {
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT   = 0x1, // do I check this caps?
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED  = 0x2,
+};
+
+enum {
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT   = 0,
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED  = 1,
+};
+
+struct mlx5_ifc_virtio_q_bits {
+	u8    virtio_q_type[0x8];
+	u8    reserved_at_8[0x5];
+	u8    event_mode[0x3];
+	u8    queue_index[0x10];
+
+	u8    full_emulation[0x1];
+	u8    virtio_version_1_0[0x1];
+	u8    reserved_at_22[0x2];
+	u8    offload_type[0x4];
+	u8    event_qpn_or_msix[0x18];
+
+	u8    doorbell_stride_index[0x10];
+	u8    queue_size[0x10];
+
+	u8    device_emulation_id[0x20];
+
+	u8    desc_addr[0x40];
+
+	u8    used_addr[0x40];
+
+	u8    available_addr[0x40];
+
+	u8    virtio_q_mkey[0x20];
+
+	u8    max_tunnel_desc[0x10];
+	u8    reserved_at_170[0x8];
+	u8    error_type[0x8];
+
+	u8    umem_1_id[0x20];
+
+	u8    umem_1_size[0x20];
+
+	u8    umem_1_offset[0x40];
+
+	u8    umem_2_id[0x20];
+
+	u8    umem_2_size[0x20];
+
+	u8    umem_2_offset[0x40];
+
+	u8    umem_3_id[0x20];
+
+	u8    umem_3_size[0x20];
+
+	u8    umem_3_offset[0x40];
+
+	u8    counter_set_id[0x20];
+
+	u8    reserved_at_320[0x8];
+	u8    pd[0x18];
+
+	u8    reserved_at_340[0xc0];
+};
+
+struct mlx5_ifc_virtio_net_q_object_bits {
+	u8    modify_field_select[0x40];
+
+	u8    reserved_at_40[0x20];
+
+	u8    vhca_id[0x10];
+	u8    reserved_at_70[0x10];
+
+	u8    queue_feature_bit_mask_12_3[0xa];
+	u8    dirty_bitmap_dump_enable[0x1];
+	u8    vhost_log_page[0x5];
+	u8    reserved_at_90[0xc];
+	u8    state[0x4];
+
+	u8    reserved_at_a0[0x5];
+	u8    queue_feature_bit_mask_2_0[0x3];
+	u8    tisn_or_qpn[0x18];
+
+	u8    dirty_bitmap_mkey[0x20];
+
+	u8    dirty_bitmap_size[0x20];
+
+	u8    dirty_bitmap_addr[0x40];
+
+	u8    hw_available_index[0x10];
+	u8    hw_used_index[0x10];
+
+	u8    reserved_at_160[0xa0];
+
+	struct mlx5_ifc_virtio_q_bits virtio_q_context;
+};
+
+struct mlx5_ifc_create_virtio_net_q_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+
+	struct mlx5_ifc_virtio_net_q_object_bits obj_context;
+};
+
+struct mlx5_ifc_create_virtio_net_q_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+};
+
+struct mlx5_ifc_destroy_virtio_net_q_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_out_cmd_hdr;
+};
+
+struct mlx5_ifc_destroy_virtio_net_q_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+};
+
+struct mlx5_ifc_query_virtio_net_q_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+};
+
+struct mlx5_ifc_query_virtio_net_q_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+
+	struct mlx5_ifc_virtio_net_q_object_bits obj_context;
+};
+
+enum {
+	MLX5_VIRTQ_MODIFY_MASK_STATE                    = (u64)1 << 0,
+	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
+	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+};
+
+enum {
+	MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT     = 0x0,
+	MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY      = 0x1,
+	MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND  = 0x2,
+	MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR      = 0x3,
+};
+
+enum {
+	MLX5_RQTC_LIST_Q_TYPE_RQ            = 0x0,
+	MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q  = 0x1,
+};
+
+struct mlx5_ifc_modify_virtio_net_q_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
+
+	struct mlx5_ifc_virtio_net_q_object_bits obj_context;
+};
+
+struct mlx5_ifc_modify_virtio_net_q_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+};
+
+#endif /* __MLX5_VDPA_IFC_H_ */
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
new file mode 100644
index 000000000..1f94ea46c
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/vdpa.h>
+#include <linux/gcd.h>
+#include <linux/string.h>
+#include <linux/mlx5/qp.h>
+#include "mlx5_vdpa.h"
+
+/* DIV_ROUND_UP where the divider is a power of 2 give by its log base 2 value */
+#define MLX5_DIV_ROUND_UP_POW2(_n, _s) \
+({ \
+	u64 __s = _s; \
+	u64 _res; \
+	_res = (((_n) + (1 << (__s)) - 1) >> (__s)); \
+	_res; \
+})
+
+static int get_octo_len(u64 len, int page_shift)
+{
+	u64 page_size = 1ULL << page_shift;
+	int npages;
+
+	npages = ALIGN(len, page_size) >> page_shift;
+	return (npages + 1) / 2;
+}
+
+static void mlx5_set_access_mode(void *mkc, int mode)
+{
+	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, mode >> 2);
+}
+
+static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt)
+{
+	struct scatterlist *sg;
+	int nsg = mr->nsg;
+	u64 dma_addr;
+	u64 dma_len;
+	int j = 0;
+	int i;
+
+	for_each_sg(mr->sg_head.sgl, sg, mr->nent, i) {
+		for (dma_addr = sg_dma_address(sg), dma_len = sg_dma_len(sg);
+		     nsg && dma_len;
+		     nsg--, dma_addr += BIT(mr->log_size), dma_len -= BIT(mr->log_size))
+			mtt[j++] = cpu_to_be64(dma_addr);
+	}
+}
+
+static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	int inlen;
+	void *mkc;
+	void *in;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, lw, !!(mr->perm & VHOST_MAP_WO));
+	MLX5_SET(mkc, mkc, lr, !!(mr->perm & VHOST_MAP_RO));
+	mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, mvdev->res.pdn);
+	MLX5_SET64(mkc, mkc, start_addr, mr->offset);
+	MLX5_SET64(mkc, mkc, len, mr->end - mr->start);
+	MLX5_SET(mkc, mkc, log_page_size, mr->log_size);
+	MLX5_SET(mkc, mkc, translations_octword_size,
+		 get_octo_len(mr->end - mr->start, mr->log_size));
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
+		 get_octo_len(mr->end - mr->start, mr->log_size));
+	populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
+	kvfree(in);
+	if (err) {
+		mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	mlx5_vdpa_destroy_mkey(mvdev, &mr->mr);
+}
+
+static u64 map_start(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	return max_t(u64, map->start, mr->start);
+}
+
+static u64 map_end(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	return min_t(u64, map->last + 1, mr->end);
+}
+
+static u64 maplen(struct vhost_iotlb_map *map, struct mlx5_vdpa_direct_mr *mr)
+{
+	return map_end(map, mr) - map_start(map, mr);
+}
+
+#define MLX5_VDPA_INVALID_START_ADDR ((u64)-1)
+#define MLX5_VDPA_INVALID_LEN ((u64)-1)
+
+static u64 indir_start_addr(struct mlx5_vdpa_mr *mkey)
+{
+	struct mlx5_vdpa_direct_mr *s;
+
+	s = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+	if (!s)
+		return MLX5_VDPA_INVALID_START_ADDR;
+
+	return s->start;
+}
+
+static u64 indir_len(struct mlx5_vdpa_mr *mkey)
+{
+	struct mlx5_vdpa_direct_mr *s;
+	struct mlx5_vdpa_direct_mr *e;
+
+	s = list_first_entry_or_null(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+	if (!s)
+		return MLX5_VDPA_INVALID_LEN;
+
+	e = list_last_entry(&mkey->head, struct mlx5_vdpa_direct_mr, list);
+
+	return e->end - s->start;
+}
+
+#define LOG_MAX_KLM_SIZE 30
+#define MAX_KLM_SIZE BIT(LOG_MAX_KLM_SIZE)
+
+static u32 klm_bcount(u64 size)
+{
+	return (u32)size;
+}
+
+static void fill_indir(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey, void *in)
+{
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_klm *klmarr;
+	struct mlx5_klm *klm;
+	bool first = true;
+	u64 preve;
+	int i;
+
+	klmarr = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+	i = 0;
+	list_for_each_entry(dmr, &mkey->head, list) {
+again:
+		klm = &klmarr[i++];
+		if (first) {
+			preve = dmr->start;
+			first = false;
+		}
+
+		if (preve == dmr->start) {
+			klm->key = cpu_to_be32(dmr->mr.key);
+			klm->bcount = cpu_to_be32(klm_bcount(dmr->end - dmr->start));
+			preve = dmr->end;
+		} else {
+			klm->key = cpu_to_be32(mvdev->res.null_mkey);
+			klm->bcount = cpu_to_be32(klm_bcount(dmr->start - preve));
+			preve = dmr->start;
+			goto again;
+		}
+	}
+}
+
+static int klm_byte_size(int nklms)
+{
+	return 16 * ALIGN(nklms, 4);
+}
+
+static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
+{
+	int inlen;
+	void *mkc;
+	void *in;
+	int err;
+	u64 start;
+	u64 len;
+
+	start = indir_start_addr(mr);
+	len = indir_len(mr);
+	if (start == MLX5_VDPA_INVALID_START_ADDR || len == MLX5_VDPA_INVALID_LEN)
+		return -EINVAL;
+
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + klm_byte_size(mr->num_klms);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+	mlx5_set_access_mode(mkc, MLX5_MKC_ACCESS_MODE_KLMS);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, mvdev->res.pdn);
+	MLX5_SET64(mkc, mkc, start_addr, start);
+	MLX5_SET64(mkc, mkc, len, len);
+	MLX5_SET(mkc, mkc, translations_octword_size, klm_byte_size(mr->num_klms) / 16);
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, mr->num_klms);
+	fill_indir(mvdev, mr, in);
+	err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen);
+	kfree(in);
+	return err;
+}
+
+static void destroy_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mkey)
+{
+	mlx5_vdpa_destroy_mkey(mvdev, &mkey->mkey);
+}
+
+static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr,
+			 struct vhost_iotlb *iotlb)
+{
+	struct vhost_iotlb_map *map;
+	unsigned long lgcd = 0;
+	int log_entity_size;
+	unsigned long size;
+	u64 start = 0;
+	int err;
+	struct page *pg;
+	unsigned int nsg;
+	int sglen;
+	u64 pa;
+	u64 paend;
+	struct scatterlist *sg;
+	struct device *dma = mvdev->mdev->device;
+
+	for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
+	     map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) {
+		size = maplen(map, mr);
+		lgcd = gcd(lgcd, size);
+		start += size;
+	}
+	log_entity_size = ilog2(lgcd);
+
+	sglen = 1 << log_entity_size;
+	nsg = MLX5_DIV_ROUND_UP_POW2(mr->end - mr->start, log_entity_size);
+
+	err = sg_alloc_table(&mr->sg_head, nsg, GFP_KERNEL);
+	if (err)
+		return err;
+
+	sg = mr->sg_head.sgl;
+	for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
+	     map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) {
+		paend = map->addr + maplen(map, mr);
+		for (pa = map->addr; pa < paend; pa += sglen) {
+			pg = pfn_to_page(__phys_to_pfn(pa));
+			if (!sg) {
+				mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n",
+					       map->start, map->last + 1);
+				err = -ENOMEM;
+				goto err_map;
+			}
+			sg_set_page(sg, pg, sglen, 0);
+			sg = sg_next(sg);
+			if (!sg)
+				goto done;
+		}
+	}
+done:
+	mr->log_size = log_entity_size;
+	mr->nsg = nsg;
+	mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+	if (!mr->nent) {
+		err = -ENOMEM;
+		goto err_map;
+	}
+
+	err = create_direct_mr(mvdev, mr);
+	if (err)
+		goto err_direct;
+
+	return 0;
+
+err_direct:
+	dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+err_map:
+	sg_free_table(&mr->sg_head);
+	return err;
+}
+
+static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)
+{
+	struct device *dma = mvdev->mdev->device;
+
+	destroy_direct_mr(mvdev, mr);
+	dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0);
+	sg_free_table(&mr->sg_head);
+}
+
+static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 perm,
+			    struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_vdpa_direct_mr *n;
+	LIST_HEAD(tmp);
+	u64 st;
+	u64 sz;
+	int err;
+	int i = 0;
+
+	st = start;
+	while (size) {
+		sz = (u32)min_t(u64, MAX_KLM_SIZE, size);
+		dmr = kzalloc(sizeof(*dmr), GFP_KERNEL);
+		if (!dmr) {
+			err = -ENOMEM;
+			goto err_alloc;
+		}
+
+		dmr->start = st;
+		dmr->end = st + sz;
+		dmr->perm = perm;
+		err = map_direct_mr(mvdev, dmr, iotlb);
+		if (err) {
+			kfree(dmr);
+			goto err_alloc;
+		}
+
+		list_add_tail(&dmr->list, &tmp);
+		size -= sz;
+		mr->num_directs++;
+		mr->num_klms++;
+		st += sz;
+		i++;
+	}
+	list_splice_tail(&tmp, &mr->head);
+	return 0;
+
+err_alloc:
+	list_for_each_entry_safe(dmr, n, &mr->head, list) {
+		list_del_init(&dmr->list);
+		unmap_direct_mr(mvdev, dmr);
+		kfree(dmr);
+	}
+	return err;
+}
+
+/* The iotlb pointer contains a list of maps. Go over the maps, possibly
+ * merging mergeable maps, and create direct memory keys that provide the
+ * device access to memory. The direct mkeys are then referred to by the
+ * indirect memory key that provides access to the enitre address space given
+ * by iotlb.
+ */
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_vdpa_direct_mr *n;
+	struct vhost_iotlb_map *map;
+	u32 pperm = U16_MAX;
+	u64 last = U64_MAX;
+	u64 ps = U64_MAX;
+	u64 pe = U64_MAX;
+	u64 start = 0;
+	int err = 0;
+	int nnuls;
+
+	if (mr->initialized)
+		return 0;
+
+	INIT_LIST_HEAD(&mr->head);
+	for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+	     map = vhost_iotlb_itree_next(map, start, last)) {
+		start = map->start;
+		if (pe == map->start && pperm == map->perm) {
+			pe = map->last + 1;
+		} else {
+			if (ps != U64_MAX) {
+				if (pe < map->start) {
+					/* We have a hole in the map. Check how
+					 * many null keys are required to fill it.
+					 */
+					nnuls = MLX5_DIV_ROUND_UP_POW2(map->start - pe,
+								       LOG_MAX_KLM_SIZE);
+					mr->num_klms += nnuls;
+				}
+				err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb);
+				if (err)
+					goto err_chain;
+			}
+			ps = map->start;
+			pe = map->last + 1;
+			pperm = map->perm;
+		}
+	}
+	err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb);
+	if (err)
+		goto err_chain;
+
+	/* Create the memory key that defines the guests's address space. This
+	 * memory key refers to the direct keys that contain the MTT
+	 * translations
+	 */
+	err = create_indirect_key(mvdev, mr);
+	if (err)
+		goto err_chain;
+
+	mr->initialized = true;
+	return 0;
+
+err_chain:
+	list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) {
+		list_del_init(&dmr->list);
+		unmap_direct_mr(mvdev, dmr);
+		kfree(dmr);
+	}
+	return err;
+}
+
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	int err;
+
+	mutex_lock(&mr->mkey_mtx);
+	err = _mlx5_vdpa_create_mr(mvdev, iotlb);
+	mutex_unlock(&mr->mkey_mtx);
+	return err;
+}
+
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	struct mlx5_vdpa_direct_mr *dmr;
+	struct mlx5_vdpa_direct_mr *n;
+
+	mutex_lock(&mr->mkey_mtx);
+	if (!mr->initialized)
+		goto out;
+
+	destroy_indirect_key(mvdev, mr);
+	list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) {
+		list_del_init(&dmr->list);
+		unmap_direct_mr(mvdev, dmr);
+		kfree(dmr);
+	}
+	mr->initialized = false;
+out:
+	mutex_unlock(&mr->mkey_mtx);
+}
+
+int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
+			     bool *change_map)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+	int err = 0;
+
+	*change_map = false;
+	mutex_lock(&mr->mkey_mtx);
+	if (mr->initialized) {
+		mlx5_vdpa_info(mvdev, "memory map update\n");
+		*change_map = true;
+	}
+	if (!*change_map)
+		err = _mlx5_vdpa_create_mr(mvdev, iotlb);
+	mutex_unlock(&mr->mkey_mtx);
+
+	return err;
+}
diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c
new file mode 100644
index 000000000..96e6421c5
--- /dev/null
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_vdpa.h"
+
+static int alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+
+	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
+	MLX5_SET(alloc_pd_in, in, uid, uid);
+
+	err = mlx5_cmd_exec_inout(mdev, alloc_pd, in, out);
+	if (!err)
+		*pdn = MLX5_GET(alloc_pd_out, out, pd);
+
+	return err;
+}
+
+static int dealloc_pd(struct mlx5_vdpa_dev *dev, u32 pdn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {};
+	struct mlx5_core_dev *mdev = dev->mdev;
+
+	MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
+	MLX5_SET(dealloc_pd_in, in, pd, pdn);
+	MLX5_SET(dealloc_pd_in, in, uid, uid);
+	return mlx5_cmd_exec_in(mdev, dealloc_pd, in);
+}
+
+static int get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey)
+{
+	u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {};
+	struct mlx5_core_dev *mdev = dev->mdev;
+	int err;
+
+	MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec_inout(mdev, query_special_contexts, in, out);
+	if (!err)
+		*null_mkey = MLX5_GET(query_special_contexts_out, out, null_mkey);
+	return err;
+}
+
+static int create_uctx(struct mlx5_vdpa_dev *mvdev, u16 *uid)
+{
+	u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {};
+	int inlen;
+	void *in;
+	int err;
+
+	/* 0 means not supported */
+	if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx))
+		return -EOPNOTSUPP;
+
+	inlen = MLX5_ST_SZ_BYTES(create_uctx_in);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX);
+	MLX5_SET(create_uctx_in, in, uctx.cap, MLX5_UCTX_CAP_RAW_TX);
+
+	err = mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out));
+	kfree(in);
+	if (!err)
+		*uid = MLX5_GET(create_uctx_out, out, uid);
+
+	return err;
+}
+
+static void destroy_uctx(struct mlx5_vdpa_dev *mvdev, u32 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {};
+
+	MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX);
+	MLX5_SET(destroy_uctx_in, in, uid, uid);
+
+	mlx5_cmd_exec(mvdev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {};
+	int err;
+
+	MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS);
+	MLX5_SET(create_tis_in, in, uid, mvdev->res.uid);
+	err = mlx5_cmd_exec_inout(mvdev->mdev, create_tis, in, out);
+	if (!err)
+		*tisn = MLX5_GET(create_tis_out, out, tisn);
+
+	return err;
+}
+
+void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {};
+
+	MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS);
+	MLX5_SET(destroy_tis_in, in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_tis_in, in, tisn, tisn);
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_tis, in);
+}
+
+int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {};
+	int err;
+
+	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
+	err = mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rqtn = MLX5_GET(create_rqt_out, out, rqtn);
+
+	return err;
+}
+
+void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {};
+
+	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
+	MLX5_SET(destroy_rqt_in, in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_rqt_in, in, rqtn, rqtn);
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_rqt, in);
+}
+
+int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {};
+	int err;
+
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+	err = mlx5_cmd_exec_inout(mvdev->mdev, create_tir, in, out);
+	if (!err)
+		*tirn = MLX5_GET(create_tir_out, out, tirn);
+
+	return err;
+}
+
+void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {};
+
+	MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR);
+	MLX5_SET(destroy_tir_in, in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_tir_in, in, tirn, tirn);
+	mlx5_cmd_exec_in(mvdev->mdev, destroy_tir, in);
+}
+
+int mlx5_vdpa_alloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 *tdn)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
+	MLX5_SET(alloc_transport_domain_in, in, uid, mvdev->res.uid);
+
+	err = mlx5_cmd_exec_inout(mvdev->mdev, alloc_transport_domain, in, out);
+	if (!err)
+		*tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain);
+
+	return err;
+}
+
+void mlx5_vdpa_dealloc_transport_domain(struct mlx5_vdpa_dev *mvdev, u32 tdn)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {};
+
+	MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
+	MLX5_SET(dealloc_transport_domain_in, in, uid, mvdev->res.uid);
+	MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn);
+	mlx5_cmd_exec_in(mvdev->mdev, dealloc_transport_domain, in);
+}
+
+int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey, u32 *in,
+			  int inlen)
+{
+	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {};
+	u32 mkey_index;
+	void *mkc;
+	int err;
+
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+
+	err = mlx5_cmd_exec(mvdev->mdev, in, inlen, lout, sizeof(lout));
+	if (err)
+		return err;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
+	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+	mkey->size = MLX5_GET64(mkc, mkc, len);
+	mkey->key |= mlx5_idx_to_mkey(mkey_index);
+	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	return 0;
+}
+
+int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *mkey)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {};
+
+	MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid);
+	MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in);
+}
+
+int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
+{
+	u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset);
+	struct mlx5_vdpa_resources *res = &mvdev->res;
+	struct mlx5_core_dev *mdev = mvdev->mdev;
+	u64 kick_addr;
+	int err;
+
+	if (res->valid) {
+		mlx5_vdpa_warn(mvdev, "resources already allocated\n");
+		return -EINVAL;
+	}
+	mutex_init(&mvdev->mr.mkey_mtx);
+	res->uar = mlx5_get_uars_page(mdev);
+	if (IS_ERR(res->uar)) {
+		err = PTR_ERR(res->uar);
+		goto err_uars;
+	}
+
+	err = create_uctx(mvdev, &res->uid);
+	if (err)
+		goto err_uctx;
+
+	err = alloc_pd(mvdev, &res->pdn, res->uid);
+	if (err)
+		goto err_pd;
+
+	err = get_null_mkey(mvdev, &res->null_mkey);
+	if (err)
+		goto err_key;
+
+	kick_addr = pci_resource_start(mdev->pdev, 0) + offset;
+	res->kick_addr = ioremap(kick_addr, PAGE_SIZE);
+	if (!res->kick_addr) {
+		err = -ENOMEM;
+		goto err_key;
+	}
+	res->valid = true;
+
+	return 0;
+
+err_key:
+	dealloc_pd(mvdev, res->pdn, res->uid);
+err_pd:
+	destroy_uctx(mvdev, res->uid);
+err_uctx:
+	mlx5_put_uars_page(mdev, res->uar);
+err_uars:
+	mutex_destroy(&mvdev->mr.mkey_mtx);
+	return err;
+}
+
+void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_vdpa_resources *res = &mvdev->res;
+
+	if (!res->valid)
+		return;
+
+	iounmap(res->kick_addr);
+	res->kick_addr = NULL;
+	dealloc_pd(mvdev, res->pdn, res->uid);
+	destroy_uctx(mvdev, res->uid);
+	mlx5_put_uars_page(mvdev->mdev, res->uar);
+	mutex_destroy(&mvdev->mr.mkey_mtx);
+	res->valid = false;
+}
diff --git a/drivers/vdpa/mlx5/net/main.c b/drivers/vdpa/mlx5/net/main.c
new file mode 100644
index 000000000..838cd9838
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/main.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+#include "mlx5_vdpa_ifc.h"
+#include "mlx5_vnet.h"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox VDPA driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static bool required_caps_supported(struct mlx5_core_dev *mdev)
+{
+	u8 event_mode;
+	u64 got;
+
+	got = MLX5_CAP_GEN_64(mdev, general_obj_types);
+
+	if (!(got & MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q))
+		return false;
+
+	event_mode = MLX5_CAP_DEV_VDPA_EMULATION(mdev, event_mode);
+	if (!(event_mode & MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE))
+		return false;
+
+	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, eth_frame_offload_type))
+		return false;
+
+	return true;
+}
+
+static void *mlx5_vdpa_add(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_vdpa_dev *vdev;
+
+	if (mlx5_core_is_pf(mdev))
+		return NULL;
+
+	if (!required_caps_supported(mdev)) {
+		dev_info(mdev->device, "virtio net emulation not supported\n");
+		return NULL;
+	}
+	vdev = mlx5_vdpa_add_dev(mdev);
+	if (IS_ERR(vdev))
+		return NULL;
+
+	return vdev;
+}
+
+static void mlx5_vdpa_remove(struct mlx5_core_dev *mdev, void *context)
+{
+	struct mlx5_vdpa_dev *vdev = context;
+
+	mlx5_vdpa_remove_dev(vdev);
+}
+
+static struct mlx5_interface mlx5_vdpa_interface = {
+	.add = mlx5_vdpa_add,
+	.remove = mlx5_vdpa_remove,
+	.protocol = MLX5_INTERFACE_PROTOCOL_VDPA,
+};
+
+static int __init mlx5_vdpa_init(void)
+{
+	return mlx5_register_interface(&mlx5_vdpa_interface);
+}
+
+static void __exit mlx5_vdpa_exit(void)
+{
+	mlx5_unregister_interface(&mlx5_vdpa_interface);
+}
+
+module_init(mlx5_vdpa_init);
+module_exit(mlx5_vdpa_exit);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
new file mode 100644
index 000000000..577ff786f
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -0,0 +1,2053 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/mpfs.h>
+#include "mlx5_vnet.h"
+#include "mlx5_vdpa_ifc.h"
+#include "mlx5_vdpa.h"
+
+#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
+
+#define VALID_FEATURES_MASK                                                                        \
+	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
+	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
+	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
+	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
+	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
+	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
+	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
+	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
+	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
+	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
+	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
+	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
+	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
+
+#define VALID_STATUS_MASK                                                                          \
+	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
+	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
+
+struct mlx5_vdpa_net_resources {
+	u32 tisn;
+	u32 tdn;
+	u32 tirn;
+	u32 rqtn;
+	bool valid;
+};
+
+struct mlx5_vdpa_cq_buf {
+	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf frag_buf;
+	int cqe_size;
+	int nent;
+};
+
+struct mlx5_vdpa_cq {
+	struct mlx5_core_cq mcq;
+	struct mlx5_vdpa_cq_buf buf;
+	struct mlx5_db db;
+	int cqe;
+};
+
+struct mlx5_vdpa_umem {
+	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf frag_buf;
+	int size;
+	u32 id;
+};
+
+struct mlx5_vdpa_qp {
+	struct mlx5_core_qp mqp;
+	struct mlx5_frag_buf frag_buf;
+	struct mlx5_db db;
+	u16 head;
+	bool fw;
+};
+
+struct mlx5_vq_restore_info {
+	u32 num_ent;
+	u64 desc_addr;
+	u64 device_addr;
+	u64 driver_addr;
+	u16 avail_index;
+	u16 used_index;
+	bool ready;
+	struct vdpa_callback cb;
+	bool restore;
+};
+
+struct mlx5_vdpa_virtqueue {
+	bool ready;
+	u64 desc_addr;
+	u64 device_addr;
+	u64 driver_addr;
+	u32 num_ent;
+	struct vdpa_callback event_cb;
+
+	/* Resources for implementing the notification channel from the device
+	 * to the driver. fwqp is the firmware end of an RC connection; the
+	 * other end is vqqp used by the driver. cq is is where completions are
+	 * reported.
+	 */
+	struct mlx5_vdpa_cq cq;
+	struct mlx5_vdpa_qp fwqp;
+	struct mlx5_vdpa_qp vqqp;
+
+	/* umem resources are required for the virtqueue operation. They're use
+	 * is internal and they must be provided by the driver.
+	 */
+	struct mlx5_vdpa_umem umem1;
+	struct mlx5_vdpa_umem umem2;
+	struct mlx5_vdpa_umem umem3;
+
+	bool initialized;
+	int index;
+	u32 virtq_id;
+	struct mlx5_vdpa_net *ndev;
+	u16 avail_idx;
+	u16 used_idx;
+	int fw_state;
+
+	/* keep last in the struct */
+	struct mlx5_vq_restore_info ri;
+};
+
+/* We will remove this limitation once mlx5_vdpa_alloc_resources()
+ * provides for driver space allocation
+ */
+#define MLX5_MAX_SUPPORTED_VQS 16
+
+struct mlx5_vdpa_net {
+	struct mlx5_vdpa_dev mvdev;
+	struct mlx5_vdpa_net_resources res;
+	struct virtio_net_config config;
+	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
+
+	/* Serialize vq resources creation and destruction. This is required
+	 * since memory map might change and we need to destroy and create
+	 * resources while driver in operational.
+	 */
+	struct mutex reslock;
+	struct mlx5_flow_table *rxft;
+	struct mlx5_fc *rx_counter;
+	struct mlx5_flow_handle *rx_rule;
+	bool setup;
+	u16 mtu;
+};
+
+static void free_resources(struct mlx5_vdpa_net *ndev);
+static void init_mvqs(struct mlx5_vdpa_net *ndev);
+static int setup_driver(struct mlx5_vdpa_net *ndev);
+static void teardown_driver(struct mlx5_vdpa_net *ndev);
+
+static bool mlx5_vdpa_debug;
+
+#define MLX5_LOG_VIO_FLAG(_feature)                                                                \
+	do {                                                                                       \
+		if (features & BIT_ULL(_feature))                                                  \
+			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
+	} while (0)
+
+#define MLX5_LOG_VIO_STAT(_status)                                                                 \
+	do {                                                                                       \
+		if (status & (_status))                                                            \
+			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
+	} while (0)
+
+static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
+{
+	if (status & ~VALID_STATUS_MASK)
+		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
+			       status & ~VALID_STATUS_MASK);
+
+	if (!mlx5_vdpa_debug)
+		return;
+
+	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
+	if (set && !status) {
+		mlx5_vdpa_info(mvdev, "driver resets the device\n");
+		return;
+	}
+
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
+	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
+}
+
+static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
+{
+	if (features & ~VALID_FEATURES_MASK)
+		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
+			       features & ~VALID_FEATURES_MASK);
+
+	if (!mlx5_vdpa_debug)
+		return;
+
+	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
+	if (!features)
+		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
+
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
+	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
+	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
+}
+
+static int create_tis(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+	void *tisc;
+	int err;
+
+	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
+	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
+	if (err)
+		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
+
+	return err;
+}
+
+static void destroy_tis(struct mlx5_vdpa_net *ndev)
+{
+	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
+}
+
+#define MLX5_VDPA_CQE_SIZE 64
+#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
+
+static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
+{
+	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
+	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
+	int err;
+
+	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
+				       ndev->mvdev.mdev->priv.numa_node);
+	if (err)
+		return err;
+
+	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+
+	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
+	buf->nent = nent;
+
+	return 0;
+}
+
+static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
+{
+	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
+
+	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
+					ndev->mvdev.mdev->priv.numa_node);
+}
+
+static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
+{
+	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
+}
+
+static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
+{
+	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
+}
+
+static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
+{
+	struct mlx5_cqe64 *cqe64;
+	void *cqe;
+	int i;
+
+	for (i = 0; i < buf->nent; i++) {
+		cqe = get_cqe(vcq, i);
+		cqe64 = cqe;
+		cqe64->op_own = MLX5_CQE_INVALID << 4;
+	}
+}
+
+static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
+{
+	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
+
+	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
+	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
+		return cqe64;
+
+	return NULL;
+}
+
+static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
+{
+	vqp->head += n;
+	vqp->db.db[0] = cpu_to_be32(vqp->head);
+}
+
+static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
+		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
+{
+	struct mlx5_vdpa_qp *vqp;
+	__be64 *pas;
+	void *qpc;
+
+	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
+	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	if (vqp->fw) {
+		/* Firmware QP is allocated by the driver for the firmware's
+		 * use so we can skip part of the params as they will be chosen by firmware
+		 */
+		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
+		MLX5_SET(qpc, qpc, no_sq, 1);
+		return;
+	}
+
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
+	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
+	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(qpc, qpc, no_sq, 1);
+	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
+	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
+	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
+	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
+	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
+}
+
+static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
+{
+	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
+					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
+					ndev->mvdev.mdev->priv.numa_node);
+}
+
+static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
+{
+	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
+}
+
+static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
+		     struct mlx5_vdpa_qp *vqp)
+{
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+	void *qpc;
+	void *in;
+	int err;
+
+	if (!vqp->fw) {
+		vqp = &mvq->vqqp;
+		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
+		if (err)
+			return err;
+
+		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
+		if (err)
+			goto err_db;
+		inlen += vqp->frag_buf.npages * sizeof(__be64);
+	}
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_kzalloc;
+	}
+
+	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
+	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+	if (!vqp->fw)
+		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
+	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+	kfree(in);
+	if (err)
+		goto err_kzalloc;
+
+	vqp->mqp.uid = ndev->mvdev.res.uid;
+	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
+
+	if (!vqp->fw)
+		rx_post(vqp, mvq->num_ent);
+
+	return 0;
+
+err_kzalloc:
+	if (!vqp->fw)
+		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
+err_db:
+	if (!vqp->fw)
+		rq_buf_free(ndev, vqp);
+
+	return err;
+}
+
+static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
+	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
+	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
+		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
+	if (!vqp->fw) {
+		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
+		rq_buf_free(ndev, vqp);
+	}
+}
+
+static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
+{
+	struct mlx5_cqe64 *cqe64;
+
+	cqe64 = next_cqe_sw(vcq);
+	if (!cqe64)
+		return -EAGAIN;
+
+	vcq->mcq.cons_index++;
+	return 0;
+}
+
+static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+	mlx5_cq_set_ci(&mvq->cq.mcq);
+
+	/* make sure CQ cosumer update is visible to the hardware before updating
+	 * RX doorbell record.
+	 */
+	dma_wmb();
+	rx_post(&mvq->vqqp, num);
+	if (mvq->event_cb.callback)
+		mvq->event_cb.callback(mvq->event_cb.private);
+}
+
+static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
+{
+	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
+	struct mlx5_vdpa_net *ndev = mvq->ndev;
+	void __iomem *uar_page = ndev->mvdev.res.uar->map;
+	int num = 0;
+
+	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
+		num++;
+		if (num > mvq->num_ent / 2) {
+			/* If completions keep coming while we poll, we want to
+			 * let the hardware know that we consumed them by
+			 * updating the doorbell record.  We also let vdpa core
+			 * know about this so it passes it on the virtio driver
+			 * on the guest.
+			 */
+			mlx5_vdpa_handle_completions(mvq, num);
+			num = 0;
+		}
+	}
+
+	if (num)
+		mlx5_vdpa_handle_completions(mvq, num);
+
+	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
+}
+
+static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
+{
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+	void __iomem *uar_page = ndev->mvdev.res.uar->map;
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+	struct mlx5_vdpa_cq *vcq = &mvq->cq;
+	__be64 *pas;
+	int inlen;
+	void *cqc;
+	void *in;
+	int err;
+	int eqn;
+
+	err = mlx5_db_alloc(mdev, &vcq->db);
+	if (err)
+		return err;
+
+	vcq->mcq.set_ci_db = vcq->db.db;
+	vcq->mcq.arm_db = vcq->db.db + 1;
+	vcq->mcq.cqe_sz = 64;
+
+	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
+	if (err)
+		goto err_db;
+
+	cq_frag_buf_init(vcq, &vcq->buf);
+
+	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_vzalloc;
+	}
+
+	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
+	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
+	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+
+	/* Use vector 0 by default. Consider adding code to choose least used
+	 * vector.
+	 */
+	err = mlx5_vector2eqn(mdev, 0, &eqn);
+	if (err)
+		goto err_vec;
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
+	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
+	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
+
+	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
+	if (err)
+		goto err_vec;
+
+	vcq->mcq.comp = mlx5_vdpa_cq_comp;
+	vcq->cqe = num_ent;
+	vcq->mcq.set_ci_db = vcq->db.db;
+	vcq->mcq.arm_db = vcq->db.db + 1;
+	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
+	kfree(in);
+	return 0;
+
+err_vec:
+	kfree(in);
+err_vzalloc:
+	cq_frag_buf_free(ndev, &vcq->buf);
+err_db:
+	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
+	return err;
+}
+
+static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
+{
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+	struct mlx5_vdpa_cq *vcq = &mvq->cq;
+
+	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
+		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
+		return;
+	}
+	cq_frag_buf_free(ndev, &vcq->buf);
+	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
+}
+
+static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
+			  struct mlx5_vdpa_umem **umemp)
+{
+	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
+	int p_a;
+	int p_b;
+
+	switch (num) {
+	case 1:
+		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
+		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
+		*umemp = &mvq->umem1;
+		break;
+	case 2:
+		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
+		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
+		*umemp = &mvq->umem2;
+		break;
+	case 3:
+		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
+		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
+		*umemp = &mvq->umem3;
+		break;
+	}
+	(*umemp)->size = p_a * mvq->num_ent + p_b;
+}
+
+static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
+{
+	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
+}
+
+static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+	int inlen;
+	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
+	void *um;
+	void *in;
+	int err;
+	__be64 *pas;
+	struct mlx5_vdpa_umem *umem;
+
+	set_umem_size(ndev, mvq, num, &umem);
+	err = umem_frag_buf_alloc(ndev, umem, umem->size);
+	if (err)
+		return err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_in;
+	}
+
+	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
+	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
+	um = MLX5_ADDR_OF(create_umem_in, in, umem);
+	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
+
+	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
+	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
+		goto err_cmd;
+	}
+
+	kfree(in);
+	umem->id = MLX5_GET(create_umem_out, out, umem_id);
+
+	return 0;
+
+err_cmd:
+	kfree(in);
+err_in:
+	umem_frag_buf_free(ndev, umem);
+	return err;
+}
+
+static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
+	struct mlx5_vdpa_umem *umem;
+
+	switch (num) {
+	case 1:
+		umem = &mvq->umem1;
+		break;
+	case 2:
+		umem = &mvq->umem2;
+		break;
+	case 3:
+		umem = &mvq->umem3;
+		break;
+	}
+
+	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
+	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
+	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
+		return;
+
+	umem_frag_buf_free(ndev, umem);
+}
+
+static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int num;
+	int err;
+
+	for (num = 1; num <= 3; num++) {
+		err = create_umem(ndev, mvq, num);
+		if (err)
+			goto err_umem;
+	}
+	return 0;
+
+err_umem:
+	for (num--; num > 0; num--)
+		umem_destroy(ndev, mvq, num);
+
+	return err;
+}
+
+static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int num;
+
+	for (num = 3; num > 0; num--)
+		umem_destroy(ndev, mvq, num);
+}
+
+static int get_queue_type(struct mlx5_vdpa_net *ndev)
+{
+	u32 type_mask;
+
+	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
+
+	/* prefer split queue */
+	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)
+		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
+
+	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT));
+
+	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
+}
+
+static bool vq_is_tx(u16 idx)
+{
+	return idx % 2;
+}
+
+static u16 get_features_12_3(u64 features)
+{
+	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
+	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
+	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
+	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
+}
+
+static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
+	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
+	void *obj_context;
+	void *cmd_hdr;
+	void *vq_ctx;
+	void *in;
+	int err;
+
+	err = umems_create(ndev, mvq);
+	if (err)
+		return err;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_alloc;
+	}
+
+	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
+
+	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
+		 get_features_12_3(ndev->mvdev.actual_features));
+	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
+	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
+
+	if (vq_is_tx(mvq->index))
+		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
+
+	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
+	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
+	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
+	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
+	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
+		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
+	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
+	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
+	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
+	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
+	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
+	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
+	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
+	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
+	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
+	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
+	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+	if (err)
+		goto err_cmd;
+
+	kfree(in);
+	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+	return 0;
+
+err_cmd:
+	kfree(in);
+err_alloc:
+	umems_destroy(ndev, mvq);
+	return err;
+}
+
+static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
+
+	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
+		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
+	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
+	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
+		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
+		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
+		return;
+	}
+	umems_destroy(ndev, mvq);
+}
+
+static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
+{
+	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
+}
+
+static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
+{
+	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
+}
+
+static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
+			int *outlen, u32 qpn, u32 rqpn)
+{
+	void *qpc;
+	void *pp;
+
+	switch (cmd) {
+	case MLX5_CMD_OP_2RST_QP:
+		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
+		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(*outlen, GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
+		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
+		break;
+	case MLX5_CMD_OP_RST2INIT_QP:
+		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
+		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
+		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
+		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
+		MLX5_SET(qpc, qpc, rwe, 1);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, vhca_port_num, 1);
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
+		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
+		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
+		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
+		MLX5_SET(qpc, qpc, log_msg_max, 30);
+		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, fl, 1);
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
+		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
+		*in = kzalloc(*inlen, GFP_KERNEL);
+		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
+		if (!*in || !*out)
+			goto outerr;
+
+		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
+		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
+		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
+		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
+		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+		MLX5_SET(ads, pp, ack_timeout, 14);
+		MLX5_SET(qpc, qpc, retry_count, 7);
+		MLX5_SET(qpc, qpc, rnr_retry, 7);
+		break;
+	default:
+		goto outerr_nullify;
+	}
+
+	return;
+
+outerr:
+	kfree(*in);
+	kfree(*out);
+outerr_nullify:
+	*in = NULL;
+	*out = NULL;
+}
+
+static void free_inout(void *in, void *out)
+{
+	kfree(in);
+	kfree(out);
+}
+
+/* Two QPs are used by each virtqueue. One is used by the driver and one by
+ * firmware. The fw argument indicates whether the subjected QP is the one used
+ * by firmware.
+ */
+static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
+{
+	int outlen;
+	int inlen;
+	void *out;
+	void *in;
+	int err;
+
+	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
+	if (!in || !out)
+		return -ENOMEM;
+
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
+	free_inout(in, out);
+	return err;
+}
+
+static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	int err;
+
+	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
+	if (err)
+		return err;
+
+	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
+	if (err)
+		return err;
+
+	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
+	if (err)
+		return err;
+
+	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
+	if (err)
+		return err;
+
+	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
+	if (err)
+		return err;
+
+	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
+	if (err)
+		return err;
+
+	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
+}
+
+struct mlx5_virtq_attr {
+	u8 state;
+	u16 available_index;
+	u16 used_index;
+};
+
+static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
+			   struct mlx5_virtq_attr *attr)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
+	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
+	void *out;
+	void *obj_context;
+	void *cmd_hdr;
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
+	if (err)
+		goto err_cmd;
+
+	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
+	memset(attr, 0, sizeof(*attr));
+	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
+	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
+	kfree(out);
+	return 0;
+
+err_cmd:
+	kfree(out);
+	return err;
+}
+
+static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
+	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
+	void *obj_context;
+	void *cmd_hdr;
+	void *in;
+	int err;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
+
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
+
+	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
+		   MLX5_VIRTQ_MODIFY_MASK_STATE);
+	MLX5_SET(virtio_net_q_object, obj_context, state, state);
+	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
+	kfree(in);
+	if (!err)
+		mvq->fw_state = state;
+
+	return err;
+}
+
+static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	u16 idx = mvq->index;
+	int err;
+
+	if (!mvq->num_ent)
+		return 0;
+
+	if (mvq->initialized) {
+		mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n");
+		return -EINVAL;
+	}
+
+	err = cq_create(ndev, idx, mvq->num_ent);
+	if (err)
+		return err;
+
+	err = qp_create(ndev, mvq, &mvq->fwqp);
+	if (err)
+		goto err_fwqp;
+
+	err = qp_create(ndev, mvq, &mvq->vqqp);
+	if (err)
+		goto err_vqqp;
+
+	err = connect_qps(ndev, mvq);
+	if (err)
+		goto err_connect;
+
+	err = create_virtqueue(ndev, mvq);
+	if (err)
+		goto err_connect;
+
+	if (mvq->ready) {
+		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+		if (err) {
+			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
+				       idx, err);
+			goto err_connect;
+		}
+	}
+
+	mvq->initialized = true;
+	return 0;
+
+err_connect:
+	qp_destroy(ndev, &mvq->vqqp);
+err_vqqp:
+	qp_destroy(ndev, &mvq->fwqp);
+err_fwqp:
+	cq_destroy(ndev, idx);
+	return err;
+}
+
+static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_virtq_attr attr;
+
+	if (!mvq->initialized)
+		return;
+
+	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
+		return;
+
+	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
+		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
+
+	if (query_virtqueue(ndev, mvq, &attr)) {
+		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
+		return;
+	}
+	mvq->avail_idx = attr.available_index;
+	mvq->used_idx = attr.used_index;
+}
+
+static void suspend_vqs(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
+		suspend_vq(ndev, &ndev->vqs[i]);
+}
+
+static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	if (!mvq->initialized)
+		return;
+
+	suspend_vq(ndev, mvq);
+	destroy_virtqueue(ndev, mvq);
+	qp_destroy(ndev, &mvq->vqqp);
+	qp_destroy(ndev, &mvq->fwqp);
+	cq_destroy(ndev, mvq->index);
+	mvq->initialized = false;
+}
+
+static int create_rqt(struct mlx5_vdpa_net *ndev)
+{
+	int log_max_rqt;
+	__be32 *list;
+	void *rqtc;
+	int inlen;
+	void *in;
+	int i, j;
+	int err;
+
+	log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
+	if (log_max_rqt < 1)
+		return -EOPNOTSUPP;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
+	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
+	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
+	for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
+		if (!ndev->vqs[j].initialized)
+			continue;
+
+		if (!vq_is_tx(ndev->vqs[j].index)) {
+			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
+			i++;
+		}
+	}
+
+	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
+	kfree(in);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void destroy_rqt(struct mlx5_vdpa_net *ndev)
+{
+	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
+}
+
+static int create_tir(struct mlx5_vdpa_net *ndev)
+{
+#define HASH_IP_L4PORTS                                                                            \
+	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
+	 MLX5_HASH_FIELD_SEL_L4_DPORT)
+	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
+						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
+						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
+						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
+						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
+	void *rss_key;
+	void *outer;
+	void *tirc;
+	void *in;
+	int err;
+
+	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
+	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+
+	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
+	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
+	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
+
+	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
+	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
+	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
+
+	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
+	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
+
+	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
+	kfree(in);
+	return err;
+}
+
+static void destroy_tir(struct mlx5_vdpa_net *ndev)
+{
+	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
+}
+
+static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_flow_destination dest[2] = {};
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_namespace *ns;
+	int err;
+
+	/* for now, one entry, match all, forward to tir */
+	ft_attr.max_fte = 1;
+	ft_attr.autogroup.max_num_groups = 1;
+
+	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
+	if (!ns) {
+		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
+		return -EOPNOTSUPP;
+	}
+
+	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+	if (IS_ERR(ndev->rxft))
+		return PTR_ERR(ndev->rxft);
+
+	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
+	if (IS_ERR(ndev->rx_counter)) {
+		err = PTR_ERR(ndev->rx_counter);
+		goto err_fc;
+	}
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	dest[0].tir_num = ndev->res.tirn;
+	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
+	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
+	if (IS_ERR(ndev->rx_rule)) {
+		err = PTR_ERR(ndev->rx_rule);
+		ndev->rx_rule = NULL;
+		goto err_rule;
+	}
+
+	return 0;
+
+err_rule:
+	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
+err_fc:
+	mlx5_destroy_flow_table(ndev->rxft);
+	return err;
+}
+
+static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
+{
+	if (!ndev->rx_rule)
+		return;
+
+	mlx5_del_flow_rules(ndev->rx_rule);
+	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
+	mlx5_destroy_flow_table(ndev->rxft);
+
+	ndev->rx_rule = NULL;
+}
+
+static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+	if (unlikely(!mvq->ready))
+		return;
+
+	iowrite16(idx, ndev->mvdev.res.kick_addr);
+}
+
+static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
+				    u64 driver_area, u64 device_area)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+	mvq->desc_addr = desc_area;
+	mvq->device_addr = device_area;
+	mvq->driver_addr = driver_area;
+	return 0;
+}
+
+static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+
+	mvq = &ndev->vqs[idx];
+	mvq->num_ent = num;
+}
+
+static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
+
+	vq->event_cb = *cb;
+}
+
+static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+	if (!ready)
+		suspend_vq(ndev, mvq);
+
+	mvq->ready = ready;
+}
+
+static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+	return mvq->ready;
+}
+
+static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
+				  const struct vdpa_vq_state *state)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+
+	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
+		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
+		return -EINVAL;
+	}
+
+	mvq->used_idx = state->avail_index;
+	mvq->avail_idx = state->avail_index;
+	return 0;
+}
+
+static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+	struct mlx5_virtq_attr attr;
+	int err;
+
+	/* If the virtq object was destroyed, use the value saved at
+	 * the last minute of suspend_vq. This caters for userspace
+	 * that cares about emulating the index after vq is stopped.
+	 */
+	if (!mvq->initialized) {
+		/* Firmware returns a wrong value for the available index.
+		 * Since both values should be identical, we take the value of
+		 * used_idx which is reported correctly.
+		 */
+		state->avail_index = mvq->used_idx;
+		return 0;
+	}
+
+	err = query_virtqueue(ndev, mvq, &attr);
+	if (err) {
+		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
+		return err;
+	}
+	state->avail_index = attr.used_index;
+	return 0;
+}
+
+static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
+{
+	return PAGE_SIZE;
+}
+
+enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
+	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
+	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
+	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
+};
+
+static u64 mlx_to_vritio_features(u16 dev_features)
+{
+	u64 result = 0;
+
+	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
+		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
+	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
+		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
+	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
+		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
+	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
+		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
+
+	return result;
+}
+
+static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	u16 dev_features;
+
+	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
+	ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
+	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
+		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
+	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+	print_features(mvdev, ndev->mvdev.mlx_features, false);
+	return ndev->mvdev.mlx_features;
+}
+
+static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
+{
+	/* Minimum features to expect */
+	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
+		return -EOPNOTSUPP;
+
+	/* Double check features combination sent down by the driver.
+	 * Fail invalid features due to absence of the depended feature.
+	 *
+	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
+	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
+	 * By failing the invalid features sent down by untrusted drivers,
+	 * we're assured the assumption made upon is_index_valid() and
+	 * is_ctrl_vq_idx() will not be compromised.
+	 */
+	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
+            BIT_ULL(VIRTIO_NET_F_MQ))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
+		err = setup_vq(ndev, &ndev->vqs[i]);
+		if (err)
+			goto err_vq;
+	}
+
+	return 0;
+
+err_vq:
+	for (--i; i >= 0; i--)
+		teardown_vq(ndev, &ndev->vqs[i]);
+
+	return err;
+}
+
+static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		mvq = &ndev->vqs[i];
+		if (!mvq->initialized)
+			continue;
+
+		teardown_vq(ndev, mvq);
+	}
+}
+
+/* TODO: cross-endian support */
+static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
+{
+	return virtio_legacy_is_little_endian() ||
+		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
+}
+
+static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+{
+	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
+static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int err;
+
+	print_features(mvdev, features, true);
+
+	err = verify_driver_features(mvdev, features);
+	if (err)
+		return err;
+
+	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
+	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
+	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+	return err;
+}
+
+static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
+{
+	/* not implemented */
+	mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n");
+}
+
+#define MLX5_VDPA_MAX_VQ_ENTRIES 256
+static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
+{
+	return MLX5_VDPA_MAX_VQ_ENTRIES;
+}
+
+static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
+{
+	return VIRTIO_ID_NET;
+}
+
+static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
+{
+	return PCI_VENDOR_ID_MELLANOX;
+}
+
+static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	print_status(mvdev, ndev->mvdev.status, false);
+	return ndev->mvdev.status;
+}
+
+static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
+{
+	struct mlx5_vq_restore_info *ri = &mvq->ri;
+	struct mlx5_virtq_attr attr;
+	int err;
+
+	if (!mvq->initialized)
+		return 0;
+
+	err = query_virtqueue(ndev, mvq, &attr);
+	if (err)
+		return err;
+
+	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
+	ri->ready = mvq->ready;
+	ri->num_ent = mvq->num_ent;
+	ri->desc_addr = mvq->desc_addr;
+	ri->device_addr = mvq->device_addr;
+	ri->driver_addr = mvq->driver_addr;
+	ri->cb = mvq->event_cb;
+	ri->restore = true;
+	return 0;
+}
+
+static int save_channels_info(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
+		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
+		save_channel_info(ndev, &ndev->vqs[i]);
+	}
+	return 0;
+}
+
+static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = 0; i < ndev->mvdev.max_vqs; i++)
+		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
+}
+
+static void restore_channels_info(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_virtqueue *mvq;
+	struct mlx5_vq_restore_info *ri;
+	int i;
+
+	mlx5_clear_vqs(ndev);
+	init_mvqs(ndev);
+	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
+		mvq = &ndev->vqs[i];
+		ri = &mvq->ri;
+		if (!ri->restore)
+			continue;
+
+		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
+		mvq->ready = ri->ready;
+		mvq->num_ent = ri->num_ent;
+		mvq->desc_addr = ri->desc_addr;
+		mvq->device_addr = ri->device_addr;
+		mvq->driver_addr = ri->driver_addr;
+		mvq->event_cb = ri->cb;
+	}
+}
+
+static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
+{
+	int err;
+
+	suspend_vqs(ndev);
+	err = save_channels_info(ndev);
+	if (err)
+		goto err_mr;
+
+	teardown_driver(ndev);
+	mlx5_vdpa_destroy_mr(&ndev->mvdev);
+	err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
+	if (err)
+		goto err_mr;
+
+	if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+		return 0;
+
+	restore_channels_info(ndev);
+	err = setup_driver(ndev);
+	if (err)
+		goto err_setup;
+
+	return 0;
+
+err_setup:
+	mlx5_vdpa_destroy_mr(&ndev->mvdev);
+err_mr:
+	return err;
+}
+
+static int setup_driver(struct mlx5_vdpa_net *ndev)
+{
+	int err;
+
+	mutex_lock(&ndev->reslock);
+	if (ndev->setup) {
+		mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
+		err = 0;
+		goto out;
+	}
+	err = setup_virtqueues(ndev);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
+		goto out;
+	}
+
+	err = create_rqt(ndev);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
+		goto err_rqt;
+	}
+
+	err = create_tir(ndev);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
+		goto err_tir;
+	}
+
+	err = add_fwd_to_tir(ndev);
+	if (err) {
+		mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
+		goto err_fwd;
+	}
+	ndev->setup = true;
+	mutex_unlock(&ndev->reslock);
+
+	return 0;
+
+err_fwd:
+	destroy_tir(ndev);
+err_tir:
+	destroy_rqt(ndev);
+err_rqt:
+	teardown_virtqueues(ndev);
+out:
+	mutex_unlock(&ndev->reslock);
+	return err;
+}
+
+static void teardown_driver(struct mlx5_vdpa_net *ndev)
+{
+	mutex_lock(&ndev->reslock);
+	if (!ndev->setup)
+		goto out;
+
+	remove_fwd_to_tir(ndev);
+	destroy_tir(ndev);
+	destroy_rqt(ndev);
+	teardown_virtqueues(ndev);
+	ndev->setup = false;
+out:
+	mutex_unlock(&ndev->reslock);
+}
+
+static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = 0; i < ndev->mvdev.max_vqs; i++)
+		ndev->vqs[i].ready = false;
+}
+
+static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int err;
+
+	print_status(mvdev, status, true);
+	if (!status) {
+		mlx5_vdpa_info(mvdev, "performing device reset\n");
+		teardown_driver(ndev);
+		clear_vqs_ready(ndev);
+		mlx5_vdpa_destroy_mr(&ndev->mvdev);
+		ndev->mvdev.status = 0;
+		ndev->mvdev.mlx_features = 0;
+		++mvdev->generation;
+		return;
+	}
+
+	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
+		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+			err = setup_driver(ndev);
+			if (err) {
+				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
+				goto err_setup;
+			}
+		} else {
+			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
+			return;
+		}
+	}
+
+	ndev->mvdev.status = status;
+	return;
+
+err_setup:
+	mlx5_vdpa_destroy_mr(&ndev->mvdev);
+	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
+}
+
+static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
+				 unsigned int len)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	if (offset + len <= sizeof(struct virtio_net_config))
+		memcpy(buf, (u8 *)&ndev->config + offset, len);
+}
+
+static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
+				 unsigned int len)
+{
+	/* not supported */
+}
+
+static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	return mvdev->generation;
+}
+
+static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	bool change_map;
+	int err;
+
+	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
+	if (err) {
+		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
+		return err;
+	}
+
+	if (change_map)
+		return mlx5_vdpa_change_map(ndev, iotlb);
+
+	return 0;
+}
+
+static void mlx5_vdpa_free(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_core_dev *pfmdev;
+	struct mlx5_vdpa_net *ndev;
+
+	ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	free_resources(ndev);
+	if (!is_zero_ether_addr(ndev->config.mac)) {
+		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
+	}
+	mlx5_vdpa_free_resources(&ndev->mvdev);
+	mutex_destroy(&ndev->reslock);
+}
+
+static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
+{
+	struct vdpa_notification_area ret = {};
+
+	return ret;
+}
+
+static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct vdpa_config_ops mlx5_vdpa_ops = {
+	.set_vq_address = mlx5_vdpa_set_vq_address,
+	.set_vq_num = mlx5_vdpa_set_vq_num,
+	.kick_vq = mlx5_vdpa_kick_vq,
+	.set_vq_cb = mlx5_vdpa_set_vq_cb,
+	.set_vq_ready = mlx5_vdpa_set_vq_ready,
+	.get_vq_ready = mlx5_vdpa_get_vq_ready,
+	.set_vq_state = mlx5_vdpa_set_vq_state,
+	.get_vq_state = mlx5_vdpa_get_vq_state,
+	.get_vq_notification = mlx5_get_vq_notification,
+	.get_vq_irq = mlx5_get_vq_irq,
+	.get_vq_align = mlx5_vdpa_get_vq_align,
+	.get_features = mlx5_vdpa_get_features,
+	.set_features = mlx5_vdpa_set_features,
+	.set_config_cb = mlx5_vdpa_set_config_cb,
+	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
+	.get_device_id = mlx5_vdpa_get_device_id,
+	.get_vendor_id = mlx5_vdpa_get_vendor_id,
+	.get_status = mlx5_vdpa_get_status,
+	.set_status = mlx5_vdpa_set_status,
+	.get_config = mlx5_vdpa_get_config,
+	.set_config = mlx5_vdpa_set_config,
+	.get_generation = mlx5_vdpa_get_generation,
+	.set_map = mlx5_vdpa_set_map,
+	.free = mlx5_vdpa_free,
+};
+
+static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+	u16 hw_mtu;
+	int err;
+
+	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
+	if (err)
+		return err;
+
+	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
+	return 0;
+}
+
+static int alloc_resources(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_net_resources *res = &ndev->res;
+	int err;
+
+	if (res->valid) {
+		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
+		return -EEXIST;
+	}
+
+	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
+	if (err)
+		return err;
+
+	err = create_tis(ndev);
+	if (err)
+		goto err_tis;
+
+	res->valid = true;
+
+	return 0;
+
+err_tis:
+	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
+	return err;
+}
+
+static void free_resources(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_net_resources *res = &ndev->res;
+
+	if (!res->valid)
+		return;
+
+	destroy_tis(ndev);
+	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
+	res->valid = false;
+}
+
+static void init_mvqs(struct mlx5_vdpa_net *ndev)
+{
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
+		mvq = &ndev->vqs[i];
+		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
+		mvq->index = i;
+		mvq->ndev = ndev;
+		mvq->fwqp.fw = true;
+	}
+	for (; i < ndev->mvdev.max_vqs; i++) {
+		mvq = &ndev->vqs[i];
+		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
+		mvq->index = i;
+		mvq->ndev = ndev;
+	}
+}
+
+void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
+{
+	struct virtio_net_config *config;
+	struct mlx5_core_dev *pfmdev;
+	struct mlx5_vdpa_dev *mvdev;
+	struct mlx5_vdpa_net *ndev;
+	u32 max_vqs;
+	int err;
+
+	/* we save one virtqueue for control virtqueue should we require it */
+	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
+	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
+
+	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
+				 2 * mlx5_vdpa_max_qps(max_vqs));
+	if (IS_ERR(ndev))
+		return ndev;
+
+	ndev->mvdev.max_vqs = max_vqs;
+	mvdev = &ndev->mvdev;
+	mvdev->mdev = mdev;
+	init_mvqs(ndev);
+	mutex_init(&ndev->reslock);
+	config = &ndev->config;
+	err = query_mtu(mdev, &ndev->mtu);
+	if (err)
+		goto err_mtu;
+
+	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
+	if (err)
+		goto err_mtu;
+
+	if (!is_zero_ether_addr(config->mac)) {
+		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
+		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
+		if (err)
+			goto err_mtu;
+	}
+
+	mvdev->vdev.dma_dev = mdev->device;
+	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
+	if (err)
+		goto err_mpfs;
+
+	err = alloc_resources(ndev);
+	if (err)
+		goto err_res;
+
+	err = vdpa_register_device(&mvdev->vdev);
+	if (err)
+		goto err_reg;
+
+	return ndev;
+
+err_reg:
+	free_resources(ndev);
+err_res:
+	mlx5_vdpa_free_resources(&ndev->mvdev);
+err_mpfs:
+	if (!is_zero_ether_addr(config->mac))
+		mlx5_mpfs_del_mac(pfmdev, config->mac);
+err_mtu:
+	mutex_destroy(&ndev->reslock);
+	put_device(&mvdev->vdev.dev);
+	return ERR_PTR(err);
+}
+
+void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev)
+{
+	vdpa_unregister_device(&mvdev->vdev);
+}
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h b/drivers/vdpa/mlx5/net/mlx5_vnet.h
new file mode 100644
index 000000000..f2d6d68b0
--- /dev/null
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+
+#ifndef __MLX5_VNET_H_
+#define __MLX5_VNET_H_
+
+#include <linux/vdpa.h>
+#include <linux/virtio_net.h>
+#include <linux/vringh.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include "mlx5_vdpa.h"
+
+static inline u32 mlx5_vdpa_max_qps(int max_vqs)
+{
+	return max_vqs / 2;
+}
+
+#define to_mlx5_vdpa_ndev(__mvdev) container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
+void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev);
+void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev);
+
+#endif /* __MLX5_VNET_H_ */
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
new file mode 100644
index 000000000..a69ffc991
--- /dev/null
+++ b/drivers/vdpa/vdpa.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bus.
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/vdpa.h>
+
+static DEFINE_IDA(vdpa_index_ida);
+
+static int vdpa_dev_probe(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+	int ret = 0;
+
+	if (drv && drv->probe)
+		ret = drv->probe(vdev);
+
+	return ret;
+}
+
+static int vdpa_dev_remove(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+
+	if (drv && drv->remove)
+		drv->remove(vdev);
+
+	return 0;
+}
+
+static struct bus_type vdpa_bus = {
+	.name  = "vdpa",
+	.probe = vdpa_dev_probe,
+	.remove = vdpa_dev_remove,
+};
+
+static void vdpa_release_dev(struct device *d)
+{
+	struct vdpa_device *vdev = dev_to_vdpa(d);
+	const struct vdpa_config_ops *ops = vdev->config;
+
+	if (ops->free)
+		ops->free(vdev);
+
+	ida_simple_remove(&vdpa_index_ida, vdev->index);
+	kfree(vdev);
+}
+
+/**
+ * __vdpa_alloc_device - allocate and initilaize a vDPA device
+ * This allows driver to some prepartion after device is
+ * initialized but before registered.
+ * @parent: the parent device
+ * @config: the bus operations that is supported by this device
+ * @nvqs: number of virtqueues supported by this device
+ * @size: size of the parent structure that contains private data
+ *
+ * Driver should use vdpa_alloc_device() wrapper macro instead of
+ * using this directly.
+ *
+ * Returns an error when parent/config/dma_dev is not set or fail to get
+ * ida.
+ */
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+					const struct vdpa_config_ops *config,
+					int nvqs,
+					size_t size)
+{
+	struct vdpa_device *vdev;
+	int err = -EINVAL;
+
+	if (!config)
+		goto err;
+
+	if (!!config->dma_map != !!config->dma_unmap)
+		goto err;
+
+	err = -ENOMEM;
+	vdev = kzalloc(size, GFP_KERNEL);
+	if (!vdev)
+		goto err;
+
+	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+	if (err < 0)
+		goto err_ida;
+
+	vdev->dev.bus = &vdpa_bus;
+	vdev->dev.parent = parent;
+	vdev->dev.release = vdpa_release_dev;
+	vdev->index = err;
+	vdev->config = config;
+	vdev->features_valid = false;
+	vdev->nvqs = nvqs;
+
+	err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+	if (err)
+		goto err_name;
+
+	device_initialize(&vdev->dev);
+
+	return vdev;
+
+err_name:
+	ida_simple_remove(&vdpa_index_ida, vdev->index);
+err_ida:
+	kfree(vdev);
+err:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
+
+/**
+ * vdpa_register_device - register a vDPA device
+ * Callers must have a succeed call of vdpa_alloc_device() before.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ *
+ * Returns an error when fail to add to vDPA bus
+ */
+int vdpa_register_device(struct vdpa_device *vdev)
+{
+	return device_add(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_register_device);
+
+/**
+ * vdpa_unregister_device - unregister a vDPA device
+ * @vdev: the vdpa device to be unregisted from vDPA bus
+ */
+void vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_device);
+
+/**
+ * __vdpa_register_driver - register a vDPA device driver
+ * @drv: the vdpa device driver to be registered
+ * @owner: module owner of the driver
+ *
+ * Returns an err when fail to do the registration
+ */
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner)
+{
+	drv->driver.bus = &vdpa_bus;
+	drv->driver.owner = owner;
+
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(__vdpa_register_driver);
+
+/**
+ * vdpa_unregister_driver - unregister a vDPA device driver
+ * @drv: the vdpa device driver to be unregistered
+ */
+void vdpa_unregister_driver(struct vdpa_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
+
+static int vdpa_init(void)
+{
+	return bus_register(&vdpa_bus);
+}
+
+static void __exit vdpa_exit(void)
+{
+	bus_unregister(&vdpa_bus);
+	ida_destroy(&vdpa_index_ida);
+}
+core_initcall(vdpa_init);
+module_exit(vdpa_exit);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile
new file mode 100644
index 000000000..b40278f65
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
new file mode 100644
index 000000000..e65c0fa95
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA networking device simulator.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
+#include <linux/sysfs.h>
+#include <linux/file.h>
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_byteorder.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_net.h>
+
+#define DRV_VERSION  "0.1"
+#define DRV_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC     "vDPA Device Simulator"
+#define DRV_LICENSE  "GPL v2"
+
+static int batch_mapping = 1;
+module_param(batch_mapping, int, 0444);
+MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");
+
+static char *macaddr;
+module_param(macaddr, charp, 0);
+MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
+
+u8 macaddr_buf[ETH_ALEN];
+
+struct vdpasim_virtqueue {
+	struct vringh vring;
+	struct vringh_kiov iov;
+	unsigned short head;
+	bool ready;
+	u64 desc_addr;
+	u64 device_addr;
+	u64 driver_addr;
+	u32 num;
+	void *private;
+	irqreturn_t (*cb)(void *data);
+};
+
+#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
+#define VDPASIM_QUEUE_MAX 256
+#define VDPASIM_DEVICE_ID 0x1
+#define VDPASIM_VENDOR_ID 0
+#define VDPASIM_VQ_NUM 0x2
+#define VDPASIM_NAME "vdpasim-netdev"
+
+static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
+			      (1ULL << VIRTIO_F_VERSION_1)  |
+			      (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
+			      (1ULL << VIRTIO_NET_F_MAC);
+
+struct vdpasim;
+
+struct vdpasim_dev_attr {
+	size_t config_size;
+	int nvqs;
+	void (*get_config)(struct vdpasim *vdpasim, void *config);
+};
+
+/* State of each vdpasim device */
+struct vdpasim {
+	struct vdpa_device vdpa;
+	struct vdpasim_virtqueue *vqs;
+	struct work_struct work;
+	struct vdpasim_dev_attr dev_attr;
+	/* spinlock to synchronize virtqueue state */
+	spinlock_t lock;
+	/* virtio config according to device type */
+	void *config;
+	struct vhost_iotlb *iommu;
+	void *buffer;
+	u32 status;
+	u32 generation;
+	u64 features;
+	/* spinlock to synchronize iommu table */
+	spinlock_t iommu_lock;
+};
+
+/* TODO: cross-endian support */
+static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
+{
+	return virtio_legacy_is_little_endian() ||
+		(vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
+}
+
+static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
+{
+	return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
+{
+	return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static struct vdpasim *vdpasim_dev;
+
+static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
+{
+	return container_of(vdpa, struct vdpasim, vdpa);
+}
+
+static struct vdpasim *dev_to_sim(struct device *dev)
+{
+	struct vdpa_device *vdpa = dev_to_vdpa(dev);
+
+	return vdpa_to_sim(vdpa);
+}
+
+static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
+{
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	vringh_init_iotlb(&vq->vring, vdpasim_features,
+			  VDPASIM_QUEUE_MAX, false,
+			  (struct vring_desc *)(uintptr_t)vq->desc_addr,
+			  (struct vring_avail *)
+			  (uintptr_t)vq->driver_addr,
+			  (struct vring_used *)
+			  (uintptr_t)vq->device_addr);
+}
+
+static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+{
+	vq->ready = false;
+	vq->desc_addr = 0;
+	vq->driver_addr = 0;
+	vq->device_addr = 0;
+	vq->cb = NULL;
+	vq->private = NULL;
+	vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
+			  false, NULL, NULL, NULL);
+}
+
+static void vdpasim_reset(struct vdpasim *vdpasim)
+{
+	int i;
+
+	for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
+		vdpasim_vq_reset(&vdpasim->vqs[i]);
+
+	spin_lock(&vdpasim->iommu_lock);
+	vhost_iotlb_reset(vdpasim->iommu);
+	spin_unlock(&vdpasim->iommu_lock);
+
+	vdpasim->features = 0;
+	vdpasim->status = 0;
+	++vdpasim->generation;
+}
+
+static void vdpasim_work(struct work_struct *work)
+{
+	struct vdpasim *vdpasim = container_of(work, struct
+						 vdpasim, work);
+	struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
+	struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
+	ssize_t read, write;
+	size_t total_write;
+	int pkts = 0;
+	int err;
+
+	spin_lock(&vdpasim->lock);
+
+	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+		goto out;
+
+	if (!txq->ready || !rxq->ready)
+		goto out;
+
+	while (true) {
+		total_write = 0;
+		err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
+					   &txq->head, GFP_ATOMIC);
+		if (err <= 0)
+			break;
+
+		err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
+					   &rxq->head, GFP_ATOMIC);
+		if (err <= 0) {
+			vringh_complete_iotlb(&txq->vring, txq->head, 0);
+			break;
+		}
+
+		while (true) {
+			read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
+						     vdpasim->buffer,
+						     PAGE_SIZE);
+			if (read <= 0)
+				break;
+
+			write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
+						      vdpasim->buffer, read);
+			if (write <= 0)
+				break;
+
+			total_write += write;
+		}
+
+		/* Make sure data is wrote before advancing index */
+		smp_wmb();
+
+		vringh_complete_iotlb(&txq->vring, txq->head, 0);
+		vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
+
+		/* Make sure used is visible before rasing the interrupt. */
+		smp_wmb();
+
+		local_bh_disable();
+		if (txq->cb)
+			txq->cb(txq->private);
+		if (rxq->cb)
+			rxq->cb(rxq->private);
+		local_bh_enable();
+
+		if (++pkts > 4) {
+			schedule_work(&vdpasim->work);
+			goto out;
+		}
+	}
+
+out:
+	spin_unlock(&vdpasim->lock);
+}
+
+static int dir_to_perm(enum dma_data_direction dir)
+{
+	int perm = -EFAULT;
+
+	switch (dir) {
+	case DMA_FROM_DEVICE:
+		perm = VHOST_MAP_WO;
+		break;
+	case DMA_TO_DEVICE:
+		perm = VHOST_MAP_RO;
+		break;
+	case DMA_BIDIRECTIONAL:
+		perm = VHOST_MAP_RW;
+		break;
+	default:
+		break;
+	}
+
+	return perm;
+}
+
+static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	struct vdpasim *vdpasim = dev_to_sim(dev);
+	struct vhost_iotlb *iommu = vdpasim->iommu;
+	u64 pa = (page_to_pfn(page) << PAGE_SHIFT) + offset;
+	int ret, perm = dir_to_perm(dir);
+
+	if (perm < 0)
+		return DMA_MAPPING_ERROR;
+
+	/* For simplicity, use identical mapping to avoid e.g iova
+	 * allocator.
+	 */
+	spin_lock(&vdpasim->iommu_lock);
+	ret = vhost_iotlb_add_range(iommu, pa, pa + size - 1,
+				    pa, dir_to_perm(dir));
+	spin_unlock(&vdpasim->iommu_lock);
+	if (ret)
+		return DMA_MAPPING_ERROR;
+
+	return (dma_addr_t)(pa);
+}
+
+static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	struct vdpasim *vdpasim = dev_to_sim(dev);
+	struct vhost_iotlb *iommu = vdpasim->iommu;
+
+	spin_lock(&vdpasim->iommu_lock);
+	vhost_iotlb_del_range(iommu, (u64)dma_addr,
+			      (u64)dma_addr + size - 1);
+	spin_unlock(&vdpasim->iommu_lock);
+}
+
+static void *vdpasim_alloc_coherent(struct device *dev, size_t size,
+				    dma_addr_t *dma_addr, gfp_t flag,
+				    unsigned long attrs)
+{
+	struct vdpasim *vdpasim = dev_to_sim(dev);
+	struct vhost_iotlb *iommu = vdpasim->iommu;
+	void *addr = kmalloc(size, flag);
+	int ret;
+
+	spin_lock(&vdpasim->iommu_lock);
+	if (!addr) {
+		*dma_addr = DMA_MAPPING_ERROR;
+	} else {
+		u64 pa = virt_to_phys(addr);
+
+		ret = vhost_iotlb_add_range(iommu, (u64)pa,
+					    (u64)pa + size - 1,
+					    pa, VHOST_MAP_RW);
+		if (ret) {
+			*dma_addr = DMA_MAPPING_ERROR;
+			kfree(addr);
+			addr = NULL;
+		} else
+			*dma_addr = (dma_addr_t)pa;
+	}
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return addr;
+}
+
+static void vdpasim_free_coherent(struct device *dev, size_t size,
+				  void *vaddr, dma_addr_t dma_addr,
+				  unsigned long attrs)
+{
+	struct vdpasim *vdpasim = dev_to_sim(dev);
+	struct vhost_iotlb *iommu = vdpasim->iommu;
+
+	spin_lock(&vdpasim->iommu_lock);
+	vhost_iotlb_del_range(iommu, (u64)dma_addr,
+			      (u64)dma_addr + size - 1);
+	spin_unlock(&vdpasim->iommu_lock);
+
+	kfree(phys_to_virt((uintptr_t)dma_addr));
+}
+
+static const struct dma_map_ops vdpasim_dma_ops = {
+	.map_page = vdpasim_map_page,
+	.unmap_page = vdpasim_unmap_page,
+	.alloc = vdpasim_alloc_coherent,
+	.free = vdpasim_free_coherent,
+};
+
+static const struct vdpa_config_ops vdpasim_net_config_ops;
+static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
+
+static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
+{
+	const struct vdpa_config_ops *ops;
+	struct vdpasim *vdpasim;
+	struct device *dev;
+	int i, ret = -ENOMEM;
+
+	if (batch_mapping)
+		ops = &vdpasim_net_batch_config_ops;
+	else
+		ops = &vdpasim_net_config_ops;
+
+	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
+				    dev_attr->nvqs);
+	if (!vdpasim)
+		goto err_alloc;
+
+	vdpasim->dev_attr = *dev_attr;
+	INIT_WORK(&vdpasim->work, vdpasim_work);
+	spin_lock_init(&vdpasim->lock);
+	spin_lock_init(&vdpasim->iommu_lock);
+
+	dev = &vdpasim->vdpa.dev;
+	dev->dma_mask = &dev->coherent_dma_mask;
+	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)))
+		goto err_iommu;
+	set_dma_ops(dev, &vdpasim_dma_ops);
+
+	vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
+	if (!vdpasim->config)
+		goto err_iommu;
+
+	vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
+			       GFP_KERNEL);
+	if (!vdpasim->vqs)
+		goto err_iommu;
+
+	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+	if (!vdpasim->iommu)
+		goto err_iommu;
+
+	vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vdpasim->buffer)
+		goto err_iommu;
+
+	if (macaddr) {
+		mac_pton(macaddr, macaddr_buf);
+		if (!is_valid_ether_addr(macaddr_buf)) {
+			ret = -EADDRNOTAVAIL;
+			goto err_iommu;
+		}
+	} else {
+		eth_random_addr(macaddr_buf);
+	}
+
+	for (i = 0; i < dev_attr->nvqs; i++)
+		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
+
+	vdpasim->vdpa.dma_dev = dev;
+	ret = vdpa_register_device(&vdpasim->vdpa);
+	if (ret)
+		goto err_iommu;
+
+	return vdpasim;
+
+err_iommu:
+	put_device(dev);
+err_alloc:
+	return ERR_PTR(ret);
+}
+
+static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+				  u64 desc_area, u64 driver_area,
+				  u64 device_area)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	vq->desc_addr = desc_area;
+	vq->driver_addr = driver_area;
+	vq->device_addr = device_area;
+
+	return 0;
+}
+
+static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	vq->num = num;
+}
+
+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	if (vq->ready)
+		schedule_work(&vdpasim->work);
+}
+
+static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+			      struct vdpa_callback *cb)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	vq->cb = cb->callback;
+	vq->private = cb->private;
+}
+
+static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+	bool old_ready;
+
+	spin_lock(&vdpasim->lock);
+	old_ready = vq->ready;
+	vq->ready = ready;
+	if (vq->ready && !old_ready) {
+		vdpasim_queue_ready(vdpasim, idx);
+	}
+	spin_unlock(&vdpasim->lock);
+}
+
+static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+	return vq->ready;
+}
+
+static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx,
+				const struct vdpa_vq_state *state)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+	struct vringh *vrh = &vq->vring;
+
+	spin_lock(&vdpasim->lock);
+	vrh->last_avail_idx = state->avail_index;
+	spin_unlock(&vdpasim->lock);
+
+	return 0;
+}
+
+static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx,
+				struct vdpa_vq_state *state)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+	struct vringh *vrh = &vq->vring;
+
+	state->avail_index = vrh->last_avail_idx;
+	return 0;
+}
+
+static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)
+{
+	return VDPASIM_QUEUE_ALIGN;
+}
+
+static u64 vdpasim_get_features(struct vdpa_device *vdpa)
+{
+	return vdpasim_features;
+}
+
+static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	/* DMA mapping must be done by driver */
+	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
+		return -EINVAL;
+
+	vdpasim->features = features & vdpasim_features;
+
+	return 0;
+}
+
+static void vdpasim_set_config_cb(struct vdpa_device *vdpa,
+				  struct vdpa_callback *cb)
+{
+	/* We don't support config interrupt */
+}
+
+static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
+{
+	return VDPASIM_QUEUE_MAX;
+}
+
+static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
+{
+	return VDPASIM_DEVICE_ID;
+}
+
+static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
+{
+	return VDPASIM_VENDOR_ID;
+}
+
+static u8 vdpasim_get_status(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	u8 status;
+
+	spin_lock(&vdpasim->lock);
+	status = vdpasim->status;
+	spin_unlock(&vdpasim->lock);
+
+	return status;
+}
+
+static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	spin_lock(&vdpasim->lock);
+	vdpasim->status = status;
+	if (status == 0)
+		vdpasim_reset(vdpasim);
+	spin_unlock(&vdpasim->lock);
+}
+
+static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
+			     void *buf, unsigned int len)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	if (offset + len > vdpasim->dev_attr.config_size)
+		return;
+
+	if (vdpasim->dev_attr.get_config)
+		vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
+
+	memcpy(buf, vdpasim->config + offset, len);
+}
+
+static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
+			     const void *buf, unsigned int len)
+{
+	/* No writable config supportted by vdpasim */
+}
+
+static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->generation;
+}
+
+static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa)
+{
+	struct vdpa_iova_range range = {
+		.first = 0ULL,
+		.last = ULLONG_MAX,
+	};
+
+	return range;
+}
+
+static int vdpasim_set_map(struct vdpa_device *vdpa,
+			   struct vhost_iotlb *iotlb)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	struct vhost_iotlb_map *map;
+	u64 start = 0ULL, last = 0ULL - 1;
+	int ret;
+
+	spin_lock(&vdpasim->iommu_lock);
+	vhost_iotlb_reset(vdpasim->iommu);
+
+	for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+	     map = vhost_iotlb_itree_next(map, start, last)) {
+		ret = vhost_iotlb_add_range(vdpasim->iommu, map->start,
+					    map->last, map->addr, map->perm);
+		if (ret)
+			goto err;
+	}
+	spin_unlock(&vdpasim->iommu_lock);
+	return 0;
+
+err:
+	vhost_iotlb_reset(vdpasim->iommu);
+	spin_unlock(&vdpasim->iommu_lock);
+	return ret;
+}
+
+static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size,
+			   u64 pa, u32 perm)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+	int ret;
+
+	spin_lock(&vdpasim->iommu_lock);
+	ret = vhost_iotlb_add_range(vdpasim->iommu, iova, iova + size - 1, pa,
+				    perm);
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return ret;
+}
+
+static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	spin_lock(&vdpasim->iommu_lock);
+	vhost_iotlb_del_range(vdpasim->iommu, iova, iova + size - 1);
+	spin_unlock(&vdpasim->iommu_lock);
+
+	return 0;
+}
+
+static void vdpasim_free(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	cancel_work_sync(&vdpasim->work);
+	kfree(vdpasim->buffer);
+	if (vdpasim->iommu)
+		vhost_iotlb_free(vdpasim->iommu);
+	kfree(vdpasim->vqs);
+	kfree(vdpasim->config);
+}
+
+static const struct vdpa_config_ops vdpasim_net_config_ops = {
+	.set_vq_address         = vdpasim_set_vq_address,
+	.set_vq_num             = vdpasim_set_vq_num,
+	.kick_vq                = vdpasim_kick_vq,
+	.set_vq_cb              = vdpasim_set_vq_cb,
+	.set_vq_ready           = vdpasim_set_vq_ready,
+	.get_vq_ready           = vdpasim_get_vq_ready,
+	.set_vq_state           = vdpasim_set_vq_state,
+	.get_vq_state           = vdpasim_get_vq_state,
+	.get_vq_align           = vdpasim_get_vq_align,
+	.get_features           = vdpasim_get_features,
+	.set_features           = vdpasim_set_features,
+	.set_config_cb          = vdpasim_set_config_cb,
+	.get_vq_num_max         = vdpasim_get_vq_num_max,
+	.get_device_id          = vdpasim_get_device_id,
+	.get_vendor_id          = vdpasim_get_vendor_id,
+	.get_status             = vdpasim_get_status,
+	.set_status             = vdpasim_set_status,
+	.get_config             = vdpasim_get_config,
+	.set_config             = vdpasim_set_config,
+	.get_generation         = vdpasim_get_generation,
+	.get_iova_range         = vdpasim_get_iova_range,
+	.dma_map                = vdpasim_dma_map,
+	.dma_unmap              = vdpasim_dma_unmap,
+	.free                   = vdpasim_free,
+};
+
+static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
+	.set_vq_address         = vdpasim_set_vq_address,
+	.set_vq_num             = vdpasim_set_vq_num,
+	.kick_vq                = vdpasim_kick_vq,
+	.set_vq_cb              = vdpasim_set_vq_cb,
+	.set_vq_ready           = vdpasim_set_vq_ready,
+	.get_vq_ready           = vdpasim_get_vq_ready,
+	.set_vq_state           = vdpasim_set_vq_state,
+	.get_vq_state           = vdpasim_get_vq_state,
+	.get_vq_align           = vdpasim_get_vq_align,
+	.get_features           = vdpasim_get_features,
+	.set_features           = vdpasim_set_features,
+	.set_config_cb          = vdpasim_set_config_cb,
+	.get_vq_num_max         = vdpasim_get_vq_num_max,
+	.get_device_id          = vdpasim_get_device_id,
+	.get_vendor_id          = vdpasim_get_vendor_id,
+	.get_status             = vdpasim_get_status,
+	.set_status             = vdpasim_set_status,
+	.get_config             = vdpasim_get_config,
+	.set_config             = vdpasim_set_config,
+	.get_generation         = vdpasim_get_generation,
+	.get_iova_range         = vdpasim_get_iova_range,
+	.set_map                = vdpasim_set_map,
+	.free                   = vdpasim_free,
+};
+
+static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
+{
+	struct virtio_net_config *net_config =
+		(struct virtio_net_config *)config;
+
+	net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
+	net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
+	memcpy(net_config->mac, macaddr_buf, ETH_ALEN);
+}
+
+static int __init vdpasim_dev_init(void)
+{
+	struct vdpasim_dev_attr dev_attr = {};
+
+	dev_attr.nvqs = VDPASIM_VQ_NUM;
+	dev_attr.config_size = sizeof(struct virtio_net_config);
+	dev_attr.get_config = vdpasim_net_get_config;
+
+	vdpasim_dev = vdpasim_create(&dev_attr);
+
+	if (!IS_ERR(vdpasim_dev))
+		return 0;
+
+	return PTR_ERR(vdpasim_dev);
+}
+
+static void __exit vdpasim_dev_exit(void)
+{
+	struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
+
+	vdpa_unregister_device(vdpa);
+}
+
+module_init(vdpasim_dev_init)
+module_exit(vdpasim_dev_exit)
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 10:05:51 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 10:05:51 +0000
commit	5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
tree	a94efe259b9009378be6d90eb30d2b019d95c194 /drivers/vdpa
parent	Initial commit. (diff)
download	linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.tar.xz linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.zip